]> git.sommitrealweird.co.uk Git - rss2maildir.git/blob - rss2maildir.py
Fix for title parsing
[rss2maildir.git] / rss2maildir.py
1 #!/usr/bin/python
2 # coding=utf-8
3
4 # rss2maildir.py - RSS feeds to Maildir 1 email per item
5 # Copyright (C) 2007  Brett Parker <iDunno@sommitrealweird.co.uk>
6
7 # This program is free software: you can redistribute it and/or modify
8 # it under the terms of the GNU General Public License as published by
9 # the Free Software Foundation, either version 3 of the License, or
10 # (at your option) any later version.
11
12 # This program is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 # GNU General Public License for more details.
16
17 # You should have received a copy of the GNU General Public License
18 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
19
20 import sys
21 import os
22 import stat
23 import httplib
24 import urllib
25
26 import feedparser
27
28 from email.MIMEMultipart import MIMEMultipart
29 from email.MIMEText import MIMEText
30
31 import datetime
32 import random
33 import string
34 import textwrap
35
36 import socket
37
38 from optparse import OptionParser
39 from ConfigParser import SafeConfigParser
40
41 from base64 import b64encode
42 import md5
43
44 import cgi
45 import dbm
46
47 import re
48
49 from HTMLParser import HTMLParser
50
51 class HTML2Text(HTMLParser):
52     entities = {
53         u'amp': u'&',
54         u'lt': u'<',
55         u'gt': u'>',
56         u'pound': u'£',
57         u'copy': u'©',
58         u'apos': u'\'',
59         u'quot': u'"',
60         u'nbsp': u' ',
61         u'ldquo': u'“',
62         u'rdquo': u'”',
63         u'lsquo': u'‘',
64         u'rsquo': u'’',
65         u'laquo': u'«',
66         u'raquo': u'»',
67         u'lsaquo': u'‹',
68         u'rsaquo': u'›',
69         u'bull': u'•',
70         u'middot': u'·',
71         u'deg': u'°',
72         u'helip': u'…',
73         u'trade': u'™',
74         u'reg': u'®',
75         u'agrave': u'à',
76         u'Agrave': u'À',
77         u'egrave': u'è',
78         u'Egrave': u'È',
79         u'igrave': u'ì',
80         u'Igrave': u'Ì',
81         u'ograve': u'ò',
82         u'Ograve': u'Ò',
83         u'ugrave': u'ù',
84         u'Ugrave': u'Ù',
85         u'aacute': u'á',
86         u'Aacute': u'Á',
87         u'eacute': u'é',
88         u'Eacute': u'É',
89         u'iacute': u'í',
90         u'Iacute': u'Í',
91         u'oacute': u'ó',
92         u'Oacute': u'Ó',
93         u'uacute': u'ú',
94         u'Uacute': u'Ú',
95         u'yactue': u'ý',
96         u'Yacute': u'Ý',
97         u'acirc': u'â',
98         u'Acirc': u'Â',
99         u'ecirc': u'ê',
100         u'Ecirc': u'Ê',
101         u'icirc': u'î',
102         u'Icirc': u'Î',
103         u'ocirc': u'ô',
104         u'Ocirc': u'Ô',
105         u'ucirc': u'û',
106         u'Ucirc': u'Û',
107         u'atilde': u'ã',
108         u'Atilde': u'Ã',
109         u'ntilde': u'ñ',
110         u'Ntilde': u'Ñ',
111         u'otilde': u'õ',
112         u'Otilde': u'Õ',
113         u'auml': u'ä',
114         u'Auml': u'Ä',
115         u'euml': u'ë',
116         u'Euml': u'Ë',
117         u'iuml': u'ï',
118         u'Iuml': u'Ï',
119         u'ouml': u'ö',
120         u'Ouml': u'Ö',
121         u'uuml': u'ü',
122         u'Uuml': u'Ü',
123         u'yuml': u'ÿ',
124         u'Yuml': u'Ÿ',
125         u'iexcl': u'¡',
126         u'iquest': u'¿',
127         u'ccedil': u'ç',
128         u'Ccedil': u'Ç',
129         u'oelig': u'œ',
130         u'OElig': u'Œ',
131         u'szlig': u'ß',
132         u'oslash': u'ø',
133         u'Oslash': u'Ø',
134         u'aring': u'å',
135         u'Aring': u'Å',
136         u'aelig': u'æ',
137         u'AElig': u'Æ',
138         u'thorn': u'þ',
139         u'THORN': u'Þ',
140         u'eth': u'ð',
141         u'ETH': u'Ð',
142         u'mdash': u'—',
143         u'ndash': u'–',
144         u'sect': u'§',
145         u'para': u'¶',
146         u'uarr': u'↑',
147         u'darr': u'↓',
148         u'larr': u'←',
149         u'rarr': u'→',
150         u'dagger': u'†',
151         u'Dagger': u'‡',
152         u'permil': u'‰',
153         u'prod': u'∏',
154         u'infin': u'∞',
155         u'radic': u'√',
156         u'there4': u'∴',
157         u'int': u'∫',
158         u'asymp': u'≈',
159         u'ne': u'≠',
160         u'equiv': '≡',
161         u'le': u'≤',
162         u'ge': u'≥',
163         u'loz': u'⋄',
164         u'sum': u'∑',
165         u'part': u'∂',
166         u'prime': u'′',
167         u'Prime': u'″',
168         u'harr': u'↔',
169         u'micro': u'µ',
170         u'not': u'¬',
171         u'plusmn': u'±',
172         u'divide': u'÷',
173         u'cent': u'¢',
174         u'euro': u'€',
175         }
176
177     blockleveltags = [
178         u'h1',
179         u'h2',
180         u'h3',
181         u'h4',
182         u'h5',
183         u'h6',
184         u'pre',
185         u'p',
186         u'ul',
187         u'ol',
188         u'dl',
189         u'li',
190         u'dt',
191         u'dd',
192         u'div',
193         u'blockquote',
194         ]
195
196     liststarttags = [
197         u'ul',
198         u'ol',
199         u'dl',
200         ]
201
202     cancontainflow = [
203         u'div',
204         u'li',
205         u'dd',
206         u'blockquote',
207     ]
208
209     def __init__(self,textwidth=70):
210         self.text = u''
211         self.curdata = u''
212         self.textwidth = textwidth
213         self.opentags = []
214         self.indentlevel = 0
215         self.ignorenodata = False
216         self.listcount = []
217         self.urls = []
218         self.images = {}
219         HTMLParser.__init__(self)
220
221     def handle_starttag(self, tag, attrs):
222         tag_name = tag.lower()
223         if tag_name in self.blockleveltags:
224             # handle starting a new block - unless we're in a block element
225             # that can contain other blocks, we'll assume that we want to close
226             # the container
227             if len(self.opentags) > 1 and self.opentags[-1] == u'li':
228                 self.handle_curdata()
229
230             if tag_name == u'ol':
231                 self.handle_curdata()
232                 self.listcount.append(1)
233                 self.listlevel = len(self.listcount) - 1
234
235             if tag_name == u'dl':
236                 self.indentlevel = self.indentlevel + 4
237
238             if tag_name in self.liststarttags:
239                 smallist = self.opentags[-3:-1]
240                 smallist.reverse()
241                 for prev_listtag in smallist:
242                     if prev_listtag in [u'dl', u'ol']:
243                         self.indentlevel = self.indentlevel + 4
244                         break
245                     elif prev_listtag == u'ul':
246                         self.indentlevel = self.indentlevel + 3
247                         break
248
249             if len(self.opentags) > 0:
250                 self.handle_curdata()
251                 if tag_name not in self.cancontainflow:
252                     self.opentags.pop()
253             self.opentags.append(tag_name)
254         else:
255             if tag_name == "span":
256                 return
257             listcount = 0
258             try:
259                 listcount = self.listcount[-1]
260             except:
261                 pass
262
263             if tag_name == u'dd' and len(self.opentags) > 1 \
264                 and self.opentags[-1] == u'dt':
265                 self.handle_curdata()
266                 self.opentags.pop()
267             elif tag_name == u'dt' and len(self.opentags) > 1 \
268                 and self.opentags[-1] == u'dd':
269                 self.handle_curdata()
270                 self.opentags.pop()
271             elif tag_name == u'a':
272                 for attr in attrs:
273                     if attr[0].lower() == u'href':
274                         self.urls.append(attr[1].decode('utf-8'))
275                 self.curdata = self.curdata + u'`'
276                 self.opentags.append(tag_name)
277                 return
278             elif tag_name == u'img':
279                 self.handle_image(attrs)
280                 return
281             elif tag_name == u'br':
282                 self.handle_br()
283                 return
284             else:
285                 # we don't know the tag, so lets avoid handling it!
286                 return 
287
288     def handle_startendtag(self, tag, attrs):
289         if tag.lower() == u'br':
290             self.handle_br()
291         elif tag.lower() == u'img':
292             self.handle_image(attrs)
293             return
294
295     def handle_br(self):
296             self.handle_curdata()
297             self.opentags.append(u'br')
298             self.handle_curdata()
299             self.opentags.pop()
300
301     def handle_image(self, attrs):
302         alt = u''
303         url = u''
304         for attr in attrs:
305             if attr[0] == 'alt':
306                 alt = attr[1].decode('utf-8')
307             elif attr[0] == 'src':
308                 url = attr[1].decode('utf-8')
309         if url:
310             if alt:
311                 if self.images.has_key(alt):
312                     if self.images[alt]["url"] == url:
313                         self.curdata = self.curdata \
314                             + u'|%s|' %(alt,)
315                     else:
316                         while self.images.has_key(alt):
317                             alt = alt + "_"
318                         self.images[alt] = {"url": url}
319                         self.curdata = self.curdata \
320                             + u'|%s|' %(alt,)
321                 else:
322                     self.images[alt] = {"url": url}
323                     self.curdata = self.curdata \
324                         + u'|%s|' %(alt,)
325             else:
326                 if self.images.has_key(url):
327                     self.curdata = self.curdata \
328                         + u'|%s|' %(url,)
329                 else:
330                     self.images[url] = {}
331                     self.images[url]["url"] =url
332                     self.curdata = self.curdata \
333                         + u'|%s|' %(url,)
334
335     def handle_curdata(self):
336
337         if len(self.opentags) == 0:
338             return
339
340         tag_thats_done = self.opentags[-1]
341
342         if len(self.curdata) == 0:
343             return
344
345         if tag_thats_done == u'br':
346             if len(self.text) == 0 or self.text[-1] != '\n':
347                 self.text = self.text + '\n'
348                 self.ignorenodata = True
349             return
350
351         if len(self.curdata.strip()) == 0:
352             return
353
354         if tag_thats_done in self.blockleveltags:
355             newlinerequired = self.text != u''
356             if self.ignorenodata:
357                 newlinerequired = False
358             self.ignorenodata = False
359             if newlinerequired:
360                 if tag_thats_done in [u'dt', u'dd', u'li'] \
361                     and len(self.text) > 1 \
362                     and self.text[-1] != u'\n':
363                         self.text = self.text + u'\n'
364                 elif len(self.text) > 2 \
365                     and self.text[-1] != u'\n' \
366                     and self.text[-2] != u'\n':
367                     self.text = self.text + u'\n\n'
368
369         if tag_thats_done in ["h1", "h2", "h3", "h4", "h5", "h6"]:
370             underline = u''
371             underlinechar = u'='
372             headingtext = " ".join(self.curdata.split())
373             seperator = u'\n' + u' '*self.indentlevel
374             headingtext = seperator.join( \
375                 textwrap.wrap( \
376                     headingtext, \
377                     self.textwidth - self.indentlevel \
378                     ) \
379                 )
380
381             if tag_thats_done == u'h2':
382                 underlinechar = u'-'
383             elif tag_thats_done != u'h1':
384                 underlinechar = u'~'
385
386             if u'\n' in headingtext:
387                 underline = u' ' * self.indentlevel \
388                     + underlinechar * (self.textwidth - self.indentlevel)
389             else:
390                 underline = u' ' * self.indentlevel \
391                     + underlinechar * len(headingtext)
392             self.text = self.text \
393                 + headingtext + u'\n' \
394                 + underline
395         elif tag_thats_done in [u'p', u'div']:
396             paragraph = unicode( \
397                 " ".join(self.curdata.strip().encode("utf-8").split()), \
398                 "utf-8")
399             seperator = u'\n' + u' ' * self.indentlevel
400             self.text = self.text \
401                 + u' ' * self.indentlevel \
402                 + seperator.join( \
403                     textwrap.wrap( \
404                         paragraph, self.textwidth - self.indentlevel))
405         elif tag_thats_done == "pre":
406             self.text = self.text + unicode( \
407                 self.curdata.encode("utf-8"), "utf-8")
408         elif tag_thats_done == u'blockquote':
409             quote = unicode( \
410                 " ".join(self.curdata.encode("utf-8").strip().split()), \
411                 "utf-8")
412             seperator = u'\n' + u' ' * self.indentlevel + u'    '
413             if len(self.text) > 0 and self.text[-1] != u'\n':
414                 self.text = self.text + u'\n'
415             self.text = self.text \
416                 + u'    ' \
417                 + seperator.join( \
418                     textwrap.wrap( \
419                         quote, \
420                         self.textwidth - self.indentlevel - 2 \
421                     )
422                 )
423             self.curdata = u''
424         elif tag_thats_done == "li":
425             item = unicode(self.curdata.encode("utf-8").strip(), "utf-8")
426             if len(self.text) > 0 and self.text[-1] != u'\n':
427                 self.text = self.text + u'\n'
428             # work out if we're in an ol rather than a ul
429             latesttags = self.opentags[-4:]
430             latesttags.reverse()
431             isul = None
432             for thing in latesttags:
433                 if thing == 'ul':
434                     isul = True
435                     break
436                 elif thing == 'ol':
437                     isul = False
438                     break
439
440             listindent = 3
441             if not isul:
442                 listindent = 4
443
444             listmarker = u' * '
445             if isul == False:
446                 listmarker = u' %2d. ' %(self.listcount[-1])
447                 self.listcount[-1] = self.listcount[-1] + 1
448
449             seperator = u'\n' \
450                 + u' ' * self.indentlevel \
451                 + u' ' * listindent
452             self.text = self.text \
453                 + u' ' * self.indentlevel \
454                 + listmarker \
455                 + seperator.join( \
456                     textwrap.wrap( \
457                         item, \
458                         self.textwidth - self.indentlevel - listindent \
459                     ) \
460                 )
461             self.curdata = u''
462         elif tag_thats_done == u'dt':
463             definition = unicode(" ".join( \
464                     self.curdata.encode("utf-8").strip().split()), \
465                 "utf-8")
466             if len(self.text) > 0 and self.text[-1] != u'\n':
467                 self.text = self.text + u'\n\n'
468             elif len(self.text) > 1 and self.text[-2] != u'\n':
469                 self.text = self.text + u'\n'
470             definition = u' ' * (self.indentlevel - 4) + definition + "::"
471             indentstring = u'\n' + u' ' * (self.indentlevel - 3)
472             self.text = self.text \
473                 + indentstring.join(
474                     textwrap.wrap(definition, \
475                         self.textwidth - self.indentlevel - 4))
476             self.curdata = u''
477         elif tag_thats_done == u'dd':
478             definition = unicode(" ".join( \
479                     self.curdata.encode("utf-8").strip().split()),
480                 "utf-8")
481             if len(definition) > 0:
482                 if len(self.text) > 0 and self.text[-1] != u'\n':
483                     self.text = self.text + u'\n'
484                 indentstring = u'\n' + u' ' * self.indentlevel
485                 self.text = self.text \
486                     + indentstring \
487                     + indentstring.join( \
488                         textwrap.wrap( \
489                             definition, \
490                             self.textwidth - self.indentlevel \
491                             ) \
492                         )
493                 self.curdata = u''
494         elif tag_thats_done == u'a':
495             self.curdata = self.curdata + u'`__'
496             pass
497         elif tag_thats_done in self.liststarttags:
498             pass
499
500         if tag_thats_done in self.blockleveltags:
501             self.curdata = u''
502
503         self.ignorenodata = False
504
505     def handle_endtag(self, tag):
506         self.ignorenodata = False
507         if tag == "span":
508             return
509
510         try:
511             tagindex = self.opentags.index(tag)
512         except:
513             return
514         tag = tag.lower()
515
516         if tag in [u'br', u'img']:
517             return
518
519         if tag == u'dl':
520             self.indentlevel = self.indentlevel - 4
521
522         if tag in self.liststarttags:
523             if tag in [u'ol', u'dl', u'ul', u'dd']:
524                 self.handle_curdata()
525                 # find if there was a previous list level
526                 smalllist = self.opentags[:-1]
527                 smalllist.reverse()
528                 for prev_listtag in smalllist:
529                     if prev_listtag in [u'ol', u'dl']:
530                         self.indentlevel = self.indentlevel - 4
531                         break
532                     elif prev_listtag == u'ul':
533                         self.indentlevel = self.indentlevel - 3
534                         break
535
536         if tag == u'ol':
537             self.listcount = self.listcount[:-1]
538
539         while tagindex < len(self.opentags) \
540             and tag in self.opentags[tagindex+1:]:
541             try:
542                 tagindex = self.opentags.index(tag, tagindex+1)
543             except:
544                 # well, we don't want to do that then
545                 pass
546         if tagindex != len(self.opentags) - 1:
547             # Assuming the data was for the last opened tag first
548             self.handle_curdata()
549             # Now kill the list to be a slice before this tag was opened
550             self.opentags = self.opentags[:tagindex + 1]
551         else:
552             self.handle_curdata()
553             if self.opentags[-1] == tag:
554                 self.opentags.pop()
555
556     def handle_data(self, data):
557         if len(self.opentags) == 0:
558             self.opentags.append(u'p')
559         self.curdata = self.curdata + data.decode("utf-8")
560
561     def handle_charref(self, name):
562         entity = unichr(int(name))
563         self.curdata = self.curdata + unicode(entity.encode('utf-8'), \
564             "utf-8")
565
566     def handle_entityref(self, name):
567         entity = name
568         if HTML2Text.entities.has_key(name):
569             entity = HTML2Text.entities[name]
570         else:
571             entity = "&" + name + ";"
572
573         self.curdata = self.curdata + unicode(entity.encode('utf-8'), \
574             "utf-8")
575
576     def gettext(self):
577         self.handle_curdata()
578         if len(self.text) == 0 or self.text[-1] != u'\n':
579             self.text = self.text + u'\n'
580         self.opentags = []
581         if len(self.text) > 0:
582             while len(self.text) > 1 and self.text[-1] == u'\n':
583                 self.text = self.text[:-1]
584             self.text = self.text + u'\n'
585         if len(self.urls) > 0:
586             self.text = self.text + u'\n__ ' + u'\n__ '.join(self.urls) + u'\n'
587             self.urls = []
588         if len(self.images.keys()) > 0:
589             self.text = self.text + u'\n.. ' \
590                 + u'\n.. '.join( \
591                     ["|%s| image:: %s" %(a, self.images[a]["url"]) \
592                 for a in self.images.keys()]) + u'\n'
593             self.images = {}
594         return self.text
595
596 def open_url(method, url):
597     redirectcount = 0
598     while redirectcount < 3:
599         (type, rest) = urllib.splittype(url)
600         (host, path) = urllib.splithost(rest)
601         (host, port) = urllib.splitport(host)
602         if port == None:
603             port = 80
604         try:
605             conn = httplib.HTTPConnection("%s:%s" %(host, port))
606             conn.request(method, path)
607             response = conn.getresponse()
608             if response.status in [301, 302, 303, 307]:
609                 headers = response.getheaders()
610                 for header in headers:
611                     if header[0] == "location":
612                         url = header[1]
613             elif response.status == 200:
614                 return response
615         except:
616             pass
617         redirectcount = redirectcount + 1
618     return None
619
620 def parse_and_deliver(maildir, url, statedir):
621     feedhandle = None
622     headers = None
623     # first check if we know about this feed already
624     feeddb = dbm.open(os.path.join(statedir, "feeds"), "c")
625     if feeddb.has_key(url):
626         data = feeddb[url]
627         data = cgi.parse_qs(data)
628         response = open_url("HEAD", url)
629         headers = None
630         if response:
631             headers = response.getheaders()
632         ischanged = False
633         try:
634             for header in headers:
635                 if header[0] == "content-length":
636                     if header[1] != data["content-length"][0]:
637                         ischanged = True
638                 elif header[0] == "etag":
639                     if header[1] != data["etag"][0]:
640                         ischanged = True
641                 elif header[0] == "last-modified":
642                     if header[1] != data["last-modified"][0]:
643                         ischanged = True
644                 elif header[0] == "content-md5":
645                     if header[1] != data["content-md5"][0]:
646                         ischanged = True
647         except:
648             ischanged = True
649         if ischanged:
650             response = open_url("GET", url)
651             if response != None:
652                 headers = response.getheaders()
653                 feedhandle = response
654             else:
655                 sys.stderr.write("Failed to fetch feed: %s\n" %(url))
656                 return
657         else:
658             return # don't need to do anything, nothings changed.
659     else:
660         response = open_url("GET", url)
661         if response != None:
662             headers = response.getheaders()
663             feedhandle = response
664         else:
665             sys.stderr.write("Failed to fetch feed: %s\n" %(url))
666             return
667
668     fp = feedparser.parse(feedhandle)
669     db = dbm.open(os.path.join(statedir, "seen"), "c")
670     for item in fp["items"]:
671         # have we seen it before?
672         # need to work out what the content is first...
673
674         if item.has_key("content"):
675             content = item["content"][0]["value"]
676         else:
677             content = item["summary"]
678
679         md5sum = md5.md5(content.encode("utf-8")).hexdigest()
680
681         prevmessageid = None
682
683         # check if there's a guid too - if that exists and we match the md5,
684         # return
685         if item.has_key("guid"):
686             if db.has_key(url + "|" + item["guid"]):
687                 data = db[url + "|" + item["guid"]]
688                 data = cgi.parse_qs(data)
689                 if data["contentmd5"][0] == md5sum:
690                     continue
691
692         if db.has_key(url + "|" + item["link"]):
693             data = db[url + "|" + item["link"]]
694             data = cgi.parse_qs(data)
695             if data.has_key("message-id"):
696                 prevmessageid = data["message-id"][0]
697             if data["contentmd5"][0] == md5sum:
698                 continue
699
700         try:
701             author = item["author"]
702         except:
703             author = url
704
705         # create a basic email message
706         msg = MIMEMultipart("alternative")
707         messageid = "<" \
708             + datetime.datetime.now().strftime("%Y%m%d%H%M") \
709             + "." \
710             + "".join( \
711                 [random.choice( \
712                     string.ascii_letters + string.digits \
713                     ) for a in range(0,6) \
714                 ]) + "@" + socket.gethostname() + ">"
715         msg.add_header("Message-ID", messageid)
716         msg.set_unixfrom("\"%s\" <rss2maildir@localhost>" %(url))
717         msg.add_header("From", "\"%s\" <rss2maildir@localhost>" %(author))
718         msg.add_header("To", "\"%s\" <rss2maildir@localhost>" %(url))
719         if prevmessageid:
720             msg.add_header("References", prevmessageid)
721         createddate = datetime.datetime.now() \
722             .strftime("%a, %e %b %Y %T -0000")
723         try:
724             createddate = datetime.datetime(*item["updated_parsed"][0:6]) \
725                 .strftime("%a, %e %b %Y %T -0000")
726         except:
727             pass
728         msg.add_header("Date", createddate)
729         subj_gen = HTML2Text()
730         title = item["title"].encode("utf-8")
731         title = re.sub(u'<', u'&lt;', title)
732         title = re.sub(u'>', u'&gt;', title)
733         subj_gen.feed(title)
734         msg.add_header("Subject", subj_gen.gettext())
735         msg.set_default_type("text/plain")
736
737         htmlcontent = content.encode("utf-8")
738         htmlcontent = "%s\n\n<p>Item URL: <a href='%s'>%s</a></p>" %( \
739             content, \
740             item["link"], \
741             item["link"] )
742         htmlpart = MIMEText(htmlcontent.encode("utf-8"), "html", "utf-8")
743         textparser = HTML2Text()
744         textparser.feed(content.encode("utf-8"))
745         textcontent = textparser.gettext()
746         textcontent = "%s\n\nItem URL: %s" %( \
747             textcontent, \
748             item["link"] )
749         textpart = MIMEText(textcontent.encode("utf-8"), "plain", "utf-8")
750         msg.attach(textpart)
751         msg.attach(htmlpart)
752
753         # start by working out the filename we should be writting to, we do
754         # this following the normal maildir style rules
755         fname = str(os.getpid()) \
756             + "." + socket.gethostname() \
757             + "." + "".join( \
758                 [random.choice( \
759                     string.ascii_letters + string.digits \
760                     ) for a in range(0,10) \
761                 ]) + "." \
762             + datetime.datetime.now().strftime('%s')
763         fn = os.path.join(maildir, "tmp", fname)
764         fh = open(fn, "w")
765         fh.write(msg.as_string())
766         fh.close()
767         # now move it in to the new directory
768         newfn = os.path.join(maildir, "new", fname)
769         os.link(fn, newfn)
770         os.unlink(fn)
771
772         # now add to the database about the item
773         if prevmessageid:
774             messageid = prevmessageid + " " + messageid
775         if item.has_key("guid") and item["guid"] != item["link"]:
776             data = urllib.urlencode(( \
777                 ("message-id", messageid), \
778                 ("created", createddate), \
779                 ("contentmd5", md5sum) \
780                 ))
781             db[url + "|" + item["guid"]] = data
782             try:
783                 data = db[url + "|" + item["link"]]
784                 data = cgi.parse_qs(data)
785                 newdata = urllib.urlencode(( \
786                     ("message-id", messageid), \
787                     ("created", data["created"][0]), \
788                     ("contentmd5", data["contentmd5"][0]) \
789                     ))
790                 db[url + "|" + item["link"]] = newdata
791             except:
792                 db[url + "|" + item["link"]] = data
793         else:
794             data = urllib.urlencode(( \
795                 ("message-id", messageid), \
796                 ("created", createddate), \
797                 ("contentmd5", md5sum) \
798                 ))
799             db[url + "|" + item["link"]] = data
800
801     if headers:
802         data = []
803         for header in headers:
804             if header[0] in \
805                 ["content-md5", "etag", "last-modified", "content-length"]:
806                 data.append((header[0], header[1]))
807         if len(data) > 0:
808             data = urllib.urlencode(data)
809             feeddb[url] = data
810
811     db.close()
812     feeddb.close()
813
814 if __name__ == "__main__":
815     # This only gets executed if we really called the program
816     # first off, parse the command line arguments
817
818     oparser = OptionParser()
819     oparser.add_option(
820         "-c", "--conf", dest="conf",
821         help="location of config file"
822         )
823     oparser.add_option(
824         "-s", "--statedir", dest="statedir",
825         help="location of directory to store state in"
826         )
827
828     (options, args) = oparser.parse_args()
829
830     # check for the configfile
831
832     configfile = None
833
834     if options.conf != None:
835         # does the file exist?
836         try:
837             os.stat(options.conf)
838             configfile = options.conf
839         except:
840             # should exit here as the specified file doesn't exist
841             sys.stderr.write( \
842                 "Config file %s does not exist. Exiting.\n" %(options.conf,))
843             sys.exit(2)
844     else:
845         # check through the default locations
846         try:
847             os.stat("%s/.rss2maildir.conf" %(os.environ["HOME"],))
848             configfile = "%s/.rss2maildir.conf" %(os.environ["HOME"],)
849         except:
850             try:
851                 os.stat("/etc/rss2maildir.conf")
852                 configfile = "/etc/rss2maildir.conf"
853             except:
854                 sys.stderr.write("No config file found. Exiting.\n")
855                 sys.exit(2)
856
857     # Right - if we've got this far, we've got a config file, now for the hard
858     # bits...
859
860     scp = SafeConfigParser()
861     scp.read(configfile)
862
863     maildir_root = "RSSMaildir"
864     state_dir = "state"
865
866     if options.statedir != None:
867         state_dir = options.statedir
868         try:
869             mode = os.stat(state_dir)[stat.ST_MODE]
870             if not stat.S_ISDIR(mode):
871                 sys.stderr.write( \
872                     "State directory (%s) is not a directory\n" %(state_dir))
873                 sys.exit(1)
874         except:
875             # try to make the directory
876             try:
877                 os.mkdir(state_dir)
878             except:
879                 sys.stderr.write("Couldn't create statedir %s" %(state_dir))
880                 sys.exit(1)
881     elif scp.has_option("general", "state_dir"):
882         new_state_dir = scp.get("general", "state_dir")
883         try:
884             mode = os.stat(new_state_dir)[stat.ST_MODE]
885             if not stat.S_ISDIR(mode):
886                 sys.stderr.write( \
887                     "State directory (%s) is not a directory\n" %(state_dir))
888                 sys.exit(1)
889             else:
890                 state_dir = new_state_dir
891         except:
892             # try to create it
893             try:
894                 os.mkdir(new_state_dir)
895                 state_dir = new_state_dir
896             except:
897                 sys.stderr.write( \
898                     "Couldn't create state directory %s\n" %(new_state_dir))
899                 sys.exit(1)
900     else:
901         try:
902             mode = os.stat(state_dir)[stat.ST_MODE]
903             if not stat.S_ISDIR(mode):
904                 sys.stderr.write( \
905                     "State directory %s is not a directory\n" %(state_dir))
906                 sys.exit(1)
907         except:
908             try:
909                 os.mkdir(state_dir)
910             except:
911                 sys.stderr.write( \
912                     "State directory %s could not be created\n" %(state_dir))
913                 sys.exit(1)
914
915     if scp.has_option("general", "maildir_root"):
916         maildir_root = scp.get("general", "maildir_root")
917
918     try:
919         mode = os.stat(maildir_root)[stat.ST_MODE]
920         if not stat.S_ISDIR(mode):
921             sys.stderr.write( \
922                 "Maildir Root %s is not a directory\n" \
923                 %(maildir_root))
924             sys.exit(1)
925     except:
926         try:
927             os.mkdir(maildir_root)
928         except:
929             sys.stderr.write("Couldn't create Maildir Root %s\n" \
930                 %(maildir_root))
931             sys.exit(1)
932
933     feeds = scp.sections()
934     try:
935         feeds.remove("general")
936     except:
937         pass
938
939     for section in feeds:
940         # check if the directory exists
941         maildir = None
942         try:
943             maildir = scp.get(section, "maildir")
944         except:
945             maildir = section
946
947         maildir = urllib.urlencode(((section, maildir),)).split("=")[1]
948         maildir = os.path.join(maildir_root, maildir)
949
950         try:
951             exists = os.stat(maildir)
952             if stat.S_ISDIR(exists[stat.ST_MODE]):
953                 # check if there's a new, cur and tmp directory
954                 try:
955                     mode = os.stat(os.path.join(maildir, "cur"))[stat.ST_MODE]
956                 except:
957                     os.mkdir(os.path.join(maildir, "cur"))
958                     if not stat.S_ISDIR(mode):
959                         sys.stderr.write("Broken maildir: %s\n" %(maildir))
960                 try:
961                     mode = os.stat(os.path.join(maildir, "tmp"))[stat.ST_MODE]
962                 except:
963                     os.mkdir(os.path.join(maildir, "tmp"))
964                     if not stat.S_ISDIR(mode):
965                         sys.stderr.write("Broken maildir: %s\n" %(maildir))
966                 try:
967                     mode = os.stat(os.path.join(maildir, "new"))[stat.ST_MODE]
968                     if not stat.S_ISDIR(mode):
969                         sys.stderr.write("Broken maildir: %s\n" %(maildir))
970                 except:
971                     os.mkdir(os.path.join(maildir, "new"))
972             else:
973                 sys.stderr.write("Broken maildir: %s\n" %(maildir))
974         except:
975             try:
976                 os.mkdir(maildir)
977             except:
978                 sys.stderr.write("Couldn't create root maildir %s\n" \
979                     %(maildir))
980                 sys.exit(1)
981             try:
982                 os.mkdir(os.path.join(maildir, "new"))
983                 os.mkdir(os.path.join(maildir, "cur"))
984                 os.mkdir(os.path.join(maildir, "tmp"))
985             except:
986                 sys.stderr.write( \
987                     "Couldn't create required maildir directories for %s\n" \
988                     %(section,))
989                 sys.exit(1)
990
991         # right - we've got the directories, we've got the section, we know the
992         # url... lets play!
993
994         parse_and_deliver(maildir, section, state_dir)