]> git.sommitrealweird.co.uk Git - rss2maildir.git/blob - rss2maildir.py
6a24b94edc40a119845ea24caad6859c3f7c4b88
[rss2maildir.git] / rss2maildir.py
1 #!/usr/bin/python
2 # coding=utf-8
3
4 # rss2maildir.py - RSS feeds to Maildir 1 email per item
5 # Copyright (C) 2007  Brett Parker <iDunno@sommitrealweird.co.uk>
6
7 # This program is free software: you can redistribute it and/or modify
8 # it under the terms of the GNU General Public License as published by
9 # the Free Software Foundation, either version 3 of the License, or
10 # (at your option) any later version.
11
12 # This program is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 # GNU General Public License for more details.
16
17 # You should have received a copy of the GNU General Public License
18 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
19
20 import sys
21 import os
22 import stat
23 import httplib
24 import urllib
25
26 import feedparser
27
28 from email.MIMEMultipart import MIMEMultipart
29 from email.MIMEText import MIMEText
30
31 import datetime
32 import random
33 import string
34 import textwrap
35
36 import socket
37
38 from optparse import OptionParser
39 from ConfigParser import SafeConfigParser
40
41 from base64 import b64encode
42 import md5
43
44 import cgi
45 import dbm
46
47 from HTMLParser import HTMLParser
48
class HTML2Text(HTMLParser):
    """Render an HTML fragment as plain text wrapped to ``textwidth`` columns.

    Feed markup with ``feed()`` and collect the result with ``gettext()``.
    Headings are underlined, list items get `` * `` or numbered markers,
    links are written as ```text`__`` with their targets appended by
    ``gettext()`` as ``__ url`` lines, and images are written as ``|alt|``
    with a matching ``.. |alt| image:: url`` line appended by ``gettext()``.
    """

    # Named entities understood by handle_entityref().
    entities = {
        u'amp': u'&',
        u'lt': u'<',
        u'gt': u'>',
        u'pound': u'£',
        u'copy': u'©',
        u'apos': u'\'',
        u'quot': u'"',
        u'nbsp': u' ',
        u'ldquo': u'“',
        u'rdquo': u'”',
        u'lsquo': u'‘',
        u'rsquo': u'’',
        u'laquo': u'«',
        u'raquo': u'»',
        u'lsaquo': u'‹',
        u'rsaquo': u'›',
        u'bull': u'•',
        u'middot': u'·',
        u'deg': u'°',
        u'hellip': u'…',
        u'trade': u'™',
        u'reg': u'®',
        u'agrave': u'à',
        u'Agrave': u'À',
        u'egrave': u'è',
        u'Egrave': u'È',
        u'igrave': u'ì',
        u'Igrave': u'Ì',
        u'ograve': u'ò',
        u'Ograve': u'Ò',
        u'ugrave': u'ù',
        u'Ugrave': u'Ù',
        u'aacute': u'á',
        u'Aacute': u'Á',
        u'eacute': u'é',
        u'Eacute': u'É',
        u'iacute': u'í',
        u'Iacute': u'Í',
        u'oacute': u'ó',
        u'Oacute': u'Ó',
        u'uacute': u'ú',
        u'Uacute': u'Ú',
        u'yacute': u'ý',
        u'Yacute': u'Ý',
        u'acirc': u'â',
        u'Acirc': u'Â',
        u'ecirc': u'ê',
        u'Ecirc': u'Ê',
        u'icirc': u'î',
        u'Icirc': u'Î',
        u'ocirc': u'ô',
        u'Ocirc': u'Ô',
        u'ucirc': u'û',
        u'Ucirc': u'Û',
        u'atilde': u'ã',
        u'Atilde': u'Ã',
        u'ntilde': u'ñ',
        u'Ntilde': u'Ñ',
        u'otilde': u'õ',
        u'Otilde': u'Õ',
        u'auml': u'ä',
        u'Auml': u'Ä',
        u'euml': u'ë',
        u'Euml': u'Ë',
        u'iuml': u'ï',
        u'Iuml': u'Ï',
        u'ouml': u'ö',
        u'Ouml': u'Ö',
        u'uuml': u'ü',
        u'Uuml': u'Ü',
        u'yuml': u'ÿ',
        u'Yuml': u'Ÿ',
        u'iexcl': u'¡',
        u'iquest': u'¿',
        u'ccedil': u'ç',
        u'Ccedil': u'Ç',
        u'oelig': u'œ',
        u'OElig': u'Œ',
        u'szlig': u'ß',
        u'oslash': u'ø',
        u'Oslash': u'Ø',
        u'aring': u'å',
        u'Aring': u'Å',
        u'aelig': u'æ',
        u'AElig': u'Æ',
        u'thorn': u'þ',
        u'THORN': u'Þ',
        u'eth': u'ð',
        u'ETH': u'Ð',
        }

    # Tags whose text is flushed to the output as a block of its own.
    blockleveltags = [
        u'h1',
        u'h2',
        u'h3',
        u'h4',
        u'h5',
        u'h6',
        u'pre',
        u'p',
        u'ul',
        u'ol',
        u'dl',
        u'li',
        u'dt',
        u'dd',
        u'div',
        #u'blockquote',
        ]

    # Tags that open a list and therefore affect indentation.
    liststarttags = [
        u'ul',
        u'ol',
        u'dl',
        ]

    # Block tags that do not close the currently open tag when started.
    cancontainflow = [
        u'div',
        u'li',
        u'dd',
        u'blockquote',
    ]

    def __init__(self, textwidth=70):
        """Create a converter that wraps its output at ``textwidth`` columns."""
        self.text = u''          # finished output so far
        self.curdata = u''       # text collected for the innermost open tag
        self.textwidth = textwidth
        self.opentags = []       # stack of currently open tag names
        self.indentlevel = 0
        self.ignorenodata = False
        self.listcount = []      # next item number for each open <ol>
        self.urls = []           # link targets, in document order
        self.images = {}         # substitution name -> {"url": ...}
        HTMLParser.__init__(self)

    @staticmethod
    def _to_unicode(value):
        """Return ``value`` as text, decoding UTF-8 byte strings.

        ``None`` (an attribute given without a value) becomes an empty
        string; values that are already text are returned untouched.
        """
        if value is None:
            return u''
        if isinstance(value, bytes):
            return value.decode('utf-8')
        return value

    @staticmethod
    def _squash(text):
        """Collapse runs of ASCII whitespace in ``text`` to single spaces.

        The split is done on the UTF-8 encoded form so that only ASCII
        whitespace separates words.
        """
        words = text.encode("utf-8").split()
        return u' '.join([word.decode("utf-8") for word in words])

    def handle_starttag(self, tag, attrs):
        """Track an opening tag, flushing pending text where a block starts."""
        tag_name = tag.lower()
        if tag_name in self.blockleveltags:
            # handle starting a new block - unless we're in a block element
            # that can contain other blocks, we'll assume that we want to close
            # the container
            if len(self.opentags) > 1 and self.opentags[-1] == u'li':
                self.handle_curdata()

            if tag_name == u'ol':
                self.handle_curdata()
                self.listcount.append(1)
                self.listlevel = len(self.listcount) - 1

            if tag_name == u'dl':
                self.indentlevel = self.indentlevel + 4

            if tag_name in self.liststarttags:
                # a list nested in another list indents by the width of the
                # enclosing list's marker
                for prev_listtag in reversed(self.opentags[-3:-1]):
                    if prev_listtag in [u'dl', u'ol']:
                        self.indentlevel = self.indentlevel + 4
                        break
                    elif prev_listtag == u'ul':
                        self.indentlevel = self.indentlevel + 3
                        break

            if len(self.opentags) > 0:
                self.handle_curdata()
                if tag_name not in self.cancontainflow:
                    self.opentags.pop()
            self.opentags.append(tag_name)
        else:
            if tag_name == "span":
                return

            # NOTE: dd and dt are listed in blockleveltags, so with the
            # default tag lists the first two branches are not reached.
            if tag_name == u'dd' and len(self.opentags) > 1 \
                and self.opentags[-1] == u'dt':
                self.handle_curdata()
                self.opentags.pop()
            elif tag_name == u'dt' and len(self.opentags) > 1 \
                and self.opentags[-1] == u'dd':
                self.handle_curdata()
                self.opentags.pop()
            elif tag_name == u'a':
                for attr in attrs:
                    if attr[0].lower() == u'href':
                        self.urls.append(self._to_unicode(attr[1]))
                self.curdata = self.curdata + u'`'
                self.opentags.append(tag_name)
                return
            elif tag_name == u'img':
                self.handle_image(attrs)
                return
            elif tag_name == u'br':
                self.handle_br()
                return
            else:
                # we don't know the tag, so lets avoid handling it!
                return

    def handle_startendtag(self, tag, attrs):
        """Handle self-closing tags; only ``br`` and ``img`` are acted on."""
        if tag.lower() == u'br':
            self.handle_br()
        elif tag.lower() == u'img':
            self.handle_image(attrs)
            return

    def handle_br(self):
        """Flush pending text, then emit a line break via a temporary tag."""
        self.handle_curdata()
        self.opentags.append(u'br')
        self.handle_curdata()
        self.opentags.pop()

    def handle_image(self, attrs):
        """Record an image and append its ``|name|`` marker to the text.

        The name is the alt text, or the URL when there is no alt text.
        When the same alt text is used for different URLs, underscores are
        appended until the name is unused or already maps to this URL.
        """
        alt = u''
        url = u''
        for attr in attrs:
            if attr[0] == 'alt':
                alt = self._to_unicode(attr[1])
            elif attr[0] == 'src':
                url = self._to_unicode(attr[1])
        if not url:
            return

        if alt:
            name = alt
            while name in self.images and self.images[name]["url"] != url:
                name = name + u"_"
        else:
            name = url
        if name not in self.images:
            self.images[name] = {"url": url}
        self.curdata = self.curdata + u'|%s|' % (name,)

    def handle_curdata(self):
        """Format the pending text for the innermost open tag.

        Block-level tags move the text into ``self.text`` and clear
        ``self.curdata``; an ``a`` tag only closes its link markup in place.
        """

        if len(self.opentags) == 0:
            return

        tag_thats_done = self.opentags[-1]

        if len(self.curdata) == 0:
            return

        if tag_thats_done == u'br':
            if len(self.text) == 0 or self.text[-1] != '\n':
                self.text = self.text + '\n'
                self.ignorenodata = True
            return

        if len(self.curdata.strip()) == 0:
            return

        if tag_thats_done in self.blockleveltags:
            newlinerequired = self.text != u''
            if self.ignorenodata:
                newlinerequired = False
            self.ignorenodata = False
            if newlinerequired:
                if tag_thats_done in [u'dt', u'dd', u'li'] \
                    and len(self.text) > 1 \
                    and self.text[-1] != u'\n':
                    self.text = self.text + u'\n'
                elif len(self.text) > 2 \
                    and self.text[-1] != u'\n' \
                    and self.text[-2] != u'\n':
                    self.text = self.text + u'\n\n'

        if tag_thats_done in ["h1", "h2", "h3", "h4", "h5", "h6"]:
            underline = u''
            underlinechar = u'='
            headingtext = " ".join(self.curdata.split())
            seperator = u'\n' + u' ' * self.indentlevel
            headingtext = seperator.join(
                textwrap.wrap(
                    headingtext,
                    self.textwidth - self.indentlevel))

            if tag_thats_done == u'h2':
                underlinechar = u'-'
            elif tag_thats_done != u'h1':
                underlinechar = u'~'

            if u'\n' in headingtext:
                # a wrapped heading is underlined across the full width
                underline = u' ' * self.indentlevel \
                    + underlinechar * (self.textwidth - self.indentlevel)
            else:
                underline = u' ' * self.indentlevel \
                    + underlinechar * len(headingtext)
            self.text = self.text \
                + headingtext + u'\n' \
                + underline
        elif tag_thats_done in [u'p', u'div']:
            paragraph = self._squash(self.curdata.strip())
            seperator = u'\n' + u' ' * self.indentlevel
            self.text = self.text \
                + u' ' * self.indentlevel \
                + seperator.join(
                    textwrap.wrap(
                        paragraph, self.textwidth - self.indentlevel))
        elif tag_thats_done == "pre":
            self.text = self.text + self.curdata
        elif tag_thats_done == u'blockquote':
            quote = self._squash(self.curdata)
            seperator = u'\n' + u' ' * self.indentlevel + u'> '
            if len(self.text) > 0 and self.text[-1] != u'\n':
                self.text = self.text + u'\n'
            self.text = self.text \
                + u'> ' \
                + seperator.join(
                    textwrap.wrap(
                        quote,
                        self.textwidth - self.indentlevel - 2))
            self.curdata = u''
        elif tag_thats_done == "li":
            # only ASCII whitespace is trimmed, internal spacing is kept
            item = self.curdata.encode("utf-8").strip().decode("utf-8")
            if len(self.text) > 0 and self.text[-1] != u'\n':
                self.text = self.text + u'\n'
            # work out if we're in an ol rather than a ul
            isul = None
            for thing in reversed(self.opentags[-4:]):
                if thing == 'ul':
                    isul = True
                    break
                elif thing == 'ol':
                    isul = False
                    break

            listindent = 3
            if not isul:
                listindent = 4

            listmarker = u' * '
            if isul is False:
                listmarker = u' %2d. ' % (self.listcount[-1])
                self.listcount[-1] = self.listcount[-1] + 1

            seperator = u'\n' \
                + u' ' * self.indentlevel \
                + u' ' * listindent
            self.text = self.text \
                + u' ' * self.indentlevel \
                + listmarker \
                + seperator.join(
                    textwrap.wrap(
                        item,
                        self.textwidth - self.indentlevel - listindent))
            self.curdata = u''
        elif tag_thats_done == u'dt':
            definition = self._squash(self.curdata)
            if len(self.text) > 0 and self.text[-1] != u'\n':
                self.text = self.text + u'\n\n'
            elif len(self.text) > 1 and self.text[-2] != u'\n':
                self.text = self.text + u'\n'
            definition = u' ' * (self.indentlevel - 4) + definition + "::"
            indentstring = u'\n' + u' ' * (self.indentlevel - 3)
            self.text = self.text \
                + indentstring.join(
                    textwrap.wrap(definition,
                        self.textwidth - self.indentlevel - 4))
            self.curdata = u''
        elif tag_thats_done == u'dd':
            definition = self._squash(self.curdata)
            if len(definition) > 0:
                if len(self.text) > 0 and self.text[-1] != u'\n':
                    self.text = self.text + u'\n'
                indentstring = u'\n' + u' ' * self.indentlevel
                self.text = self.text \
                    + indentstring \
                    + indentstring.join(
                        textwrap.wrap(
                            definition,
                            self.textwidth - self.indentlevel))
                self.curdata = u''
        elif tag_thats_done == u'a':
            # close the link markup opened by handle_starttag
            self.curdata = self.curdata + u'`__'
        elif tag_thats_done in self.liststarttags:
            pass

        if tag_thats_done in self.blockleveltags:
            self.curdata = u''

        self.ignorenodata = False

    def handle_endtag(self, tag):
        """Close ``tag``: flush pending text and unwind the open-tag stack."""
        self.ignorenodata = False
        # normalise first so the lookups below agree with handle_starttag
        tag = tag.lower()
        if tag == u"span":
            return

        try:
            tagindex = self.opentags.index(tag)
        except ValueError:
            # closing a tag that was never opened (or is not tracked)
            return

        if tag in [u'br', u'img']:
            return

        if tag == u'dl':
            self.indentlevel = self.indentlevel - 4

        if tag in self.liststarttags:
            if tag in [u'ol', u'dl', u'ul', u'dd']:
                self.handle_curdata()
                # find if there was a previous list level
                for prev_listtag in reversed(self.opentags[:-1]):
                    if prev_listtag in [u'ol', u'dl']:
                        self.indentlevel = self.indentlevel - 4
                        break
                    elif prev_listtag == u'ul':
                        self.indentlevel = self.indentlevel - 3
                        break

        if tag == u'ol':
            self.listcount = self.listcount[:-1]

        # advance to the innermost (last) occurrence of the tag
        while tag in self.opentags[tagindex + 1:]:
            tagindex = self.opentags.index(tag, tagindex + 1)

        if tagindex != len(self.opentags) - 1:
            # Assuming the data was for the last opened tag first
            self.handle_curdata()
            # Now kill the list to be a slice before this tag was opened
            self.opentags = self.opentags[:tagindex + 1]
        else:
            self.handle_curdata()
            if self.opentags[-1] == tag:
                self.opentags.pop()

    def handle_data(self, data):
        """Collect character data; bare text is treated as a paragraph."""
        if len(self.opentags) == 0:
            self.opentags.append(u'p')
        self.curdata = self.curdata + self._to_unicode(data)

    def handle_charref(self, name):
        """Append the character for a numeric reference.

        ``name`` is the reference without ``&#`` and ``;`` - decimal
        (``8217``) or hexadecimal (``x2019``).  A reference that cannot be
        converted to a character is kept literally.
        """
        try:
            if name[:1] in (u'x', u'X'):
                codepoint = int(name[1:], 16)
            else:
                codepoint = int(name)
            try:
                char = unichr(codepoint)
            except NameError:
                # interpreters without unichr() use chr() for text
                char = chr(codepoint)
        except (ValueError, OverflowError):
            char = u'&#' + name + u';'
        self.curdata = self.curdata + char

    def handle_entityref(self, name):
        """Append the character for a named entity.

        Names not present in ``entities`` are kept literally as ``&name;``.
        """
        if name in HTML2Text.entities:
            entity = HTML2Text.entities[name]
        elif name.startswith("#"):
            self.handle_charref(name[1:])
            return
        else:
            entity = u"&" + name + u";"

        self.curdata = self.curdata + entity

    def gettext(self):
        """Flush pending text and return the converted document.

        The text ends with exactly one newline, followed by one ``__ url``
        line per collected link and one ``.. |name| image:: url`` line per
        collected image.  The link and image collections are cleared.
        """
        self.handle_curdata()
        if len(self.text) == 0 or self.text[-1] != u'\n':
            self.text = self.text + u'\n'
        self.opentags = []
        if len(self.text) > 0:
            while len(self.text) > 1 and self.text[-1] == u'\n':
                self.text = self.text[:-1]
            self.text = self.text + u'\n'
        if len(self.urls) > 0:
            self.text = self.text + u'\n__ ' + u'\n__ '.join(self.urls) + u'\n'
            self.urls = []
        if len(self.images) > 0:
            # one substitution definition per line
            self.text = self.text + u'\n.. ' \
                + u'\n.. '.join(
                    [u"|%s| image:: %s" % (a, self.images[a]["url"])
                     for a in self.images]) + u'\n'
            self.images = {}
        return self.text
558
def open_url(method, url):
    """Issue an HTTP(S) request for ``url`` and return the 200 response.

    ``method`` is the HTTP verb (the callers use "HEAD" and "GET").  Up to
    three attempts are made; a 301/302/303/307 answer switches to the URL
    in its Location header (resolved against the current URL) for the next
    attempt.  Returns the ``httplib`` response object for a 200 answer, or
    ``None`` when no attempt produced one or the URL has no host part.
    """
    # urlparse is not imported at file level, so pull it in locally
    from urlparse import urljoin

    redirectcount = 0
    while redirectcount < 3:
        (scheme, rest) = urllib.splittype(url)
        (host, path) = urllib.splithost(rest)
        if not host:
            # not an absolute URL - nothing we can connect to
            return None
        (host, port) = urllib.splitport(host)

        if scheme == "https":
            connection_class = httplib.HTTPSConnection
            default_port = 443
        else:
            connection_class = httplib.HTTPConnection
            default_port = 80
        if port is None:
            port = default_port

        conn = None
        try:
            conn = connection_class("%s:%s" % (host, port))
            # "http://host" has an empty path; the request line needs "/"
            conn.request(method, path or "/")
            response = conn.getresponse()
            if response.status == 200:
                # the caller reads from the response, so the connection
                # must stay open here
                return response
            if response.status in [301, 302, 303, 307]:
                location = response.getheader("location")
                if location:
                    url = urljoin(url, location)
        except (httplib.HTTPException, socket.error):
            # best effort: a failed attempt just uses up one try
            pass

        if conn is not None:
            conn.close()
        redirectcount = redirectcount + 1
    return None
582
# Response headers stored per feed and compared on the next run to decide
# whether the feed needs to be fetched again.
_VALIDATOR_HEADERS = ("content-md5", "etag", "last-modified", "content-length")


def _random_token(length):
    """Return ``length`` random ASCII letters and digits."""
    alphabet = string.ascii_letters + string.digits
    return "".join([random.choice(alphabet) for _ in range(length)])


def _feed_has_changed(stored, headers):
    """Tell whether HEAD ``headers`` differ from the ``stored`` validators.

    ``stored`` is the parsed query string saved for the feed and
    ``headers`` the (name, value) pairs of the HEAD response, or ``None``
    when the HEAD request failed.  A failed request, a validator missing
    from ``stored`` and a response carrying no validator at all count as
    changed, since nothing proves the feed is the same.
    """
    if not headers:
        return True
    compared = False
    for (name, value) in headers:
        if name not in _VALIDATOR_HEADERS:
            continue
        compared = True
        if name not in stored or stored[name][0] != value:
            return True
    return not compared


def _deliver_item(maildir, url, item, db):
    """Write one feed item into ``maildir`` unless it was already delivered.

    ``db`` is the open "seen" database; it is consulted (keyed on the feed
    URL plus the item's guid and link) to skip items whose content md5 is
    unchanged, and updated after delivery.
    """
    # need to work out what the content is first...
    if item.has_key("content"):
        content = item["content"][0]["value"]
    else:
        content = item.get("summary", u"")

    # the link is both the database key and the "Item URL"; fall back to
    # the guid when a feed leaves it out
    if item.has_key("link"):
        link = item["link"]
    elif item.has_key("guid"):
        link = item["guid"]
    else:
        sys.stderr.write("Skipping item without link or guid in %s\n" % (url))
        return

    md5sum = md5.md5(content.encode("utf-8")).hexdigest()
    link_key = url + "|" + link
    prevmessageid = None

    # check if there's a guid too - if that exists and we match the md5,
    # there is nothing to deliver
    if item.has_key("guid"):
        guid_key = url + "|" + item["guid"]
        if db.has_key(guid_key):
            data = cgi.parse_qs(db[guid_key])
            if data["contentmd5"][0] == md5sum:
                return

    if db.has_key(link_key):
        data = cgi.parse_qs(db[link_key])
        if "message-id" in data:
            prevmessageid = data["message-id"][0]
        if data["contentmd5"][0] == md5sum:
            return

    if item.has_key("author"):
        author = item["author"]
    else:
        author = url

    # create a basic email message
    msg = MIMEMultipart("alternative")
    messageid = "<" \
        + datetime.datetime.now().strftime("%Y%m%d%H%M") \
        + "." + _random_token(6) \
        + "@" + socket.gethostname() + ">"
    msg.add_header("Message-ID", messageid)
    msg.set_unixfrom("\"%s\" <rss2maildir@localhost>" % (url))
    msg.add_header("From", "\"%s\" <rss2maildir@localhost>" % (author))
    msg.add_header("To", "\"%s\" <rss2maildir@localhost>" % (url))
    if prevmessageid:
        msg.add_header("References", prevmessageid)

    # the header is labelled -0000, so the fallback time has to be UTC
    # NOTE(review): %e and %T are not available on every platform's strftime
    dateformat = "%a, %e %b %Y %T -0000"
    createddate = datetime.datetime.utcnow().strftime(dateformat)
    try:
        createddate = datetime.datetime(*item["updated_parsed"][0:6]) \
            .strftime(dateformat)
    except (KeyError, TypeError, ValueError):
        # no usable update time in the item - keep the current time
        pass
    msg.add_header("Date", createddate)

    subj_gen = HTML2Text()
    subj_gen.feed(item.get("title", u"").encode("utf-8"))
    # gettext() wraps its output and ends it with a newline; a header
    # value has to be a single line
    msg.add_header("Subject", u" ".join(subj_gen.gettext().split()))
    msg.set_default_type("text/plain")

    # the link comes from the feed, so escape it before embedding it
    safelink = cgi.escape(link, True)
    htmlcontent = "%s\n\n<p>Item URL: <a href=\"%s\">%s</a></p>" % (
        content,
        safelink,
        safelink)
    htmlpart = MIMEText(htmlcontent.encode("utf-8"), "html", "utf-8")
    textparser = HTML2Text()
    textparser.feed(content.encode("utf-8"))
    textcontent = "%s\n\nItem URL: %s" % (
        textparser.gettext(),
        link)
    textpart = MIMEText(textcontent.encode("utf-8"), "plain", "utf-8")
    msg.attach(textpart)
    msg.attach(htmlpart)

    # start by working out the filename we should be writing to, we do
    # this following the normal maildir style rules
    # NOTE(review): strftime('%s') is not available on every platform
    fname = str(os.getpid()) \
        + "." + socket.gethostname() \
        + "." + _random_token(10) \
        + "." + datetime.datetime.now().strftime('%s')
    fn = os.path.join(maildir, "tmp", fname)
    fh = open(fn, "w")
    try:
        fh.write(msg.as_string())
    finally:
        fh.close()
    # now move it in to the new directory
    newfn = os.path.join(maildir, "new", fname)
    os.link(fn, newfn)
    os.unlink(fn)

    # now add to the database about the item
    if prevmessageid:
        messageid = prevmessageid + " " + messageid
    record = urllib.urlencode((
        ("message-id", messageid),
        ("created", createddate),
        ("contentmd5", md5sum),
        ))
    if item.has_key("guid") and item["guid"] != link:
        db[url + "|" + item["guid"]] = record
        # an existing entry for the link keeps its creation date and md5
        # and only picks up the new message id
        try:
            previous = cgi.parse_qs(db[link_key])
            linkrecord = urllib.urlencode((
                ("message-id", messageid),
                ("created", previous["created"][0]),
                ("contentmd5", previous["contentmd5"][0]),
                ))
        except (KeyError, IndexError):
            linkrecord = record
        db[link_key] = linkrecord
    else:
        db[link_key] = record


def parse_and_deliver(maildir, url, statedir):
    """Fetch the feed at ``url`` and deliver its new items to ``maildir``.

    Two dbm databases in ``statedir`` hold the state: "feeds" stores the
    validator headers of the last fetch of each feed, and "seen" stores
    what was delivered for each item.  A feed that is already known is
    checked with a HEAD request first and skipped when its validators are
    unchanged.  A message is written to stderr when the feed cannot be
    fetched.
    """
    # NOTE: has_key() is kept for the dbm and feedparser objects used here
    feeddb = dbm.open(os.path.join(statedir, "feeds"), "c")
    try:
        # first check if we know about this feed already
        if feeddb.has_key(url):
            stored = cgi.parse_qs(feeddb[url])
            response = open_url("HEAD", url)
            headheaders = None
            if response:
                headheaders = response.getheaders()
            if not _feed_has_changed(stored, headheaders):
                return # don't need to do anything, nothings changed.

        response = open_url("GET", url)
        if response is None:
            sys.stderr.write("Failed to fetch feed: %s\n" % (url))
            return
        headers = response.getheaders()

        fp = feedparser.parse(response)
        db = dbm.open(os.path.join(statedir, "seen"), "c")
        try:
            for item in fp["items"]:
                _deliver_item(maildir, url, item, db)
        finally:
            db.close()

        # remember the validators for the next run
        if headers:
            data = [(header[0], header[1]) for header in headers
                    if header[0] in _VALIDATOR_HEADERS]
            if len(data) > 0:
                feeddb[url] = urllib.urlencode(data)
    finally:
        feeddb.close()
773
def _die(message, status=1):
    """Write ``message`` to stderr and exit with ``status``."""
    sys.stderr.write(message)
    sys.exit(status)


def _ensure_directory(path, notdir_message, create_message):
    """Make sure ``path`` is a directory, creating it when it is missing.

    Exits with status 1 after writing ``notdir_message % path`` when the
    path exists but is not a directory, or ``create_message % path`` when
    it cannot be created.
    """
    if os.path.isdir(path):
        return
    if os.path.exists(path):
        _die(notdir_message % (path,))
    try:
        os.mkdir(path)
    except OSError:
        _die(create_message % (path,))


def _find_config_file(requested):
    """Return the config file to use, exiting with status 2 if none exists.

    ``requested`` is the path given on the command line, or ``None`` to
    look at $HOME/.rss2maildir.conf and then /etc/rss2maildir.conf.
    """
    if requested is not None:
        if os.path.exists(requested):
            return requested
        # should exit here as the specified file doesn't exist
        _die("Config file %s does not exist. Exiting.\n" % (requested,), 2)

    # check through the default locations
    candidates = []
    home = os.environ.get("HOME")
    if home is not None:
        candidates.append("%s/.rss2maildir.conf" % (home,))
    candidates.append("/etc/rss2maildir.conf")
    for candidate in candidates:
        if os.path.exists(candidate):
            return candidate
    _die("No config file found. Exiting.\n", 2)


def _prepare_maildir(maildir, section):
    """Create ``maildir`` and its new/cur/tmp directories where missing.

    Returns True when the maildir is usable.  Returns False after writing
    "Broken maildir" to stderr when the maildir or one of its three
    directories exists but is not a directory.  Exits with status 1 when
    a directory cannot be created.
    """
    if os.path.exists(maildir) and not os.path.isdir(maildir):
        sys.stderr.write("Broken maildir: %s\n" % (maildir))
        return False

    if not os.path.exists(maildir):
        try:
            os.mkdir(maildir)
        except OSError:
            _die("Couldn't create root maildir %s\n" % (maildir))

    usable = True
    for name in ("new", "cur", "tmp"):
        subdir = os.path.join(maildir, name)
        if os.path.isdir(subdir):
            continue
        if os.path.exists(subdir):
            sys.stderr.write("Broken maildir: %s\n" % (maildir))
            usable = False
            continue
        try:
            os.mkdir(subdir)
        except OSError:
            _die("Couldn't create required maildir directories for %s\n"
                % (section,))
    return usable


def main():
    """Read the configuration and deliver every configured feed.

    Every config section other than "general" names a feed URL; its
    optional "maildir" option names the maildir below the maildir root.
    """
    # first off, parse the command line arguments
    oparser = OptionParser()
    oparser.add_option(
        "-c", "--conf", dest="conf",
        help="location of config file"
        )
    oparser.add_option(
        "-s", "--statedir", dest="statedir",
        help="location of directory to store state in"
        )

    (options, args) = oparser.parse_args()

    configfile = _find_config_file(options.conf)

    # Right - if we've got this far, we've got a config file, now for the hard
    # bits...

    scp = SafeConfigParser()
    scp.read(configfile)

    maildir_root = "RSSMaildir"
    state_dir = "state"

    # the command line wins over the config file, which wins over the default
    if options.statedir is not None:
        state_dir = options.statedir
        _ensure_directory(
            state_dir,
            "State directory (%s) is not a directory\n",
            "Couldn't create statedir %s\n")
    elif scp.has_option("general", "state_dir"):
        state_dir = scp.get("general", "state_dir")
        _ensure_directory(
            state_dir,
            "State directory (%s) is not a directory\n",
            "Couldn't create state directory %s\n")
    else:
        _ensure_directory(
            state_dir,
            "State directory %s is not a directory\n",
            "State directory %s could not be created\n")

    if scp.has_option("general", "maildir_root"):
        maildir_root = scp.get("general", "maildir_root")

    _ensure_directory(
        maildir_root,
        "Maildir Root %s is not a directory\n",
        "Couldn't create Maildir Root %s\n")

    for section in scp.sections():
        if section == "general":
            continue

        # the maildir defaults to the section (feed URL) itself
        try:
            maildir = scp.get(section, "maildir")
        except Exception:
            maildir = section

        # quote the name so that it is a single path component
        maildir = urllib.quote_plus(str(maildir))
        maildir = os.path.join(maildir_root, maildir)

        if not _prepare_maildir(maildir, section):
            # delivering into a broken maildir cannot work
            continue

        # right - we've got the directories, we've got the section, we know the
        # url... lets play!

        parse_and_deliver(maildir, section, state_dir)


if __name__ == "__main__":
    # This only gets executed if we really called the program
    main()