]> git.sommitrealweird.co.uk Git - rss2maildir.git/blob - rss2maildir.py
Entity handling fixes
[rss2maildir.git] / rss2maildir.py
1 #!/usr/bin/python
2 # coding=utf-8
3
4 # rss2maildir.py - RSS feeds to Maildir 1 email per item
5 # Copyright (C) 2007  Brett Parker <iDunno@sommitrealweird.co.uk>
6
7 # This program is free software: you can redistribute it and/or modify
8 # it under the terms of the GNU General Public License as published by
9 # the Free Software Foundation, either version 3 of the License, or
10 # (at your option) any later version.
11
12 # This program is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 # GNU General Public License for more details.
16
17 # You should have received a copy of the GNU General Public License
18 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
19
20 import sys
21 import os
22 import stat
23 import httplib
24 import urllib
25
26 import feedparser
27
28 from email.MIMEMultipart import MIMEMultipart
29 from email.MIMEText import MIMEText
30
31 import datetime
32 import random
33 import string
34 import textwrap
35
36 import socket
37
38 from optparse import OptionParser
39 from ConfigParser import SafeConfigParser
40
41 from base64 import b64encode
42 import md5
43
44 import cgi
45 import dbm
46
47 from HTMLParser import HTMLParser
48
class HTML2Text(HTMLParser):
    """Render an HTML fragment as wrapped, reStructuredText-flavoured text.

    Feed markup with feed() and collect the result with gettext().  Block
    level elements (headings, paragraphs, lists, definition lists, pre)
    are laid out and wrapped to `textwidth` columns.  Links are written as
    anonymous reST references (`text`__) whose targets are listed after
    the body, and images become |substitution| references with matching
    ``image::`` directives.
    """

    # Named character entities that are replaced by their character.
    entities = {
        u'amp': u'&',
        u'lt': u'<',
        u'gt': u'>',
        u'pound': u'£',
        u'copy': u'©',
        u'apos': u'\'',
        u'quot': u'"',
        u'nbsp': u' ',
        u'ldquo': u'“',
        u'rdquo': u'”',
        u'lsquo': u'‘',
        u'rsquo': u'’',
        u'laquo': u'«',
        u'raquo': u'»',
        u'lsaquo': u'‹',
        u'rsaquo': u'›',
        u'bull': u'•',
        u'middot': u'·',
        u'deg': u'°',
        u'hellip': u'…',
        u'trade': u'™',
        u'reg': u'®',
        u'agrave': u'à',
        u'Agrave': u'À',
        u'egrave': u'è',
        u'Egrave': u'È',
        u'igrave': u'ì',
        u'Igrave': u'Ì',
        u'ograve': u'ò',
        u'Ograve': u'Ò',
        u'ugrave': u'ù',
        u'Ugrave': u'Ù',
        u'aacute': u'á',
        u'Aacute': u'Á',
        u'eacute': u'é',
        u'Eacute': u'É',
        u'iacute': u'í',
        u'Iacute': u'Í',
        u'oacute': u'ó',
        u'Oacute': u'Ó',
        u'uacute': u'ú',
        u'Uacute': u'Ú',
        u'yacute': u'ý',
        u'Yacute': u'Ý',
        u'acirc': u'â',
        u'Acirc': u'Â',
        u'ecirc': u'ê',
        u'Ecirc': u'Ê',
        u'icirc': u'î',
        u'Icirc': u'Î',
        u'ocirc': u'ô',
        u'Ocirc': u'Ô',
        u'ucirc': u'û',
        u'Ucirc': u'Û',
        u'atilde': u'ã',
        u'Atilde': u'Ã',
        u'ntilde': u'ñ',
        u'Ntilde': u'Ñ',
        u'otilde': u'õ',
        u'Otilde': u'Õ',
        u'auml': u'ä',
        u'Auml': u'Ä',
        u'euml': u'ë',
        u'Euml': u'Ë',
        u'iuml': u'ï',
        u'Iuml': u'Ï',
        u'ouml': u'ö',
        u'Ouml': u'Ö',
        u'uuml': u'ü',
        u'Uuml': u'Ü',
        u'yuml': u'ÿ',
        u'Yuml': u'Ÿ',
        u'iexcl': u'¡',
        u'iquest': u'¿',
        u'ccedil': u'ç',
        u'Ccedil': u'Ç',
        u'oelig': u'œ',
        u'OElig': u'Œ',
        u'szlig': u'ß',
        u'oslash': u'ø',
        u'Oslash': u'Ø',
        u'aring': u'å',
        u'Aring': u'Å',
        u'aelig': u'æ',
        u'AElig': u'Æ',
        u'thorn': u'þ',
        u'THORN': u'Þ',
        u'eth': u'ð',
        u'ETH': u'Ð',
        }

    # Tags that start a new block of output text.
    blockleveltags = [
        u'h1',
        u'h2',
        u'h3',
        u'h4',
        u'h5',
        u'h6',
        u'pre',
        u'p',
        u'ul',
        u'ol',
        u'dl',
        u'li',
        u'dt',
        u'dd',
        u'div',
        #u'blockquote',
        ]

    # Tags that open a list and therefore affect indentation.
    liststarttags = [
        u'ul',
        u'ol',
        u'dl',
        ]

    # Opening one of these does not pop the tag on top of the stack.
    cancontainflow = [
        u'div',
        u'li',
        u'dd',
        u'blockquote',
    ]

    def __init__(self,textwidth=70):
        """Create a converter that wraps its output at `textwidth` columns."""
        self.text = u''           # finished output
        self.curdata = u''        # character data not yet laid out
        self.textwidth = textwidth
        self.opentags = []        # stack of currently open tags
        self.indentlevel = 0      # columns of indent from list nesting
        self.ignorenodata = False
        self.listcount = []       # next item number for each open <ol>
        self.urls = []            # link targets, in document order
        self.images = {}          # substitution label -> {"url": ...}
        HTMLParser.__init__(self)

    @staticmethod
    def _to_unicode(value):
        """Return `value` as text: UTF-8 bytes are decoded, None becomes u''."""
        if value is None:
            # HTMLParser reports valueless attributes (<a href>) as None.
            return u''
        if isinstance(value, bytes):
            return value.decode('utf-8')
        return value

    @staticmethod
    def _collapse_whitespace(text):
        """Collapse runs of ASCII whitespace in `text` to single spaces."""
        # Splitting the UTF-8 bytes rather than the text keeps non-ASCII
        # spaces (e.g. a non-breaking space) from being treated as breaks.
        return b' '.join(text.encode('utf-8').split()).decode('utf-8')

    @staticmethod
    def _wrap(text, width):
        """Wrap `text` to `width` columns, never narrower than one column."""
        # Deep nesting can push the available width to zero or below,
        # which textwrap rejects with ValueError.
        return textwrap.wrap(text, max(1, width))

    def handle_starttag(self, tag, attrs):
        """Open `tag`, flushing pending data when it starts a new block."""
        tag_name = tag.lower()
        if tag_name in self.blockleveltags:
            # handle starting a new block - unless we're in a block element
            # that can contain other blocks, we'll assume that we want to close
            # the container
            if len(self.opentags) > 1 and self.opentags[-1] == u'li':
                self.handle_curdata()

            if tag_name == u'ol':
                self.handle_curdata()
                self.listcount.append(1)
                self.listlevel = len(self.listcount) - 1

            if tag_name in self.liststarttags:
                # A nested list is indented by the width of the enclosing
                # list's marker; only the two tags below the top of the
                # stack are inspected.
                for prev_listtag in reversed(self.opentags[-3:-1]):
                    if prev_listtag in [u'dl', u'ol']:
                        self.indentlevel = self.indentlevel + 4
                        break
                    elif prev_listtag == u'ul':
                        self.indentlevel = self.indentlevel + 3
                        break

            if len(self.opentags) > 0:
                self.handle_curdata()
                if tag_name not in self.cancontainflow:
                    self.opentags.pop()
            self.opentags.append(tag_name)
        elif tag_name == u'a':
            for name, value in attrs:
                if name.lower() == u'href':
                    self.urls.append(self._to_unicode(value))
            # Opening backquote of a reST anonymous reference; the closing
            # u'`__' is added by handle_curdata() when the tag ends.
            self.curdata = self.curdata + u'`'
            self.opentags.append(tag_name)
        elif tag_name == u'img':
            self.handle_image(attrs)
        elif tag_name == u'br':
            self.handle_br()
        # Any other inline tag (span included) is ignored; its character
        # data simply flows into the enclosing block.

    def handle_startendtag(self, tag, attrs):
        """Handle self-closing tags; only <br/> and <img/> have an effect."""
        tag_name = tag.lower()
        if tag_name == u'br':
            self.handle_br()
        elif tag_name == u'img':
            self.handle_image(attrs)

    def handle_br(self):
        """Flush pending data, then let a temporary 'br' tag add a break."""
        self.handle_curdata()
        self.opentags.append(u'br')
        self.handle_curdata()
        self.opentags.pop()

    def handle_image(self, attrs):
        """Record an image and append its |label| reference to the data.

        The label is the alt text, or the URL when there is none.  When a
        label is already taken by a different URL, underscores are
        appended until a free (or matching) label is found.
        """
        alt = u''
        url = u''
        for name, value in attrs:
            if name == 'alt':
                alt = self._to_unicode(value)
            elif name == 'src':
                url = self._to_unicode(value)
        if not url:
            return

        label = alt or url
        while label in self.images and self.images[label]["url"] != url:
            label = label + u'_'
        if label not in self.images:
            self.images[label] = {"url": url}
        self.curdata = self.curdata + u'|%s|' %(label,)

    def handle_curdata(self):
        """Lay out the pending data according to the innermost open tag.

        Does nothing when no tag is open or there is no pending data.  For
        block level tags the data is consumed; for an <a> tag the closing
        reference marker is appended and the data is kept for the
        enclosing block.
        """

        if len(self.opentags) == 0:
            return

        tag_thats_done = self.opentags[-1]

        if len(self.curdata) == 0:
            return

        if tag_thats_done == u'br':
            if len(self.text) == 0 or self.text[-1] != u'\n':
                self.text = self.text + u'\n'
                self.ignorenodata = True
            return

        if len(self.curdata.strip()) == 0:
            return

        if tag_thats_done in self.blockleveltags:
            # Separate this block from the previous output: list and
            # definition items get a single newline, other blocks a blank
            # line.
            newlinerequired = self.text != u''
            if self.ignorenodata:
                newlinerequired = False
            self.ignorenodata = False
            if newlinerequired:
                if tag_thats_done in [u'dt', u'dd', u'li'] \
                    and len(self.text) > 1 \
                    and self.text[-1] != u'\n':
                    self.text = self.text + u'\n'
                elif len(self.text) > 2 \
                    and self.text[-1] != u'\n' \
                    and self.text[-2] != u'\n':
                    self.text = self.text + u'\n\n'

        indent = u' ' * self.indentlevel

        if tag_thats_done in ["h1", "h2", "h3", "h4", "h5", "h6"]:
            headingtext = u" ".join(self.curdata.split())
            seperator = u'\n' + indent
            headingtext = seperator.join(
                self._wrap(headingtext, self.textwidth - self.indentlevel))

            # h1 is underlined with '=', h2 with '-', deeper levels with '~'.
            underlinechar = u'='
            if tag_thats_done == u'h2':
                underlinechar = u'-'
            elif tag_thats_done != u'h1':
                underlinechar = u'~'

            if u'\n' in headingtext:
                # A wrapped heading is underlined across the full width.
                underline = indent \
                    + underlinechar * (self.textwidth - self.indentlevel)
            else:
                underline = indent + underlinechar * len(headingtext)
            self.text = self.text \
                + headingtext + u'\n' \
                + underline
        elif tag_thats_done in [u'p', u'div']:
            paragraph = self._collapse_whitespace(self.curdata.strip())
            seperator = u'\n' + indent
            self.text = self.text \
                + indent \
                + seperator.join(
                    self._wrap(paragraph, self.textwidth - self.indentlevel))
        elif tag_thats_done == "pre":
            # Preformatted text is copied through untouched.
            self.text = self.text + self.curdata
        elif tag_thats_done == u'blockquote':
            # NOTE(review): blockquote is commented out of blockleveltags,
            # so nothing in this class currently pushes it on the stack.
            quote = self._collapse_whitespace(self.curdata)
            seperator = u'\n' + indent + u'> '
            if len(self.text) > 0 and self.text[-1] != u'\n':
                self.text = self.text + u'\n'
            self.text = self.text \
                + u'> ' \
                + seperator.join(
                    self._wrap(
                        quote, self.textwidth - self.indentlevel - 2))
            self.curdata = u''
        elif tag_thats_done == "li":
            item = self.curdata.encode('utf-8').strip().decode('utf-8')
            if len(self.text) > 0 and self.text[-1] != u'\n':
                self.text = self.text + u'\n'
            # work out if we're in an ol rather than a ul
            isul = None
            for thing in reversed(self.opentags[-4:]):
                if thing == 'ul':
                    isul = True
                    break
                elif thing == 'ol':
                    isul = False
                    break

            listindent = 3
            if not isul:
                listindent = 4

            listmarker = u' * '
            if isul is False and self.listcount:
                listmarker = u' %2d. ' %(self.listcount[-1])
                self.listcount[-1] = self.listcount[-1] + 1

            seperator = u'\n' + indent + u' ' * listindent
            self.text = self.text \
                + indent \
                + listmarker \
                + seperator.join(
                    self._wrap(
                        item,
                        self.textwidth - self.indentlevel - listindent))
            self.curdata = u''
        elif tag_thats_done == u'dt':
            definition = self._collapse_whitespace(self.curdata)
            if len(self.text) > 0 and self.text[-1] != u'\n':
                self.text = self.text + u'\n\n'
            elif len(self.text) > 1 and self.text[-2] != u'\n':
                self.text = self.text + u'\n'
            definition = indent + definition + u"::"
            indentstring = u'\n' + u' ' * (self.indentlevel + 1)
            self.text = self.text \
                + indentstring.join(
                    self._wrap(
                        definition, self.textwidth - self.indentlevel - 1))
            self.curdata = u''
        elif tag_thats_done == u'dd':
            definition = self._collapse_whitespace(self.curdata)
            if len(definition) > 0:
                if len(self.text) > 0 and self.text[-1] != u'\n':
                    self.text = self.text + u'\n'
                indentstring = u'\n' + u' ' * (self.indentlevel + 4)
                self.text = self.text \
                    + u' ' * (self.indentlevel + 4) \
                    + indentstring.join(
                        self._wrap(
                            definition,
                            self.textwidth - self.indentlevel - 4))
                self.curdata = u''
        elif tag_thats_done == u'a':
            # Close the anonymous reference opened in handle_starttag();
            # the data stays pending for the enclosing block.
            self.curdata = self.curdata + u'`__'

        if tag_thats_done in self.blockleveltags:
            self.curdata = u''

        self.ignorenodata = False

    def handle_endtag(self, tag):
        """Close `tag`: flush pending data and unwind the open-tag stack."""
        self.ignorenodata = False
        tag = tag.lower()
        if tag == "span":
            return

        if tag not in self.opentags:
            return

        if tag in [u'br', u'img']:
            return

        if tag in self.liststarttags:
            self.handle_curdata()
            # Undo the indent added when this list was opened inside
            # another list.
            # NOTE(review): this scans every enclosing tag, whereas
            # handle_starttag() only looks two levels up - confirm the
            # asymmetry is intended.
            for prev_listtag in reversed(self.opentags[:-1]):
                if prev_listtag in [u'ol', u'dl']:
                    self.indentlevel = self.indentlevel - 4
                    break
                elif prev_listtag == u'ul':
                    self.indentlevel = self.indentlevel - 3
                    break

        if tag == u'ol':
            self.listcount = self.listcount[:-1]

        # Index of the innermost (last) open occurrence of this tag.
        tagindex = len(self.opentags) - 1 - self.opentags[::-1].index(tag)

        # Assuming the data was for the last opened tag first
        self.handle_curdata()
        if tagindex != len(self.opentags) - 1:
            # Drop everything opened after this tag.
            # NOTE(review): the closed tag itself stays on the stack here;
            # kept as in the original because later layout relies on it.
            self.opentags = self.opentags[:tagindex + 1]
        else:
            self.opentags.pop()

    def handle_data(self, data):
        """Collect character data, opening an implicit <p> when needed."""
        if len(self.opentags) == 0:
            self.opentags.append(u'p')
        self.curdata = self.curdata + self._to_unicode(data)

    def handle_entityref(self, name):
        """Append the character for &name;, or the raw reference if unknown."""
        if name in HTML2Text.entities:
            self.handle_data(HTML2Text.entities[name])
        elif name[:1] == "#":
            self.handle_charref(name[1:])
        else:
            self.handle_data(u"&" + self._to_unicode(name) + u";")

    def handle_charref(self, name):
        """Append the character for a numeric reference (&#233; or &#xE9;)."""
        try:
            to_char = unichr
        except NameError:
            # Python 3 has no unichr(); chr() covers the same range.
            to_char = chr
        name = self._to_unicode(name)
        try:
            if name[:1] in (u'x', u'X'):
                char = to_char(int(name[1:], 16))
            else:
                char = to_char(int(name))
        except (ValueError, OverflowError):
            # Not a valid code point - keep the reference visible.
            char = u'&#' + name + u';'
        self.handle_data(char)

    def gettext(self):
        """Flush pending data and return the text with link/image targets.

        The collected link and image targets are appended to the text and
        then reset.
        """
        self.handle_curdata()
        if len(self.text) == 0 or self.text[-1] != u'\n':
            self.text = self.text + u'\n'
        self.opentags = []
        if len(self.text) > 0:
            # Reduce any run of trailing newlines before adding the final one.
            while len(self.text) > 1 and self.text[-1] == u'\n':
                self.text = self.text[:-1]
            self.text = self.text + u'\n'
        if len(self.urls) > 0:
            self.text = self.text + u'\n__ ' + u'\n__ '.join(self.urls) + u'\n'
            self.urls = []
        if len(self.images) > 0:
            # One directive per line, in a stable (sorted) order.
            self.text = self.text + u'\n.. ' \
                + u'\n.. '.join(
                    [u"|%s| image:: %s" %(a, self.images[a]["url"])
                     for a in sorted(self.images)]) + u'\n'
            self.images = {}
        return self.text
552
def open_url(method, url):
    """Issue an HTTP(S) request for `url` and return the 200 response.

    method -- HTTP verb to use, e.g. "GET" or "HEAD".
    url    -- absolute http:// or https:// URL.

    Makes at most three attempts.  A 301/302/303/307 answer switches the
    next attempt to the (possibly relative) Location target; any other
    failure simply uses up an attempt.  Returns the httplib response
    object on a 200 status - its connection is left open so the caller
    can read the body - or None when no attempt succeeded.
    """
    # Local import: urlparse is not among the file's top-level imports.
    import urlparse

    redirectcount = 0
    while redirectcount < 3:
        conn = None
        try:
            (scheme, rest) = urllib.splittype(url)
            (host, path) = urllib.splithost(rest)
            (host, port) = urllib.splitport(host)
            if scheme == "https":
                conn = httplib.HTTPSConnection(host, int(port or 443))
            else:
                conn = httplib.HTTPConnection(host, int(port or 80))
            # A URL without a path ("http://host") must still request "/".
            conn.request(method, path or "/")
            response = conn.getresponse()
            if response.status == 200:
                return response
            if response.status in [301, 302, 303, 307]:
                location = response.getheader("location")
                if location:
                    # Location may be relative to the URL just requested.
                    url = urlparse.urljoin(url, location)
            conn.close()
        except Exception:
            # Best effort: a malformed URL or network error costs one
            # attempt instead of aborting the run.
            if conn is not None:
                conn.close()
        redirectcount = redirectcount + 1
    return None
576
def _random_token(length):
    """Return `length` random ASCII letters and digits (for unique names)."""
    alphabet = string.ascii_letters + string.digits
    return "".join([random.choice(alphabet) for _ in range(length)])


def _feed_changed(headers, cached):
    """Tell whether HEAD `headers` differ from the `cached` header values.

    headers -- list of (lower-case name, value) pairs, or None when the
               HEAD request failed.
    cached  -- dict of name -> [value] as produced by cgi.parse_qs().

    Missing headers or a header that was never cached count as changed.
    """
    if not headers:
        return True
    for (name, value) in headers:
        if name in ("content-length", "etag", "last-modified", "content-md5"):
            try:
                if value != cached[name][0]:
                    return True
            except (KeyError, IndexError):
                return True
    return False


def _deliver_item(maildir, url, item, db):
    """Write one feed `item` into `maildir` unless `db` says it was seen.

    An item counts as seen when the md5 of its content matches the record
    stored under its guid or its link.  A changed item is delivered again
    with a References header pointing at the earlier message(s).
    """
    # need to work out what the content is first...
    if item.has_key("content"):
        content = item["content"][0]["value"]
    else:
        content = item.get("summary", u"")

    link = item.get("link") or item.get("guid")
    if not link:
        sys.stderr.write("Skipping item without link or guid in %s\n" %(url))
        return

    md5sum = md5.md5(content.encode("utf-8")).hexdigest()

    prevmessageid = None

    # check if there's a guid too - if that exists and we match the md5,
    # we're done
    if item.has_key("guid"):
        if db.has_key(url + "|" + item["guid"]):
            seen = cgi.parse_qs(db[url + "|" + item["guid"]])
            if seen["contentmd5"][0] == md5sum:
                return

    if db.has_key(url + "|" + link):
        seen = cgi.parse_qs(db[url + "|" + link])
        if seen.has_key("message-id"):
            prevmessageid = seen["message-id"][0]
        if seen["contentmd5"][0] == md5sum:
            return

    author = item.get("author", url)

    # create a basic email message
    msg = MIMEMultipart("alternative")
    messageid = "<" \
        + datetime.datetime.now().strftime("%Y%m%d%H%M") \
        + "." + _random_token(6) \
        + "@" + socket.gethostname() + ">"
    msg.add_header("Message-ID", messageid)
    msg.set_unixfrom("\"%s\" <rss2maildir@localhost>" %(url))
    msg.add_header("From", "\"%s\" <rss2maildir@localhost>" %(author))
    msg.add_header("To", "\"%s\" <rss2maildir@localhost>" %(url))
    if prevmessageid:
        msg.add_header("References", prevmessageid)
    createddate = datetime.datetime.now() \
        .strftime("%a, %e %b %Y %T -0000")
    try:
        createddate = datetime.datetime(*item["updated_parsed"][0:6]) \
            .strftime("%a, %e %b %Y %T -0000")
    except (KeyError, TypeError, ValueError):
        # no usable update time in the feed - keep "now"
        pass
    msg.add_header("Date", createddate)
    subj_gen = HTML2Text()
    subj_gen.feed(item.get("title", u"").encode("utf-8"))
    # The converter wraps its output and ends it with a newline; a header
    # value must be a single line.
    subject = u" ".join(subj_gen.gettext().split())
    msg.add_header("Subject", subject)
    msg.set_default_type("text/plain")

    htmlcontent = "%s\n\n<p>Item URL: <a href='%s'>%s</a></p>" %( \
        content, \
        link, \
        link )
    htmlpart = MIMEText(htmlcontent.encode("utf-8"), "html", "utf-8")
    textparser = HTML2Text()
    textparser.feed(content.encode("utf-8"))
    textcontent = textparser.gettext()
    textcontent = "%s\n\nItem URL: %s" %( \
        textcontent, \
        link )
    textpart = MIMEText(textcontent.encode("utf-8"), "plain", "utf-8")
    msg.attach(textpart)
    msg.attach(htmlpart)

    # start by working out the filename we should be writing to, we do
    # this following the normal maildir style rules
    import time  # local import: not among the file's top-level imports
    fname = str(os.getpid()) \
        + "." + socket.gethostname() \
        + "." + _random_token(10) \
        + "." + str(int(time.time()))
    fn = os.path.join(maildir, "tmp", fname)
    fh = open(fn, "w")
    try:
        fh.write(msg.as_string())
    finally:
        fh.close()
    # now move it in to the new directory
    newfn = os.path.join(maildir, "new", fname)
    os.link(fn, newfn)
    os.unlink(fn)

    # now add to the database about the item
    if prevmessageid:
        messageid = prevmessageid + " " + messageid
    record = urllib.urlencode(( \
        ("message-id", messageid), \
        ("created", createddate), \
        ("contentmd5", md5sum) \
        ))
    if item.has_key("guid") and item["guid"] != link:
        db[url + "|" + item["guid"]] = record
        # The link record keeps its original creation date and md5 when
        # it already exists; only the message-id chain is extended.
        linkrecord = record
        try:
            previous = cgi.parse_qs(db[url + "|" + link])
            linkrecord = urllib.urlencode(( \
                ("message-id", messageid), \
                ("created", previous["created"][0]), \
                ("contentmd5", previous["contentmd5"][0]) \
                ))
        except (KeyError, IndexError):
            pass
        db[url + "|" + link] = linkrecord
    else:
        db[url + "|" + link] = record


def parse_and_deliver(maildir, url, statedir):
    """Fetch the feed at `url` and deliver its new items into `maildir`.

    maildir  -- path of an existing Maildir (with new/ and tmp/).
    url      -- feed URL; also used as the key for all stored state.
    statedir -- directory holding the "feeds" and "seen" dbm databases.

    For a feed fetched before, a HEAD request is made first and the feed
    is skipped when its content-length, etag, last-modified and
    content-md5 headers all match the stored values.  A fetch failure is
    reported on stderr and the function returns without delivering.
    """
    feeddb = dbm.open(os.path.join(statedir, "feeds"), "c")
    try:
        # first check if we know about this feed already
        if feeddb.has_key(url):
            cached = cgi.parse_qs(feeddb[url])
            response = open_url("HEAD", url)
            headers = None
            if response:
                headers = response.getheaders()
            if not _feed_changed(headers, cached):
                return # don't need to do anything, nothings changed.

        response = open_url("GET", url)
        if response is None:
            sys.stderr.write("Failed to fetch feed: %s\n" %(url))
            return
        headers = response.getheaders()

        fp = feedparser.parse(response)
        db = dbm.open(os.path.join(statedir, "seen"), "c")
        try:
            for item in fp["items"]:
                _deliver_item(maildir, url, item, db)
        finally:
            db.close()

        # remember the validators so the next run can skip an unchanged feed
        if headers:
            validators = []
            for header in headers:
                if header[0] in \
                    ["content-md5", "etag", "last-modified", "content-length"]:
                    validators.append((header[0], header[1]))
            if len(validators) > 0:
                feeddb[url] = urllib.urlencode(validators)
    finally:
        feeddb.close()
767
768 if __name__ == "__main__":
769     # This only gets executed if we really called the program
770     # first off, parse the command line arguments
771
772     oparser = OptionParser()
773     oparser.add_option(
774         "-c", "--conf", dest="conf",
775         help="location of config file"
776         )
777     oparser.add_option(
778         "-s", "--statedir", dest="statedir",
779         help="location of directory to store state in"
780         )
781
782     (options, args) = oparser.parse_args()
783
784     # check for the configfile
785
786     configfile = None
787
788     if options.conf != None:
789         # does the file exist?
790         try:
791             os.stat(options.conf)
792             configfile = options.conf
793         except:
794             # should exit here as the specified file doesn't exist
795             sys.stderr.write( \
796                 "Config file %s does not exist. Exiting.\n" %(options.conf,))
797             sys.exit(2)
798     else:
799         # check through the default locations
800         try:
801             os.stat("%s/.rss2maildir.conf" %(os.environ["HOME"],))
802             configfile = "%s/.rss2maildir.conf" %(os.environ["HOME"],)
803         except:
804             try:
805                 os.stat("/etc/rss2maildir.conf")
806                 configfile = "/etc/rss2maildir.conf"
807             except:
808                 sys.stderr.write("No config file found. Exiting.\n")
809                 sys.exit(2)
810
811     # Right - if we've got this far, we've got a config file, now for the hard
812     # bits...
813
814     scp = SafeConfigParser()
815     scp.read(configfile)
816
817     maildir_root = "RSSMaildir"
818     state_dir = "state"
819
820     if options.statedir != None:
821         state_dir = options.statedir
822         try:
823             mode = os.stat(state_dir)[stat.ST_MODE]
824             if not stat.S_ISDIR(mode):
825                 sys.stderr.write( \
826                     "State directory (%s) is not a directory\n" %(state_dir))
827                 sys.exit(1)
828         except:
829             # try to make the directory
830             try:
831                 os.mkdir(state_dir)
832             except:
833                 sys.stderr.write("Couldn't create statedir %s" %(state_dir))
834                 sys.exit(1)
835     elif scp.has_option("general", "state_dir"):
836         new_state_dir = scp.get("general", "state_dir")
837         try:
838             mode = os.stat(new_state_dir)[stat.ST_MODE]
839             if not stat.S_ISDIR(mode):
840                 sys.stderr.write( \
841                     "State directory (%s) is not a directory\n" %(state_dir))
842                 sys.exit(1)
843             else:
844                 state_dir = new_state_dir
845         except:
846             # try to create it
847             try:
848                 os.mkdir(new_state_dir)
849                 state_dir = new_state_dir
850             except:
851                 sys.stderr.write( \
852                     "Couldn't create state directory %s\n" %(new_state_dir))
853                 sys.exit(1)
854     else:
855         try:
856             mode = os.stat(state_dir)[stat.ST_MODE]
857             if not stat.S_ISDIR(mode):
858                 sys.stderr.write( \
859                     "State directory %s is not a directory\n" %(state_dir))
860                 sys.exit(1)
861         except:
862             try:
863                 os.mkdir(state_dir)
864             except:
865                 sys.stderr.write( \
866                     "State directory %s could not be created\n" %(state_dir))
867                 sys.exit(1)
868
869     if scp.has_option("general", "maildir_root"):
870         maildir_root = scp.get("general", "maildir_root")
871
872     try:
873         mode = os.stat(maildir_root)[stat.ST_MODE]
874         if not stat.S_ISDIR(mode):
875             sys.stderr.write( \
876                 "Maildir Root %s is not a directory\n" \
877                 %(maildir_root))
878             sys.exit(1)
879     except:
880         try:
881             os.mkdir(maildir_root)
882         except:
883             sys.stderr.write("Couldn't create Maildir Root %s\n" \
884                 %(maildir_root))
885             sys.exit(1)
886
887     feeds = scp.sections()
888     try:
889         feeds.remove("general")
890     except:
891         pass
892
893     for section in feeds:
894         # check if the directory exists
895         maildir = None
896         try:
897             maildir = scp.get(section, "maildir")
898         except:
899             maildir = section
900
901         maildir = urllib.urlencode(((section, maildir),)).split("=")[1]
902         maildir = os.path.join(maildir_root, maildir)
903
904         try:
905             exists = os.stat(maildir)
906             if stat.S_ISDIR(exists[stat.ST_MODE]):
907                 # check if there's a new, cur and tmp directory
908                 try:
909                     mode = os.stat(os.path.join(maildir, "cur"))[stat.ST_MODE]
910                 except:
911                     os.mkdir(os.path.join(maildir, "cur"))
912                     if not stat.S_ISDIR(mode):
913                         sys.stderr.write("Broken maildir: %s\n" %(maildir))
914                 try:
915                     mode = os.stat(os.path.join(maildir, "tmp"))[stat.ST_MODE]
916                 except:
917                     os.mkdir(os.path.join(maildir, "tmp"))
918                     if not stat.S_ISDIR(mode):
919                         sys.stderr.write("Broken maildir: %s\n" %(maildir))
920                 try:
921                     mode = os.stat(os.path.join(maildir, "new"))[stat.ST_MODE]
922                     if not stat.S_ISDIR(mode):
923                         sys.stderr.write("Broken maildir: %s\n" %(maildir))
924                 except:
925                     os.mkdir(os.path.join(maildir, "new"))
926             else:
927                 sys.stderr.write("Broken maildir: %s\n" %(maildir))
928         except:
929             try:
930                 os.mkdir(maildir)
931             except:
932                 sys.stderr.write("Couldn't create root maildir %s\n" \
933                     %(maildir))
934                 sys.exit(1)
935             try:
936                 os.mkdir(os.path.join(maildir, "new"))
937                 os.mkdir(os.path.join(maildir, "cur"))
938                 os.mkdir(os.path.join(maildir, "tmp"))
939             except:
940                 sys.stderr.write( \
941                     "Couldn't create required maildir directories for %s\n" \
942                     %(section,))
943                 sys.exit(1)
944
945         # right - we've got the directories, we've got the section, we know the
946         # url... lets play!
947
948         parse_and_deliver(maildir, section, state_dir)