Fix blockquote support
[rss2maildir.git] / rss2maildir.py
1 #!/usr/bin/python
2 # coding=utf-8
3
4 # rss2maildir.py - RSS feeds to Maildir 1 email per item
5 # Copyright (C) 2007  Brett Parker <iDunno@sommitrealweird.co.uk>
6
7 # This program is free software: you can redistribute it and/or modify
8 # it under the terms of the GNU General Public License as published by
9 # the Free Software Foundation, either version 3 of the License, or
10 # (at your option) any later version.
11
12 # This program is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 # GNU General Public License for more details.
16
17 # You should have received a copy of the GNU General Public License
18 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
19
20 import sys
21 import os
22 import stat
23 import httplib
24 import urllib
25
26 import feedparser
27
28 from email.MIMEMultipart import MIMEMultipart
29 from email.MIMEText import MIMEText
30
31 import datetime
32 import random
33 import string
34 import textwrap
35
36 import socket
37
38 from optparse import OptionParser
39 from ConfigParser import SafeConfigParser
40
41 from base64 import b64encode
42 import md5
43
44 import cgi
45 import dbm
46
47 from HTMLParser import HTMLParser
48
class HTML2Text(HTMLParser):
    """Convert an HTML fragment into wrapped, reStructuredText-like text.

    Feed markup with ``feed()`` and collect the result with ``gettext()``.
    Headings are underlined, list items get ``*`` or numeric markers,
    links become anonymous references (```text`__`` with ``__ url``
    targets appended at the end) and images become ``|alt|`` substitutions
    with ``.. |alt| image:: url`` definitions appended at the end.

    Input may be UTF-8 encoded byte strings or already-decoded text.
    """

    # Named character references understood by handle_entityref().
    entities = {
        u'amp': u'&',
        u'lt': u'<',
        u'gt': u'>',
        u'pound': u'£',
        u'copy': u'©',
        u'apos': u'\'',
        u'quot': u'"',
        u'nbsp': u' ',
        u'ldquo': u'“',
        u'rdquo': u'”',
        u'lsquo': u'‘',
        u'rsquo': u'’',
        u'laquo': u'«',
        u'raquo': u'»',
        u'lsaquo': u'‹',
        u'rsaquo': u'›',
        u'bull': u'•',
        u'middot': u'·',
        u'deg': u'°',
        u'hellip': u'…',
        u'trade': u'™',
        u'reg': u'®',
        u'agrave': u'à',
        u'Agrave': u'À',
        u'egrave': u'è',
        u'Egrave': u'È',
        u'igrave': u'ì',
        u'Igrave': u'Ì',
        u'ograve': u'ò',
        u'Ograve': u'Ò',
        u'ugrave': u'ù',
        u'Ugrave': u'Ù',
        u'aacute': u'á',
        u'Aacute': u'Á',
        u'eacute': u'é',
        u'Eacute': u'É',
        u'iacute': u'í',
        u'Iacute': u'Í',
        u'oacute': u'ó',
        u'Oacute': u'Ó',
        u'uacute': u'ú',
        u'Uacute': u'Ú',
        u'yacute': u'ý',
        u'Yacute': u'Ý',
        u'acirc': u'â',
        u'Acirc': u'Â',
        u'ecirc': u'ê',
        u'Ecirc': u'Ê',
        u'icirc': u'î',
        u'Icirc': u'Î',
        u'ocirc': u'ô',
        u'Ocirc': u'Ô',
        u'ucirc': u'û',
        u'Ucirc': u'Û',
        u'atilde': u'ã',
        u'Atilde': u'Ã',
        u'ntilde': u'ñ',
        u'Ntilde': u'Ñ',
        u'otilde': u'õ',
        u'Otilde': u'Õ',
        u'auml': u'ä',
        u'Auml': u'Ä',
        u'euml': u'ë',
        u'Euml': u'Ë',
        u'iuml': u'ï',
        u'Iuml': u'Ï',
        u'ouml': u'ö',
        u'Ouml': u'Ö',
        u'uuml': u'ü',
        u'Uuml': u'Ü',
        u'yuml': u'ÿ',
        u'Yuml': u'Ÿ',
        u'iexcl': u'¡',
        u'iquest': u'¿',
        u'ccedil': u'ç',
        u'Ccedil': u'Ç',
        u'oelig': u'œ',
        u'OElig': u'Œ',
        u'szlig': u'ß',
        u'oslash': u'ø',
        u'Oslash': u'Ø',
        u'aring': u'å',
        u'Aring': u'Å',
        u'aelig': u'æ',
        u'AElig': u'Æ',
        u'thorn': u'þ',
        u'THORN': u'Þ',
        u'eth': u'ð',
        u'ETH': u'Ð',
        u'mdash': u'—',
        u'ndash': u'–',
        u'sect': u'§',
        u'para': u'¶',
        u'uarr': u'↑',
        u'darr': u'↓',
        u'larr': u'←',
        u'rarr': u'→',
        u'dagger': u'†',
        u'Dagger': u'‡',
        u'permil': u'‰',
        u'prod': u'∏',
        u'infin': u'∞',
        u'radic': u'√',
        u'there4': u'∴',
        u'int': u'∫',
        u'asymp': u'≈',
        u'ne': u'≠',
        u'equiv': u'≡',
        u'le': u'≤',
        u'ge': u'≥',
        u'loz': u'◊',
        u'sum': u'∑',
        u'part': u'∂',
        u'prime': u'′',
        u'Prime': u'″',
        u'harr': u'↔',
        u'micro': u'µ',
        u'not': u'¬',
        u'plusmn': u'±',
        u'divide': u'÷',
        u'cent': u'¢',
        u'euro': u'€',
        }

    # Tags that start a new block of output.
    blockleveltags = [
        u'h1',
        u'h2',
        u'h3',
        u'h4',
        u'h5',
        u'h6',
        u'pre',
        u'p',
        u'ul',
        u'ol',
        u'dl',
        u'li',
        u'dt',
        u'dd',
        u'div',
        u'blockquote',
        ]

    # Tags that open a list and therefore affect indentation.
    liststarttags = [
        u'ul',
        u'ol',
        u'dl',
        ]

    # Block tags that stay open when another block starts inside them.
    cancontainflow = [
        u'div',
        u'li',
        u'dd',
        u'blockquote',
    ]

    # The whitespace characters a byte-string strip()/split() recognises;
    # used so that non-breaking spaces and the like survive normalisation.
    _ascii_whitespace = u' \t\n\x0b\x0c\r'

    def __init__(self,textwidth=70):
        """Create a converter that wraps its output at *textwidth* columns."""
        self.text = u''
        self.curdata = u''
        self.textwidth = textwidth
        self.opentags = []
        self.indentlevel = 0
        self.ignorenodata = False
        self.listcount = []
        self.urls = []
        self.images = {}
        HTMLParser.__init__(self)

    @staticmethod
    def _to_unicode(value):
        """Return *value* as text; byte strings are decoded as UTF-8.

        ``None`` (an attribute written without a value) becomes ``u''``.
        """
        if value is None:
            return u''
        if isinstance(value, type(u'')):
            return value
        return value.decode('utf-8')

    @staticmethod
    def _unichr(codepoint):
        """Return the character for *codepoint* on Python 2 or Python 3."""
        try:
            return unichr(codepoint)
        except NameError:
            return chr(codepoint)

    @staticmethod
    def _collapse_whitespace(text):
        """Join the ASCII-whitespace separated words of *text* with spaces."""
        for char in u'\t\n\x0b\x0c\r':
            text = text.replace(char, u' ')
        return u' '.join([word for word in text.split(u' ') if word])

    def handle_starttag(self, tag, attrs):
        """Open *tag*, flushing pending data when a new block starts."""
        tag_name = tag.lower()
        if tag_name in self.blockleveltags:
            # handle starting a new block - unless we're in a block element
            # that can contain other blocks, we'll assume that we want to
            # close the container
            if len(self.opentags) > 1 and self.opentags[-1] == u'li':
                self.handle_curdata()

            if tag_name == u'ol':
                self.handle_curdata()
                self.listcount.append(1)
                self.listlevel = len(self.listcount) - 1

            if tag_name == u'dl':
                self.indentlevel = self.indentlevel + 4

            if tag_name in self.liststarttags:
                # a list nested in another list is indented by the width of
                # the enclosing list's marker
                enclosing = self.opentags[-3:-1]
                enclosing.reverse()
                for prev_listtag in enclosing:
                    if prev_listtag in [u'dl', u'ol']:
                        self.indentlevel = self.indentlevel + 4
                        break
                    elif prev_listtag == u'ul':
                        self.indentlevel = self.indentlevel + 3
                        break

            if len(self.opentags) > 0:
                self.handle_curdata()
                if tag_name not in self.cancontainflow:
                    self.opentags.pop()
            self.opentags.append(tag_name)
        elif tag_name == u'a':
            for attr in attrs:
                if attr[0].lower() == u'href':
                    self.urls.append(self._to_unicode(attr[1]))
            self.curdata = self.curdata + u'`'
            self.opentags.append(tag_name)
        elif tag_name == u'img':
            self.handle_image(attrs)
        elif tag_name == u'br':
            self.handle_br()
        # any other inline tag (including span) is ignored; its text still
        # arrives through handle_data()

    def handle_startendtag(self, tag, attrs):
        """Handle self-closing ``<br/>`` and ``<img/>`` tags."""
        if tag.lower() == u'br':
            self.handle_br()
        elif tag.lower() == u'img':
            self.handle_image(attrs)

    def handle_br(self):
        """Flush pending data, then process a line break as a pseudo tag."""
        self.handle_curdata()
        self.opentags.append(u'br')
        self.handle_curdata()
        self.opentags.pop()

    def handle_image(self, attrs):
        """Append a ``|name|`` substitution for an image to the pending data.

        The name is the alt text, or the URL when there is no alt text.
        When the name is already bound to a different URL, underscores are
        appended until a free name, or one already bound to this URL, is
        found.  Images without a ``src`` are ignored.
        """
        alt = u''
        url = u''
        for attr in attrs:
            if attr[0] == 'alt':
                alt = self._to_unicode(attr[1])
            elif attr[0] == 'src':
                url = self._to_unicode(attr[1])
        if not url:
            return

        name = alt or url
        while name in self.images and self.images[name]["url"] != url:
            name = name + u'_'
        self.images.setdefault(name, {"url": url})
        self.curdata = self.curdata + u'|%s|' %(name,)

    def handle_curdata(self):
        """Render the pending data according to the innermost open tag.

        Block-level tags append formatted text to ``self.text`` and clear
        the pending data; ``a`` only closes the link markup and leaves the
        data pending for the enclosing block.
        """
        if len(self.opentags) == 0:
            return

        tag_thats_done = self.opentags[-1]

        if len(self.curdata) == 0:
            return

        if tag_thats_done == u'br':
            if len(self.text) == 0 or self.text[-1] != u'\n':
                self.text = self.text + u'\n'
                self.ignorenodata = True
            return

        if len(self.curdata.strip()) == 0:
            return

        if tag_thats_done in self.blockleveltags:
            newlinerequired = self.text != u''
            if self.ignorenodata:
                newlinerequired = False
            self.ignorenodata = False
            if newlinerequired:
                # list-like items are separated by one newline, every other
                # block by a blank line
                if tag_thats_done in [u'dt', u'dd', u'li'] \
                    and len(self.text) > 1 \
                    and self.text[-1] != u'\n':
                    self.text = self.text + u'\n'
                elif len(self.text) > 2 \
                    and self.text[-1] != u'\n' \
                    and self.text[-2] != u'\n':
                    self.text = self.text + u'\n\n'

        if tag_thats_done in [u'h1', u'h2', u'h3', u'h4', u'h5', u'h6']:
            headingtext = u' '.join(self.curdata.split())
            seperator = u'\n' + u' ' * self.indentlevel
            headingtext = seperator.join(
                textwrap.wrap(
                    headingtext,
                    self.textwidth - self.indentlevel))

            if tag_thats_done == u'h1':
                underlinechar = u'='
            elif tag_thats_done == u'h2':
                underlinechar = u'-'
            else:
                underlinechar = u'~'

            # a wrapped heading is underlined across the full text width
            if u'\n' in headingtext:
                underline = u' ' * self.indentlevel \
                    + underlinechar * (self.textwidth - self.indentlevel)
            else:
                underline = u' ' * self.indentlevel \
                    + underlinechar * len(headingtext)
            self.text = self.text \
                + headingtext + u'\n' \
                + underline
        elif tag_thats_done in [u'p', u'div']:
            paragraph = self._collapse_whitespace(self.curdata.strip())
            seperator = u'\n' + u' ' * self.indentlevel
            self.text = self.text \
                + u' ' * self.indentlevel \
                + seperator.join(
                    textwrap.wrap(
                        paragraph, self.textwidth - self.indentlevel))
        elif tag_thats_done == u'pre':
            self.text = self.text + self.curdata
        elif tag_thats_done == u'blockquote':
            quote = self._collapse_whitespace(self.curdata)
            seperator = u'\n' + u' ' * self.indentlevel + u'> '
            if len(self.text) > 0 and self.text[-1] != u'\n':
                self.text = self.text + u'\n'
            # NOTE(review): the first line is prefixed with four spaces
            # while continuation lines get "> "; kept as found - confirm
            # which prefix is intended.
            self.text = self.text \
                + u'    ' \
                + seperator.join(
                    textwrap.wrap(
                        quote,
                        self.textwidth - self.indentlevel - 2))
            self.curdata = u''
        elif tag_thats_done == u'li':
            item = self.curdata.strip(self._ascii_whitespace)
            if len(self.text) > 0 and self.text[-1] != u'\n':
                self.text = self.text + u'\n'
            # work out if we're in an ol rather than a ul
            latesttags = self.opentags[-4:]
            latesttags.reverse()
            isul = None
            for thing in latesttags:
                if thing == u'ul':
                    isul = True
                    break
                elif thing == u'ol':
                    isul = False
                    break

            listindent = 3
            if not isul:
                listindent = 4

            listmarker = u' * '
            if isul == False:
                listmarker = u' %2d. ' %(self.listcount[-1])
                self.listcount[-1] = self.listcount[-1] + 1

            seperator = u'\n' \
                + u' ' * self.indentlevel \
                + u' ' * listindent
            self.text = self.text \
                + u' ' * self.indentlevel \
                + listmarker \
                + seperator.join(
                    textwrap.wrap(
                        item,
                        self.textwidth - self.indentlevel - listindent))
            self.curdata = u''
        elif tag_thats_done == u'dt':
            definition = self._collapse_whitespace(self.curdata)
            if len(self.text) > 0 and self.text[-1] != u'\n':
                self.text = self.text + u'\n\n'
            elif len(self.text) > 1 and self.text[-2] != u'\n':
                self.text = self.text + u'\n'
            definition = u' ' * (self.indentlevel - 4) + definition + u'::'
            indentstring = u'\n' + u' ' * (self.indentlevel - 3)
            self.text = self.text \
                + indentstring.join(
                    textwrap.wrap(definition,
                        self.textwidth - self.indentlevel - 4))
            self.curdata = u''
        elif tag_thats_done == u'dd':
            definition = self._collapse_whitespace(self.curdata)
            if len(definition) > 0:
                if len(self.text) > 0 and self.text[-1] != u'\n':
                    self.text = self.text + u'\n'
                indentstring = u'\n' + u' ' * self.indentlevel
                self.text = self.text \
                    + indentstring \
                    + indentstring.join(
                        textwrap.wrap(
                            definition,
                            self.textwidth - self.indentlevel))
                self.curdata = u''
        elif tag_thats_done == u'a':
            # close the anonymous link; the text stays pending for the
            # enclosing block
            self.curdata = self.curdata + u'`__'

        if tag_thats_done in self.blockleveltags:
            self.curdata = u''

        self.ignorenodata = False

    def handle_endtag(self, tag):
        """Close *tag*, rendering whatever data is still pending.

        End tags with no matching open tag are ignored.
        """
        self.ignorenodata = False
        tag = tag.lower()
        if tag == u'span':
            return

        if tag not in self.opentags:
            return

        if tag in [u'br', u'img']:
            return

        # index of the innermost open occurrence of this tag
        tagindex = len(self.opentags) - 1 - self.opentags[::-1].index(tag)

        if tag == u'dl':
            self.indentlevel = self.indentlevel - 4

        if tag in self.liststarttags:
            self.handle_curdata()
            # undo the indent added when this list was opened inside
            # another list
            enclosing = self.opentags[:-1]
            enclosing.reverse()
            for prev_listtag in enclosing:
                if prev_listtag in [u'ol', u'dl']:
                    self.indentlevel = self.indentlevel - 4
                    break
                elif prev_listtag == u'ul':
                    self.indentlevel = self.indentlevel - 3
                    break

        if tag == u'ol':
            self.listcount = self.listcount[:-1]

        # Assume the pending data belongs to the last opened tag
        self.handle_curdata()
        if tagindex != len(self.opentags) - 1:
            # Drop every tag opened after this one.
            # NOTE(review): the slice keeps the closed tag itself on the
            # stack; kept as found - confirm whether [:tagindex] was meant.
            self.opentags = self.opentags[:tagindex + 1]
        else:
            self.opentags.pop()

    def handle_data(self, data):
        """Collect text, opening an implicit paragraph when none is open."""
        if len(self.opentags) == 0:
            self.opentags.append(u'p')
        self.curdata = self.curdata + self._to_unicode(data)

    def handle_entityref(self, name):
        """Append the character for ``&name;``.

        Unknown names are passed through literally as ``&name;``.
        """
        if name in self.entities:
            entity = self.entities[name]
        elif name.startswith('#'):
            # the parser reports these through handle_charref(); kept for
            # callers that pass them here directly
            self.handle_charref(name[1:])
            return
        else:
            entity = u'&' + self._to_unicode(name) + u';'

        self.curdata = self.curdata + entity

    def handle_charref(self, name):
        """Append the character for a decimal or hex ``&#...;`` reference.

        References that cannot be converted are passed through literally.
        """
        try:
            if name[:1] in ('x', 'X'):
                codepoint = int(name[1:], 16)
            else:
                codepoint = int(name)
            char = self._unichr(codepoint)
        except (ValueError, OverflowError):
            char = u'&#' + self._to_unicode(name) + u';'
        self.curdata = self.curdata + char

    def gettext(self):
        """Finish the conversion and return the rendered text.

        Link targets and image definitions are appended after the body and
        then forgotten, along with the stack of open tags.
        """
        # inline data with no enclosing block (a bare link or image) would
        # otherwise be dropped; render it as a paragraph
        if len(self.opentags) == 0 and len(self.curdata.strip()) > 0:
            self.opentags.append(u'p')
        self.handle_curdata()
        if len(self.text) == 0 or self.text[-1] != u'\n':
            self.text = self.text + u'\n'
        self.opentags = []
        # reduce any run of trailing newlines to exactly one
        while len(self.text) > 1 and self.text[-1] == u'\n':
            self.text = self.text[:-1]
        self.text = self.text + u'\n'
        if len(self.urls) > 0:
            self.text = self.text + u'\n__ ' + u'\n__ '.join(self.urls) + u'\n'
            self.urls = []
        if len(self.images) > 0:
            # sorted so the output does not depend on dict ordering
            self.text = self.text + u'\n.. ' \
                + u'\n.. '.join(
                    [u'|%s| image:: %s' %(name, self.images[name]["url"])
                     for name in sorted(self.images)]) + u'\n'
            self.images = {}
        return self.text
590
def open_url(method, url):
    """Issue an HTTP request for *url* and return the response on a 200.

    *method* is the HTTP method to use (the callers in this file pass
    "HEAD" or "GET").  Redirects (301, 302, 303, 307) are followed, with
    relative ``Location`` values resolved against the current URL.  At most
    three requests are made in total; any other outcome, including a
    network error, counts as one attempt.

    Returns the ``httplib`` response object, or None when no attempt
    produced a 200 or when the URL has no usable host.
    """
    from urlparse import urljoin

    redirectcount = 0
    while redirectcount < 3:
        (scheme, rest) = urllib.splittype(url)
        (host, path) = urllib.splithost(rest)
        if not host:
            # not an absolute URL - there is nothing to connect to
            return None
        (host, port) = urllib.splitport(host)

        if scheme == "https":
            connection_class = getattr(httplib, "HTTPSConnection", None)
            if connection_class is None:
                # this Python was built without SSL support
                return None
            default_port = 443
        else:
            connection_class = httplib.HTTPConnection
            default_port = 80
        if port is None:
            port = default_port

        conn = None
        try:
            conn = connection_class("%s:%s" %(host, port))
            # "http://host" has an empty path, which is not a valid
            # request target
            conn.request(method, path or "/")
            response = conn.getresponse()
            if response.status == 200:
                # the caller reads from the response, so the connection
                # has to stay open
                return response
            if response.status in [301, 302, 303, 307]:
                location = response.getheader("location")
                if location:
                    url = urljoin(url, location)
        except (httplib.HTTPException, socket.error):
            # best effort: treat a failed request as a used-up attempt
            pass
        if conn is not None:
            conn.close()
        redirectcount = redirectcount + 1
    return None
614
def _feed_changed(headers, known):
    """Decide from HEAD response headers whether a feed needs refetching.

    *headers* is a list of (name, value) pairs, or None when the HEAD
    request failed; *known* is the parsed record stored on the last fetch.
    The feed counts as unchanged only when at least one of the tracked
    headers is present and every tracked header matches its stored value.
    """
    if not headers:
        return True
    compared = False
    for (name, value) in headers:
        if name not in \
            ["content-length", "etag", "last-modified", "content-md5"]:
            continue
        if name not in known or value != known[name][0]:
            return True
        compared = True
    # with nothing to compare we cannot tell, so fetch rather than skip
    # the feed forever
    return not compared


def _random_token(length):
    """Return *length* random ASCII letters and digits."""
    return "".join( \
        [random.choice(string.ascii_letters + string.digits) \
            for unused in range(0, length)])


def _state_key(url, ident):
    """Build the "seen" database key for item *ident* of feed *url*."""
    key = url + "|" + ident
    if isinstance(key, unicode):
        # the database wants byte strings; plain ASCII keys are unaffected
        key = key.encode("utf-8")
    return key


def _deliver_item(maildir, url, db, item):
    """Write one feed item into *maildir* unless it was already delivered.

    *db* is the open "seen" database.  An item is skipped when the record
    stored for its guid or link carries the same content checksum.  A
    changed item is delivered again with a References header naming the
    earlier message(s).
    """
    # need to work out what the content is first...
    if item.has_key("content"):
        content = item["content"][0]["value"]
    else:
        content = item.get("summary", u"")

    md5sum = md5.md5(content.encode("utf-8")).hexdigest()

    guid = item.get("guid")
    link = item.get("link") or guid or u""

    prevmessageid = None

    # check if there's a guid too - if that exists and we match the md5,
    # there is nothing to do
    guid_key = None
    if guid is not None:
        guid_key = _state_key(url, guid)
        if db.has_key(guid_key):
            data = cgi.parse_qs(db[guid_key])
            if data.get("contentmd5", [None])[0] == md5sum:
                return

    link_key = _state_key(url, link)
    if db.has_key(link_key):
        data = cgi.parse_qs(db[link_key])
        if data.has_key("message-id"):
            prevmessageid = data["message-id"][0]
        if data.get("contentmd5", [None])[0] == md5sum:
            return

    author = item.get("author", url)

    # create a basic email message
    msg = MIMEMultipart("alternative")
    messageid = "<" \
        + datetime.datetime.now().strftime("%Y%m%d%H%M") \
        + "." + _random_token(6) \
        + "@" + socket.gethostname() + ">"
    msg.add_header("Message-ID", messageid)
    msg.set_unixfrom("\"%s\" <rss2maildir@localhost>" %(url))
    msg.add_header("From", "\"%s\" <rss2maildir@localhost>" %(author))
    msg.add_header("To", "\"%s\" <rss2maildir@localhost>" %(url))
    if prevmessageid:
        msg.add_header("References", prevmessageid)

    dateformat = "%a, %e %b %Y %T -0000"
    try:
        createddate = datetime.datetime(*item["updated_parsed"][0:6]) \
            .strftime(dateformat)
    except (KeyError, TypeError, ValueError):
        # no usable timestamp on the item; the header claims -0000, so the
        # fallback has to be UTC rather than local time
        createddate = datetime.datetime.utcnow().strftime(dateformat)
    msg.add_header("Date", createddate)

    subj_gen = HTML2Text()
    subj_gen.feed(item.get("title", u"").encode("utf-8"))
    # the converter wraps its output and ends it with a newline; a subject
    # is a single line
    msg.add_header("Subject", u" ".join(subj_gen.gettext().split()))
    msg.set_default_type("text/plain")

    htmlcontent = "%s\n\n<p>Item URL: <a href='%s'>%s</a></p>" %( \
        content, \
        link, \
        link )
    htmlpart = MIMEText(htmlcontent.encode("utf-8"), "html", "utf-8")
    textparser = HTML2Text()
    textparser.feed(content.encode("utf-8"))
    textcontent = textparser.gettext()
    textcontent = "%s\n\nItem URL: %s" %( \
        textcontent, \
        link )
    textpart = MIMEText(textcontent.encode("utf-8"), "plain", "utf-8")
    msg.attach(textpart)
    msg.attach(htmlpart)

    # start by working out the filename we should be writing to, we do
    # this following the normal maildir style rules
    fname = str(os.getpid()) \
        + "." + socket.gethostname() \
        + "." + _random_token(10) \
        + "." + datetime.datetime.now().strftime('%s')
    fn = os.path.join(maildir, "tmp", fname)
    fh = open(fn, "w")
    try:
        fh.write(msg.as_string())
    finally:
        fh.close()
    # now move it in to the new directory
    newfn = os.path.join(maildir, "new", fname)
    os.link(fn, newfn)
    os.unlink(fn)

    # now add to the database about the item
    if prevmessageid:
        messageid = prevmessageid + " " + messageid
    data = urllib.urlencode(( \
        ("message-id", messageid), \
        ("created", createddate), \
        ("contentmd5", md5sum) \
        ))
    if guid is not None and guid != link:
        db[guid_key] = data
        # keep the link record's original creation date and checksum, but
        # extend its message-id chain; with no earlier link record, store
        # the new record as is
        try:
            olddata = cgi.parse_qs(db[link_key])
            linkdata = urllib.urlencode(( \
                ("message-id", messageid), \
                ("created", olddata["created"][0]), \
                ("contentmd5", olddata["contentmd5"][0]) \
                ))
        except (KeyError, IndexError):
            linkdata = data
        db[link_key] = linkdata
    else:
        db[link_key] = data


def parse_and_deliver(maildir, url, statedir):
    """Fetch the feed at *url* and deliver its new items into *maildir*.

    State lives in two dbm databases under *statedir*: "feeds" records the
    validator headers of the last fetch of each feed, "seen" records the
    items already delivered.  A feed fetched before is first probed with a
    HEAD request and skipped when its validators are unchanged.  A failed
    fetch is reported on stderr.  Both databases are closed on every path
    out of the function.
    """
    feeddb = dbm.open(os.path.join(statedir, "feeds"), "c")
    try:
        # first check if we know about this feed already
        if feeddb.has_key(url):
            known = cgi.parse_qs(feeddb[url])
            response = open_url("HEAD", url)
            headers = None
            if response:
                headers = response.getheaders()
            if not _feed_changed(headers, known):
                return # don't need to do anything, nothings changed.

        response = open_url("GET", url)
        if response is None:
            sys.stderr.write("Failed to fetch feed: %s\n" %(url))
            return
        headers = response.getheaders()

        fp = feedparser.parse(response)
        db = dbm.open(os.path.join(statedir, "seen"), "c")
        try:
            for item in fp["items"]:
                _deliver_item(maildir, url, db, item)
        finally:
            db.close()

        # remember the validators so the next run can skip an unchanged
        # feed
        data = []
        for header in headers:
            if header[0] in \
                ["content-md5", "etag", "last-modified", "content-length"]:
                data.append((header[0], header[1]))
        if len(data) > 0:
            feeddb[url] = urllib.urlencode(data)
    finally:
        feeddb.close()
805
806 if __name__ == "__main__":
807     # This only gets executed if we really called the program
808     # first off, parse the command line arguments
809
810     oparser = OptionParser()
811     oparser.add_option(
812         "-c", "--conf", dest="conf",
813         help="location of config file"
814         )
815     oparser.add_option(
816         "-s", "--statedir", dest="statedir",
817         help="location of directory to store state in"
818         )
819
820     (options, args) = oparser.parse_args()
821
822     # check for the configfile
823
824     configfile = None
825
826     if options.conf != None:
827         # does the file exist?
828         try:
829             os.stat(options.conf)
830             configfile = options.conf
831         except:
832             # should exit here as the specified file doesn't exist
833             sys.stderr.write( \
834                 "Config file %s does not exist. Exiting.\n" %(options.conf,))
835             sys.exit(2)
836     else:
837         # check through the default locations
838         try:
839             os.stat("%s/.rss2maildir.conf" %(os.environ["HOME"],))
840             configfile = "%s/.rss2maildir.conf" %(os.environ["HOME"],)
841         except:
842             try:
843                 os.stat("/etc/rss2maildir.conf")
844                 configfile = "/etc/rss2maildir.conf"
845             except:
846                 sys.stderr.write("No config file found. Exiting.\n")
847                 sys.exit(2)
848
849     # Right - if we've got this far, we've got a config file, now for the hard
850     # bits...
851
852     scp = SafeConfigParser()
853     scp.read(configfile)
854
855     maildir_root = "RSSMaildir"
856     state_dir = "state"
857
858     if options.statedir != None:
859         state_dir = options.statedir
860         try:
861             mode = os.stat(state_dir)[stat.ST_MODE]
862             if not stat.S_ISDIR(mode):
863                 sys.stderr.write( \
864                     "State directory (%s) is not a directory\n" %(state_dir))
865                 sys.exit(1)
866         except:
867             # try to make the directory
868             try:
869                 os.mkdir(state_dir)
870             except:
871                 sys.stderr.write("Couldn't create statedir %s" %(state_dir))
872                 sys.exit(1)
873     elif scp.has_option("general", "state_dir"):
874         new_state_dir = scp.get("general", "state_dir")
875         try:
876             mode = os.stat(new_state_dir)[stat.ST_MODE]
877             if not stat.S_ISDIR(mode):
878                 sys.stderr.write( \
879                     "State directory (%s) is not a directory\n" %(state_dir))
880                 sys.exit(1)
881             else:
882                 state_dir = new_state_dir
883         except:
884             # try to create it
885             try:
886                 os.mkdir(new_state_dir)
887                 state_dir = new_state_dir
888             except:
889                 sys.stderr.write( \
890                     "Couldn't create state directory %s\n" %(new_state_dir))
891                 sys.exit(1)
892     else:
893         try:
894             mode = os.stat(state_dir)[stat.ST_MODE]
895             if not stat.S_ISDIR(mode):
896                 sys.stderr.write( \
897                     "State directory %s is not a directory\n" %(state_dir))
898                 sys.exit(1)
899         except:
900             try:
901                 os.mkdir(state_dir)
902             except:
903                 sys.stderr.write( \
904                     "State directory %s could not be created\n" %(state_dir))
905                 sys.exit(1)
906
907     if scp.has_option("general", "maildir_root"):
908         maildir_root = scp.get("general", "maildir_root")
909
910     try:
911         mode = os.stat(maildir_root)[stat.ST_MODE]
912         if not stat.S_ISDIR(mode):
913             sys.stderr.write( \
914                 "Maildir Root %s is not a directory\n" \
915                 %(maildir_root))
916             sys.exit(1)
917     except:
918         try:
919             os.mkdir(maildir_root)
920         except:
921             sys.stderr.write("Couldn't create Maildir Root %s\n" \
922                 %(maildir_root))
923             sys.exit(1)
924
925     feeds = scp.sections()
926     try:
927         feeds.remove("general")
928     except:
929         pass
930
931     for section in feeds:
932         # check if the directory exists
933         maildir = None
934         try:
935             maildir = scp.get(section, "maildir")
936         except:
937             maildir = section
938
939         maildir = urllib.urlencode(((section, maildir),)).split("=")[1]
940         maildir = os.path.join(maildir_root, maildir)
941
942         try:
943             exists = os.stat(maildir)
944             if stat.S_ISDIR(exists[stat.ST_MODE]):
945                 # check if there's a new, cur and tmp directory
946                 try:
947                     mode = os.stat(os.path.join(maildir, "cur"))[stat.ST_MODE]
948                 except:
949                     os.mkdir(os.path.join(maildir, "cur"))
950                     if not stat.S_ISDIR(mode):
951                         sys.stderr.write("Broken maildir: %s\n" %(maildir))
952                 try:
953                     mode = os.stat(os.path.join(maildir, "tmp"))[stat.ST_MODE]
954                 except:
955                     os.mkdir(os.path.join(maildir, "tmp"))
956                     if not stat.S_ISDIR(mode):
957                         sys.stderr.write("Broken maildir: %s\n" %(maildir))
958                 try:
959                     mode = os.stat(os.path.join(maildir, "new"))[stat.ST_MODE]
960                     if not stat.S_ISDIR(mode):
961                         sys.stderr.write("Broken maildir: %s\n" %(maildir))
962                 except:
963                     os.mkdir(os.path.join(maildir, "new"))
964             else:
965                 sys.stderr.write("Broken maildir: %s\n" %(maildir))
966         except:
967             try:
968                 os.mkdir(maildir)
969             except:
970                 sys.stderr.write("Couldn't create root maildir %s\n" \
971                     %(maildir))
972                 sys.exit(1)
973             try:
974                 os.mkdir(os.path.join(maildir, "new"))
975                 os.mkdir(os.path.join(maildir, "cur"))
976                 os.mkdir(os.path.join(maildir, "tmp"))
977             except:
978                 sys.stderr.write( \
979                     "Couldn't create required maildir directories for %s\n" \
980                     %(section,))
981                 sys.exit(1)
982
983         # right - we've got the directories, we've got the section, we know the
984         # url... lets play!
985
986         parse_and_deliver(maildir, section, state_dir)