Fix some entity handling
[rss2maildir.git] / rss2maildir.py
1 #!/usr/bin/python
2 # coding=utf-8
3
4 # rss2maildir.py - RSS feeds to Maildir 1 email per item
5 # Copyright (C) 2007  Brett Parker <iDunno@sommitrealweird.co.uk>
6
7 # This program is free software: you can redistribute it and/or modify
8 # it under the terms of the GNU General Public License as published by
9 # the Free Software Foundation, either version 3 of the License, or
10 # (at your option) any later version.
11
12 # This program is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 # GNU General Public License for more details.
16
17 # You should have received a copy of the GNU General Public License
18 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
19
20 import sys
21 import os
22 import stat
23 import httplib
24 import urllib
25
26 import feedparser
27
28 from email.MIMEMultipart import MIMEMultipart
29 from email.MIMEText import MIMEText
30
31 import datetime
32 import random
33 import string
34 import textwrap
35
36 import socket
37
38 from optparse import OptionParser
39 from ConfigParser import SafeConfigParser
40
41 from base64 import b64encode
42 import md5
43
44 import cgi
45 import dbm
46
47 from HTMLParser import HTMLParser
48
class HTML2Text(HTMLParser):
    """Convert an HTML fragment into wrapped plain text.

    Feed HTML with feed() and collect the result with gettext().  Block
    level elements are wrapped to ``textwidth`` columns, links are written
    as anonymous reStructuredText style references (```text`__``) whose
    targets are listed after the text, and images become ``|name|``
    substitutions that are listed after the link targets.
    """

    # Named character references that handle_entityref() knows how to
    # replace.  Every value must be a unicode string, as the text being
    # built is unicode throughout.
    entities = {
        u'amp': u'&',
        u'lt': u'<',
        u'gt': u'>',
        u'pound': u'£',
        u'copy': u'©',
        u'apos': u'\'',
        u'quot': u'"',
        u'nbsp': u' ',
        u'ldquo': u'“',
        u'rdquo': u'”',
        u'lsquo': u'‘',
        u'rsquo': u'’',
        u'laquo': u'«',
        u'raquo': u'»',
        u'lsaquo': u'‹',
        u'rsaquo': u'›',
        u'bull': u'•',
        u'middot': u'·',
        u'deg': u'°',
        u'hellip': u'…',
        u'trade': u'™',
        u'reg': u'®',
        u'agrave': u'à',
        u'Agrave': u'À',
        u'egrave': u'è',
        u'Egrave': u'È',
        u'igrave': u'ì',
        u'Igrave': u'Ì',
        u'ograve': u'ò',
        u'Ograve': u'Ò',
        u'ugrave': u'ù',
        u'Ugrave': u'Ù',
        u'aacute': u'á',
        u'Aacute': u'Á',
        u'eacute': u'é',
        u'Eacute': u'É',
        u'iacute': u'í',
        u'Iacute': u'Í',
        u'oacute': u'ó',
        u'Oacute': u'Ó',
        u'uacute': u'ú',
        u'Uacute': u'Ú',
        u'yacute': u'ý',
        u'Yacute': u'Ý',
        u'acirc': u'â',
        u'Acirc': u'Â',
        u'ecirc': u'ê',
        u'Ecirc': u'Ê',
        u'icirc': u'î',
        u'Icirc': u'Î',
        u'ocirc': u'ô',
        u'Ocirc': u'Ô',
        u'ucirc': u'û',
        u'Ucirc': u'Û',
        u'atilde': u'ã',
        u'Atilde': u'Ã',
        u'ntilde': u'ñ',
        u'Ntilde': u'Ñ',
        u'otilde': u'õ',
        u'Otilde': u'Õ',
        u'auml': u'ä',
        u'Auml': u'Ä',
        u'euml': u'ë',
        u'Euml': u'Ë',
        u'iuml': u'ï',
        u'Iuml': u'Ï',
        u'ouml': u'ö',
        u'Ouml': u'Ö',
        u'uuml': u'ü',
        u'Uuml': u'Ü',
        u'yuml': u'ÿ',
        u'Yuml': u'Ÿ',
        u'iexcl': u'¡',
        u'iquest': u'¿',
        u'ccedil': u'ç',
        u'Ccedil': u'Ç',
        u'oelig': u'œ',
        u'OElig': u'Œ',
        u'szlig': u'ß',
        u'oslash': u'ø',
        u'Oslash': u'Ø',
        u'aring': u'å',
        u'Aring': u'Å',
        u'aelig': u'æ',
        u'AElig': u'Æ',
        u'thorn': u'þ',
        u'THORN': u'Þ',
        u'eth': u'ð',
        u'ETH': u'Ð',
        u'mdash': u'—',
        u'ndash': u'–',
        u'sect': u'§',
        u'para': u'¶',
        u'uarr': u'↑',
        u'darr': u'↓',
        u'larr': u'←',
        u'rarr': u'→',
        u'dagger': u'†',
        u'Dagger': u'‡',
        u'permil': u'‰',
        u'prod': u'∏',
        u'infin': u'∞',
        u'radic': u'√',
        u'there4': u'∴',
        u'int': u'∫',
        u'asymp': u'≈',
        u'ne': u'≠',
        u'equiv': u'≡',
        u'le': u'≤',
        u'ge': u'≥',
        u'loz': u'⋄',
        u'sum': u'∑',
        u'part': u'∂',
        u'prime': u'′',
        u'Prime': u'″',
        u'harr': u'↔',
        u'micro': u'µ',
        u'not': u'¬',
        u'plusmn': u'±',
        u'divide': u'÷',
        u'cent': u'¢',
        u'euro': u'€',
    }

    # Tags that start a new block of output text.
    blockleveltags = [
        u'h1',
        u'h2',
        u'h3',
        u'h4',
        u'h5',
        u'h6',
        u'pre',
        u'p',
        u'ul',
        u'ol',
        u'dl',
        u'li',
        u'dt',
        u'dd',
        u'div',
        u'blockquote',
    ]

    # Tags that open a list; nesting them changes the indent level.
    liststarttags = [
        u'ul',
        u'ol',
        u'dl',
    ]

    # Block tags that may themselves contain other blocks, so opening one
    # does not implicitly close the block that is currently open.
    cancontainflow = [
        u'div',
        u'li',
        u'dd',
        u'blockquote',
    ]
206
207     def __init__(self,textwidth=70):
208         self.text = u''
209         self.curdata = u''
210         self.textwidth = textwidth
211         self.opentags = []
212         self.indentlevel = 0
213         self.ignorenodata = False
214         self.listcount = []
215         self.urls = []
216         self.images = {}
217         HTMLParser.__init__(self)
218
219     def handle_starttag(self, tag, attrs):
220         tag_name = tag.lower()
221         if tag_name in self.blockleveltags:
222             # handle starting a new block - unless we're in a block element
223             # that can contain other blocks, we'll assume that we want to close
224             # the container
225             if len(self.opentags) > 1 and self.opentags[-1] == u'li':
226                 self.handle_curdata()
227
228             if tag_name == u'ol':
229                 self.handle_curdata()
230                 self.listcount.append(1)
231                 self.listlevel = len(self.listcount) - 1
232
233             if tag_name == u'dl':
234                 self.indentlevel = self.indentlevel + 4
235
236             if tag_name in self.liststarttags:
237                 smallist = self.opentags[-3:-1]
238                 smallist.reverse()
239                 for prev_listtag in smallist:
240                     if prev_listtag in [u'dl', u'ol']:
241                         self.indentlevel = self.indentlevel + 4
242                         break
243                     elif prev_listtag == u'ul':
244                         self.indentlevel = self.indentlevel + 3
245                         break
246
247             if len(self.opentags) > 0:
248                 self.handle_curdata()
249                 if tag_name not in self.cancontainflow:
250                     self.opentags.pop()
251             self.opentags.append(tag_name)
252         else:
253             if tag_name == "span":
254                 return
255             listcount = 0
256             try:
257                 listcount = self.listcount[-1]
258             except:
259                 pass
260
261             if tag_name == u'dd' and len(self.opentags) > 1 \
262                 and self.opentags[-1] == u'dt':
263                 self.handle_curdata()
264                 self.opentags.pop()
265             elif tag_name == u'dt' and len(self.opentags) > 1 \
266                 and self.opentags[-1] == u'dd':
267                 self.handle_curdata()
268                 self.opentags.pop()
269             elif tag_name == u'a':
270                 for attr in attrs:
271                     if attr[0].lower() == u'href':
272                         self.urls.append(attr[1].decode('utf-8'))
273                 self.curdata = self.curdata + u'`'
274                 self.opentags.append(tag_name)
275                 return
276             elif tag_name == u'img':
277                 self.handle_image(attrs)
278                 return
279             elif tag_name == u'br':
280                 self.handle_br()
281                 return
282             else:
283                 # we don't know the tag, so lets avoid handling it!
284                 return 
285
286     def handle_startendtag(self, tag, attrs):
287         if tag.lower() == u'br':
288             self.handle_br()
289         elif tag.lower() == u'img':
290             self.handle_image(attrs)
291             return
292
293     def handle_br(self):
294             self.handle_curdata()
295             self.opentags.append(u'br')
296             self.handle_curdata()
297             self.opentags.pop()
298
299     def handle_image(self, attrs):
300         alt = u''
301         url = u''
302         for attr in attrs:
303             if attr[0] == 'alt':
304                 alt = attr[1].decode('utf-8')
305             elif attr[0] == 'src':
306                 url = attr[1].decode('utf-8')
307         if url:
308             if alt:
309                 if self.images.has_key(alt):
310                     if self.images[alt]["url"] == url:
311                         self.curdata = self.curdata \
312                             + u'|%s|' %(alt,)
313                     else:
314                         while self.images.has_key(alt):
315                             alt = alt + "_"
316                         self.images[alt] = {"url": url}
317                         self.curdata = self.curdata \
318                             + u'|%s|' %(alt,)
319                 else:
320                     self.images[alt] = {"url": url}
321                     self.curdata = self.curdata \
322                         + u'|%s|' %(alt,)
323             else:
324                 if self.images.has_key(url):
325                     self.curdata = self.curdata \
326                         + u'|%s|' %(url,)
327                 else:
328                     self.images[url] = {}
329                     self.images[url]["url"] =url
330                     self.curdata = self.curdata \
331                         + u'|%s|' %(url,)
332
    def handle_curdata(self):
        """Flush the data collected in self.curdata into self.text.

        The data is formatted according to the innermost open tag (the
        last entry of self.opentags).  Nothing happens when no tag is
        open, when no data is pending, or - except for the 'br' marker -
        when the pending data is only whitespace.  For block level tags
        the pending data is cleared afterwards; for inline tags it is
        kept so that it becomes part of the enclosing block.
        """

        if len(self.opentags) == 0:
            return

        tag_thats_done = self.opentags[-1]

        if len(self.curdata) == 0:
            return

        # 'br' is the temporary marker pushed by handle_br(): end the
        # current line unless the text already ends with a newline, and
        # remember that so the next block does not add a blank line too
        if tag_thats_done == u'br':
            if len(self.text) == 0 or self.text[-1] != '\n':
                self.text = self.text + '\n'
                self.ignorenodata = True
            return

        if len(self.curdata.strip()) == 0:
            return

        # separate this block from the text written so far: list and
        # definition items start on a new line, other blocks are preceded
        # by a blank line
        if tag_thats_done in self.blockleveltags:
            newlinerequired = self.text != u''
            if self.ignorenodata:
                newlinerequired = False
            self.ignorenodata = False
            if newlinerequired:
                if tag_thats_done in [u'dt', u'dd', u'li'] \
                    and len(self.text) > 1 \
                    and self.text[-1] != u'\n':
                        self.text = self.text + u'\n'
                elif len(self.text) > 2 \
                    and self.text[-1] != u'\n' \
                    and self.text[-2] != u'\n':
                    self.text = self.text + u'\n\n'

        # headings: whitespace is collapsed, the text is wrapped and then
        # underlined with '=' (h1), '-' (h2) or '~' (h3-h6)
        if tag_thats_done in ["h1", "h2", "h3", "h4", "h5", "h6"]:
            underline = u''
            underlinechar = u'='
            headingtext = " ".join(self.curdata.split())
            seperator = u'\n' + u' '*self.indentlevel
            headingtext = seperator.join( \
                textwrap.wrap( \
                    headingtext, \
                    self.textwidth - self.indentlevel \
                    ) \
                )

            if tag_thats_done == u'h2':
                underlinechar = u'-'
            elif tag_thats_done != u'h1':
                underlinechar = u'~'

            # a heading that wrapped over several lines is underlined
            # across the full width, a single line heading only as far as
            # its text reaches
            if u'\n' in headingtext:
                underline = u' ' * self.indentlevel \
                    + underlinechar * (self.textwidth - self.indentlevel)
            else:
                underline = u' ' * self.indentlevel \
                    + underlinechar * len(headingtext)
            self.text = self.text \
                + headingtext + u'\n' \
                + underline
        # paragraphs: whitespace collapsed, wrapped, every line indented
        # NOTE(review): the encode/split/decode round trip used here and
        # in the branches below makes split() work on the UTF-8 byte
        # string rather than the unicode string - presumably on purpose;
        # confirm before simplifying it
        elif tag_thats_done in [u'p', u'div']:
            paragraph = unicode( \
                " ".join(self.curdata.strip().encode("utf-8").split()), \
                "utf-8")
            seperator = u'\n' + u' ' * self.indentlevel
            self.text = self.text \
                + u' ' * self.indentlevel \
                + seperator.join( \
                    textwrap.wrap( \
                        paragraph, self.textwidth - self.indentlevel))
        # preformatted text is appended as it is
        elif tag_thats_done == "pre":
            self.text = self.text + unicode( \
                self.curdata.encode("utf-8"), "utf-8")
        # block quotes start on a new line and are prefixed with four
        # spaces
        elif tag_thats_done == u'blockquote':
            quote = unicode( \
                " ".join(self.curdata.encode("utf-8").strip().split()), \
                "utf-8")
            seperator = u'\n' + u' ' * self.indentlevel + u'    '
            if len(self.text) > 0 and self.text[-1] != u'\n':
                self.text = self.text + u'\n'
            self.text = self.text \
                + u'    ' \
                + seperator.join( \
                    textwrap.wrap( \
                        quote, \
                        self.textwidth - self.indentlevel - 2 \
                    )
                )
            self.curdata = u''
        # list items: a marker followed by the wrapped text, continuation
        # lines aligned with the text of the first line
        elif tag_thats_done == "li":
            item = unicode(self.curdata.encode("utf-8").strip(), "utf-8")
            if len(self.text) > 0 and self.text[-1] != u'\n':
                self.text = self.text + u'\n'
            # work out if we're in an ol rather than a ul
            latesttags = self.opentags[-4:]
            latesttags.reverse()
            # isul stays None when neither a ul nor an ol is found among
            # the last four open tags
            isul = None
            for thing in latesttags:
                if thing == 'ul':
                    isul = True
                    break
                elif thing == 'ol':
                    isul = False
                    break

            listindent = 3
            if not isul:
                listindent = 4

            # bullet by default; a number only when an ol was found, in
            # which case the counter of the innermost ol is advanced
            listmarker = u' * '
            if isul == False:
                listmarker = u' %2d. ' %(self.listcount[-1])
                self.listcount[-1] = self.listcount[-1] + 1

            seperator = u'\n' \
                + u' ' * self.indentlevel \
                + u' ' * listindent
            self.text = self.text \
                + u' ' * self.indentlevel \
                + listmarker \
                + seperator.join( \
                    textwrap.wrap( \
                        item, \
                        self.textwidth - self.indentlevel - listindent \
                    ) \
                )
            self.curdata = u''
        # definition terms: written four columns to the left of the
        # current indent level and followed by "::"
        elif tag_thats_done == u'dt':
            definition = unicode(" ".join( \
                    self.curdata.encode("utf-8").strip().split()), \
                "utf-8")
            if len(self.text) > 0 and self.text[-1] != u'\n':
                self.text = self.text + u'\n\n'
            elif len(self.text) > 1 and self.text[-2] != u'\n':
                self.text = self.text + u'\n'
            definition = u' ' * (self.indentlevel - 4) + definition + "::"
            indentstring = u'\n' + u' ' * (self.indentlevel - 3)
            self.text = self.text \
                + indentstring.join(
                    textwrap.wrap(definition, \
                        self.textwidth - self.indentlevel - 4))
            self.curdata = u''
        # definition descriptions: every line, including the first one,
        # starts on a new line at the current indent level
        elif tag_thats_done == u'dd':
            definition = unicode(" ".join( \
                    self.curdata.encode("utf-8").strip().split()),
                "utf-8")
            if len(definition) > 0:
                if len(self.text) > 0 and self.text[-1] != u'\n':
                    self.text = self.text + u'\n'
                indentstring = u'\n' + u' ' * self.indentlevel
                self.text = self.text \
                    + indentstring \
                    + indentstring.join( \
                        textwrap.wrap( \
                            definition, \
                            self.textwidth - self.indentlevel \
                            ) \
                        )
                self.curdata = u''
        # links: close the backquote that handle_starttag() opened; the
        # data is kept, as 'a' is not a block level tag
        elif tag_thats_done == u'a':
            self.curdata = self.curdata + u'`__'
            pass
        # data directly inside ul/ol/dl is not written anywhere
        elif tag_thats_done in self.liststarttags:
            pass

        # the data of a block level tag has been dealt with by now
        if tag_thats_done in self.blockleveltags:
            self.curdata = u''

        self.ignorenodata = False
502
503     def handle_endtag(self, tag):
504         self.ignorenodata = False
505         if tag == "span":
506             return
507
508         try:
509             tagindex = self.opentags.index(tag)
510         except:
511             return
512         tag = tag.lower()
513
514         if tag in [u'br', u'img']:
515             return
516
517         if tag == u'dl':
518             self.indentlevel = self.indentlevel - 4
519
520         if tag in self.liststarttags:
521             if tag in [u'ol', u'dl', u'ul', u'dd']:
522                 self.handle_curdata()
523                 # find if there was a previous list level
524                 smalllist = self.opentags[:-1]
525                 smalllist.reverse()
526                 for prev_listtag in smalllist:
527                     if prev_listtag in [u'ol', u'dl']:
528                         self.indentlevel = self.indentlevel - 4
529                         break
530                     elif prev_listtag == u'ul':
531                         self.indentlevel = self.indentlevel - 3
532                         break
533
534         if tag == u'ol':
535             self.listcount = self.listcount[:-1]
536
537         while tagindex < len(self.opentags) \
538             and tag in self.opentags[tagindex+1:]:
539             try:
540                 tagindex = self.opentags.index(tag, tagindex+1)
541             except:
542                 # well, we don't want to do that then
543                 pass
544         if tagindex != len(self.opentags) - 1:
545             # Assuming the data was for the last opened tag first
546             self.handle_curdata()
547             # Now kill the list to be a slice before this tag was opened
548             self.opentags = self.opentags[:tagindex + 1]
549         else:
550             self.handle_curdata()
551             if self.opentags[-1] == tag:
552                 self.opentags.pop()
553
554     def handle_data(self, data):
555         if len(self.opentags) == 0:
556             self.opentags.append(u'p')
557         self.curdata = self.curdata + data.decode("utf-8")
558
559     def handle_charref(self, name):
560         entity = unichr(int(name))
561         self.curdata = self.curdata + unicode(entity.encode('utf-8'), \
562             "utf-8")
563
564     def handle_entityref(self, name):
565         entity = name
566         if HTML2Text.entities.has_key(name):
567             entity = HTML2Text.entities[name]
568         else:
569             entity = "&" + name + ";"
570
571         self.curdata = self.curdata + unicode(entity.encode('utf-8'), \
572             "utf-8")
573
574     def gettext(self):
575         self.handle_curdata()
576         if len(self.text) == 0 or self.text[-1] != u'\n':
577             self.text = self.text + u'\n'
578         self.opentags = []
579         if len(self.text) > 0:
580             while len(self.text) > 1 and self.text[-1] == u'\n':
581                 self.text = self.text[:-1]
582             self.text = self.text + u'\n'
583         if len(self.urls) > 0:
584             self.text = self.text + u'\n__ ' + u'\n__ '.join(self.urls) + u'\n'
585             self.urls = []
586         if len(self.images.keys()) > 0:
587             self.text = self.text + u'\n.. ' \
588                 + u'\n.. '.join( \
589                     ["|%s| image:: %s" %(a, self.images[a]["url"]) \
590                 for a in self.images.keys()]) + u'\n'
591             self.images = {}
592         return self.text
593
def open_url(method, url):
    """Send *method* (e.g. "GET" or "HEAD") to *url* and return the response.

    Redirects (301, 302, 303 and 307) are followed.  At most three
    requests are made in total; a request that fails or that gets any
    other status than 200 counts as one of them and is retried.

    Returns the httplib response object of the first request that was
    answered with status 200, or None when there was none.  The
    connection of a returned response is left open so that the caller
    can read the body.
    """
    # only needed to resolve relative redirect targets
    import urlparse

    redirectcount = 0
    while redirectcount < 3:
        (scheme, rest) = urllib.splittype(url)
        (host, path) = urllib.splithost(rest)
        if host is None:
            # not an absolute URL, so there is nothing to connect to
            return None
        (host, port) = urllib.splitport(host)

        connection_class = httplib.HTTPConnection
        default_port = 80
        if scheme == "https" and hasattr(httplib, "HTTPSConnection"):
            connection_class = httplib.HTTPSConnection
            default_port = 443
        if port is None:
            port = default_port
        if not path:
            # "http://host" has an empty path, but a request needs one
            path = "/"

        conn = connection_class("%s:%s" %(host, port))
        response = None
        try:
            conn.request(method, path)
            response = conn.getresponse()
        except (httplib.HTTPException, socket.error):
            # best effort: a failed attempt just uses up one try
            pass

        if response is not None:
            if response.status == 200:
                return response
            if response.status in [301, 302, 303, 307]:
                location = response.getheader("location")
                if location:
                    # the Location header may be relative to the old URL
                    url = urlparse.urljoin(url, location)

        # closing the connection also closes its response, so this is
        # only done for responses that are not handed to the caller
        conn.close()
        redirectcount = redirectcount + 1
    return None
617
def _random_token(length):
    """Return a string of *length* random ASCII letters and digits."""
    alphabet = string.ascii_letters + string.digits
    return "".join([random.choice(alphabet) for _ in range(length)])

def _fetch_feed(feeddb, url):
    """Download the feed at *url* unless it is known to be unchanged.

    For a feed that was fetched before, a HEAD request is made first and
    its content-length, etag, last-modified and content-md5 headers are
    compared with the values stored in *feeddb*.

    Returns the response of the GET request, or None when the feed has
    not changed or could not be fetched (the latter is reported on
    stderr).
    """
    if feeddb.has_key(url):
        stored = cgi.parse_qs(feeddb[url])
        ischanged = False
        response = open_url("HEAD", url)
        if response is None:
            # without headers to compare, assume the feed has changed
            ischanged = True
        else:
            for (name, value) in response.getheaders():
                if name not in \
                    ["content-length", "etag", "last-modified", "content-md5"]:
                    continue
                # a header that was not stored last time counts as a change
                if name not in stored or stored[name][0] != value:
                    ischanged = True
        if not ischanged:
            return None # don't need to do anything, nothings changed.

    response = open_url("GET", url)
    if response is None:
        sys.stderr.write("Failed to fetch feed: %s\n" %(url))
    return response

def _build_message(url, item, content, prevmessageid):
    """Build the multipart (text and HTML) email for one feed item.

    Returns a (message, message id, date header value) tuple.
    """
    try:
        author = item["author"]
    except KeyError:
        author = url

    msg = MIMEMultipart("alternative")
    messageid = "<" \
        + datetime.datetime.now().strftime("%Y%m%d%H%M") \
        + "." + _random_token(6) \
        + "@" + socket.gethostname() + ">"
    msg.add_header("Message-ID", messageid)
    msg.set_unixfrom("\"%s\" <rss2maildir@localhost>" %(url))
    msg.add_header("From", "\"%s\" <rss2maildir@localhost>" %(author))
    msg.add_header("To", "\"%s\" <rss2maildir@localhost>" %(url))
    if prevmessageid:
        msg.add_header("References", prevmessageid)

    # use the item's own timestamp when it has a usable one, else now
    createddate = datetime.datetime.now() \
        .strftime("%a, %e %b %Y %T -0000")
    try:
        createddate = datetime.datetime(*item["updated_parsed"][0:6]) \
            .strftime("%a, %e %b %Y %T -0000")
    except (KeyError, TypeError, ValueError):
        pass
    msg.add_header("Date", createddate)

    # NOTE(review): gettext() output always ends with a newline (and
    # long titles are wrapped) - confirm that the email package copes
    # with newlines in a header value
    subj_gen = HTML2Text()
    subj_gen.feed(item["title"].encode("utf-8"))
    msg.add_header("Subject", subj_gen.gettext())
    msg.set_default_type("text/plain")

    htmlcontent = "%s\n\n<p>Item URL: <a href='%s'>%s</a></p>" %( \
        content, \
        item["link"], \
        item["link"] )
    htmlpart = MIMEText(htmlcontent.encode("utf-8"), "html", "utf-8")
    textparser = HTML2Text()
    textparser.feed(content.encode("utf-8"))
    textcontent = "%s\n\nItem URL: %s" %( \
        textparser.gettext(), \
        item["link"] )
    textpart = MIMEText(textcontent.encode("utf-8"), "plain", "utf-8")
    msg.attach(textpart)
    msg.attach(htmlpart)

    return (msg, messageid, createddate)

def _write_to_maildir(maildir, msg):
    """Write *msg* to the tmp directory of *maildir*, then move it to new."""
    # work out the filename we should be writing to, we do this
    # following the normal maildir style rules
    fname = str(os.getpid()) \
        + "." + socket.gethostname() \
        + "." + _random_token(10) \
        + "." + datetime.datetime.now().strftime('%s')
    fn = os.path.join(maildir, "tmp", fname)
    fh = open(fn, "w")
    try:
        fh.write(msg.as_string())
    finally:
        fh.close()
    # now move it in to the new directory
    newfn = os.path.join(maildir, "new", fname)
    os.link(fn, newfn)
    os.unlink(fn)

def _deliver_item(maildir, url, item, db):
    """Deliver one feed item, unless it was delivered before unchanged.

    *db* is the "seen" database.  It maps "<feed url>|<guid or link>" to
    the url-encoded message ids, creation date and content md5 of what
    was delivered for that item.
    """
    # has_key() rather than "in" is used on the item and on the
    # database, as that is what both are known to support
    if item.has_key("content"):
        content = item["content"][0]["value"]
    else:
        content = item["summary"]

    md5sum = md5.md5(content.encode("utf-8")).hexdigest()

    # check if there's a guid - if that exists and we match the md5,
    # the item was delivered before
    if item.has_key("guid"):
        guidkey = url + "|" + item["guid"]
        if db.has_key(guidkey):
            data = cgi.parse_qs(db[guidkey])
            if data["contentmd5"][0] == md5sum:
                return

    prevmessageid = None
    linkkey = url + "|" + item["link"]
    if db.has_key(linkkey):
        data = cgi.parse_qs(db[linkkey])
        if data.has_key("message-id"):
            prevmessageid = data["message-id"][0]
        if data["contentmd5"][0] == md5sum:
            return

    (msg, messageid, createddate) = \
        _build_message(url, item, content, prevmessageid)
    _write_to_maildir(maildir, msg)

    # now add to the database about the item
    if prevmessageid:
        messageid = prevmessageid + " " + messageid
    record = urllib.urlencode(( \
        ("message-id", messageid), \
        ("created", createddate), \
        ("contentmd5", md5sum) \
        ))
    if item.has_key("guid") and item["guid"] != item["link"]:
        db[url + "|" + item["guid"]] = record
        # an existing entry for the link keeps its creation date and
        # md5 and only gets the new message ids; without a usable old
        # entry the new record is stored under the link as well
        linkrecord = record
        if db.has_key(linkkey):
            old = cgi.parse_qs(db[linkkey])
            if "created" in old and "contentmd5" in old:
                linkrecord = urllib.urlencode(( \
                    ("message-id", messageid), \
                    ("created", old["created"][0]), \
                    ("contentmd5", old["contentmd5"][0]) \
                    ))
        db[linkkey] = linkrecord
    else:
        db[linkkey] = record

def parse_and_deliver(maildir, url, statedir):
    """Fetch the feed at *url* and deliver its new items to *maildir*.

    maildir  -- Maildir to deliver to; its tmp and new directories have
                to exist
    url      -- URL of the feed
    statedir -- directory holding the "feeds" database (the headers the
                feed was last served with) and the "seen" database (the
                items that have been delivered)

    Nothing is delivered when the feed has not changed since it was last
    fetched, or when it can not be fetched.  Both databases are closed
    again, whatever happens.
    """
    feeddb = dbm.open(os.path.join(statedir, "feeds"), "c")
    try:
        feedhandle = _fetch_feed(feeddb, url)
        if feedhandle is None:
            return
        headers = feedhandle.getheaders()

        fp = feedparser.parse(feedhandle)
        db = dbm.open(os.path.join(statedir, "seen"), "c")
        try:
            for item in fp["items"]:
                _deliver_item(maildir, url, item, db)
        finally:
            db.close()

        # remember the headers that show whether the feed has changed
        # the next time round
        if headers:
            data = []
            for header in headers:
                if header[0] in \
                    ["content-md5", "etag", "last-modified", "content-length"]:
                    data.append((header[0], header[1]))
            if len(data) > 0:
                feeddb[url] = urllib.urlencode(data)
    finally:
        feeddb.close()
808
809 if __name__ == "__main__":
810     # This only gets executed if we really called the program
811     # first off, parse the command line arguments
812
813     oparser = OptionParser()
814     oparser.add_option(
815         "-c", "--conf", dest="conf",
816         help="location of config file"
817         )
818     oparser.add_option(
819         "-s", "--statedir", dest="statedir",
820         help="location of directory to store state in"
821         )
822
823     (options, args) = oparser.parse_args()
824
825     # check for the configfile
826
827     configfile = None
828
829     if options.conf != None:
830         # does the file exist?
831         try:
832             os.stat(options.conf)
833             configfile = options.conf
834         except:
835             # should exit here as the specified file doesn't exist
836             sys.stderr.write( \
837                 "Config file %s does not exist. Exiting.\n" %(options.conf,))
838             sys.exit(2)
839     else:
840         # check through the default locations
841         try:
842             os.stat("%s/.rss2maildir.conf" %(os.environ["HOME"],))
843             configfile = "%s/.rss2maildir.conf" %(os.environ["HOME"],)
844         except:
845             try:
846                 os.stat("/etc/rss2maildir.conf")
847                 configfile = "/etc/rss2maildir.conf"
848             except:
849                 sys.stderr.write("No config file found. Exiting.\n")
850                 sys.exit(2)
851
852     # Right - if we've got this far, we've got a config file, now for the hard
853     # bits...
854
855     scp = SafeConfigParser()
856     scp.read(configfile)
857
858     maildir_root = "RSSMaildir"
859     state_dir = "state"
860
861     if options.statedir != None:
862         state_dir = options.statedir
863         try:
864             mode = os.stat(state_dir)[stat.ST_MODE]
865             if not stat.S_ISDIR(mode):
866                 sys.stderr.write( \
867                     "State directory (%s) is not a directory\n" %(state_dir))
868                 sys.exit(1)
869         except:
870             # try to make the directory
871             try:
872                 os.mkdir(state_dir)
873             except:
874                 sys.stderr.write("Couldn't create statedir %s" %(state_dir))
875                 sys.exit(1)
876     elif scp.has_option("general", "state_dir"):
877         new_state_dir = scp.get("general", "state_dir")
878         try:
879             mode = os.stat(new_state_dir)[stat.ST_MODE]
880             if not stat.S_ISDIR(mode):
881                 sys.stderr.write( \
882                     "State directory (%s) is not a directory\n" %(state_dir))
883                 sys.exit(1)
884             else:
885                 state_dir = new_state_dir
886         except:
887             # try to create it
888             try:
889                 os.mkdir(new_state_dir)
890                 state_dir = new_state_dir
891             except:
892                 sys.stderr.write( \
893                     "Couldn't create state directory %s\n" %(new_state_dir))
894                 sys.exit(1)
895     else:
896         try:
897             mode = os.stat(state_dir)[stat.ST_MODE]
898             if not stat.S_ISDIR(mode):
899                 sys.stderr.write( \
900                     "State directory %s is not a directory\n" %(state_dir))
901                 sys.exit(1)
902         except:
903             try:
904                 os.mkdir(state_dir)
905             except:
906                 sys.stderr.write( \
907                     "State directory %s could not be created\n" %(state_dir))
908                 sys.exit(1)
909
910     if scp.has_option("general", "maildir_root"):
911         maildir_root = scp.get("general", "maildir_root")
912
913     try:
914         mode = os.stat(maildir_root)[stat.ST_MODE]
915         if not stat.S_ISDIR(mode):
916             sys.stderr.write( \
917                 "Maildir Root %s is not a directory\n" \
918                 %(maildir_root))
919             sys.exit(1)
920     except:
921         try:
922             os.mkdir(maildir_root)
923         except:
924             sys.stderr.write("Couldn't create Maildir Root %s\n" \
925                 %(maildir_root))
926             sys.exit(1)
927
928     feeds = scp.sections()
929     try:
930         feeds.remove("general")
931     except:
932         pass
933
934     for section in feeds:
935         # check if the directory exists
936         maildir = None
937         try:
938             maildir = scp.get(section, "maildir")
939         except:
940             maildir = section
941
942         maildir = urllib.urlencode(((section, maildir),)).split("=")[1]
943         maildir = os.path.join(maildir_root, maildir)
944
945         try:
946             exists = os.stat(maildir)
947             if stat.S_ISDIR(exists[stat.ST_MODE]):
948                 # check if there's a new, cur and tmp directory
949                 try:
950                     mode = os.stat(os.path.join(maildir, "cur"))[stat.ST_MODE]
951                 except:
952                     os.mkdir(os.path.join(maildir, "cur"))
953                     if not stat.S_ISDIR(mode):
954                         sys.stderr.write("Broken maildir: %s\n" %(maildir))
955                 try:
956                     mode = os.stat(os.path.join(maildir, "tmp"))[stat.ST_MODE]
957                 except:
958                     os.mkdir(os.path.join(maildir, "tmp"))
959                     if not stat.S_ISDIR(mode):
960                         sys.stderr.write("Broken maildir: %s\n" %(maildir))
961                 try:
962                     mode = os.stat(os.path.join(maildir, "new"))[stat.ST_MODE]
963                     if not stat.S_ISDIR(mode):
964                         sys.stderr.write("Broken maildir: %s\n" %(maildir))
965                 except:
966                     os.mkdir(os.path.join(maildir, "new"))
967             else:
968                 sys.stderr.write("Broken maildir: %s\n" %(maildir))
969         except:
970             try:
971                 os.mkdir(maildir)
972             except:
973                 sys.stderr.write("Couldn't create root maildir %s\n" \
974                     %(maildir))
975                 sys.exit(1)
976             try:
977                 os.mkdir(os.path.join(maildir, "new"))
978                 os.mkdir(os.path.join(maildir, "cur"))
979                 os.mkdir(os.path.join(maildir, "tmp"))
980             except:
981                 sys.stderr.write( \
982                     "Couldn't create required maildir directories for %s\n" \
983                     %(section,))
984                 sys.exit(1)
985
986         # right - we've got the directories, we've got the section, we know the
987         # url... lets play!
988
989         parse_and_deliver(maildir, section, state_dir)