Fix bug when link/guid contains characters not in ascii by encoding the keys as utf-8
[rss2maildir.git] / rss2maildir.py
1 #!/usr/bin/python
2 # coding=utf-8
3
4 # rss2maildir.py - RSS feeds to Maildir 1 email per item
5 # Copyright (C) 2007  Brett Parker <iDunno@sommitrealweird.co.uk>
6
7 # This program is free software: you can redistribute it and/or modify
8 # it under the terms of the GNU General Public License as published by
9 # the Free Software Foundation, either version 3 of the License, or
10 # (at your option) any later version.
11
12 # This program is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 # GNU General Public License for more details.
16
17 # You should have received a copy of the GNU General Public License
18 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
19
20 import sys
21 import os
22 import stat
23 import httplib
24 import urllib
25
26 import feedparser
27
28 from email.MIMEMultipart import MIMEMultipart
29 from email.MIMEText import MIMEText
30
31 import datetime
32 import random
33 import string
34 import textwrap
35
36 import socket
37
38 from optparse import OptionParser
39 from ConfigParser import SafeConfigParser
40
41 from base64 import b64encode
42 import md5
43
44 import cgi
45 import dbm
46
47 import re
48
49 from HTMLParser import HTMLParser
50
class HTML2Text(HTMLParser):
    """Convert an HTML fragment into wrapped plain text.

    Markup is fed in through the usual HTMLParser ``feed()`` call and the
    rendered text is collected with ``gettext()``.  Block level elements are
    wrapped to ``textwidth`` columns, links are rendered as
    reStructuredText-style anonymous references (```text`__`` in the body
    with a ``__ url`` line at the end) and images as ``|alt|``
    substitutions with a ``.. |alt| image:: url`` line at the end.

    Text handed to the parser may be UTF-8 encoded byte strings or unicode
    strings; everything is stored internally as unicode.
    """

    # named character references understood by handle_entityref
    entities = {
        u'amp': u'&',
        u'lt': u'<',
        u'gt': u'>',
        u'pound': u'£',
        u'copy': u'©',
        u'apos': u'\'',
        u'quot': u'"',
        u'nbsp': u' ',
        u'ldquo': u'“',
        u'rdquo': u'”',
        u'lsquo': u'‘',
        u'rsquo': u'’',
        u'laquo': u'«',
        u'raquo': u'»',
        u'lsaquo': u'‹',
        u'rsaquo': u'›',
        u'bull': u'•',
        u'middot': u'·',
        u'deg': u'°',
        u'hellip': u'…',
        u'trade': u'™',
        u'reg': u'®',
        u'agrave': u'à',
        u'Agrave': u'À',
        u'egrave': u'è',
        u'Egrave': u'È',
        u'igrave': u'ì',
        u'Igrave': u'Ì',
        u'ograve': u'ò',
        u'Ograve': u'Ò',
        u'ugrave': u'ù',
        u'Ugrave': u'Ù',
        u'aacute': u'á',
        u'Aacute': u'Á',
        u'eacute': u'é',
        u'Eacute': u'É',
        u'iacute': u'í',
        u'Iacute': u'Í',
        u'oacute': u'ó',
        u'Oacute': u'Ó',
        u'uacute': u'ú',
        u'Uacute': u'Ú',
        u'yacute': u'ý',
        u'Yacute': u'Ý',
        u'acirc': u'â',
        u'Acirc': u'Â',
        u'ecirc': u'ê',
        u'Ecirc': u'Ê',
        u'icirc': u'î',
        u'Icirc': u'Î',
        u'ocirc': u'ô',
        u'Ocirc': u'Ô',
        u'ucirc': u'û',
        u'Ucirc': u'Û',
        u'atilde': u'ã',
        u'Atilde': u'Ã',
        u'ntilde': u'ñ',
        u'Ntilde': u'Ñ',
        u'otilde': u'õ',
        u'Otilde': u'Õ',
        u'auml': u'ä',
        u'Auml': u'Ä',
        u'euml': u'ë',
        u'Euml': u'Ë',
        u'iuml': u'ï',
        u'Iuml': u'Ï',
        u'ouml': u'ö',
        u'Ouml': u'Ö',
        u'uuml': u'ü',
        u'Uuml': u'Ü',
        u'yuml': u'ÿ',
        u'Yuml': u'Ÿ',
        u'iexcl': u'¡',
        u'iquest': u'¿',
        u'ccedil': u'ç',
        u'Ccedil': u'Ç',
        u'oelig': u'œ',
        u'OElig': u'Œ',
        u'szlig': u'ß',
        u'oslash': u'ø',
        u'Oslash': u'Ø',
        u'aring': u'å',
        u'Aring': u'Å',
        u'aelig': u'æ',
        u'AElig': u'Æ',
        u'thorn': u'þ',
        u'THORN': u'Þ',
        u'eth': u'ð',
        u'ETH': u'Ð',
        u'mdash': u'—',
        u'ndash': u'–',
        u'sect': u'§',
        u'para': u'¶',
        u'uarr': u'↑',
        u'darr': u'↓',
        u'larr': u'←',
        u'rarr': u'→',
        u'dagger': u'†',
        u'Dagger': u'‡',
        u'permil': u'‰',
        u'prod': u'∏',
        u'infin': u'∞',
        u'radic': u'√',
        u'there4': u'∴',
        u'int': u'∫',
        u'asymp': u'≈',
        u'ne': u'≠',
        u'equiv': u'≡',
        u'le': u'≤',
        u'ge': u'≥',
        u'loz': u'⋄',
        u'sum': u'∑',
        u'part': u'∂',
        u'prime': u'′',
        u'Prime': u'″',
        u'harr': u'↔',
        u'micro': u'µ',
        u'not': u'¬',
        u'plusmn': u'±',
        u'divide': u'÷',
        u'cent': u'¢',
        u'euro': u'€',
        }

    # tags whose text is flushed into self.text as its own block
    blockleveltags = [
        u'h1',
        u'h2',
        u'h3',
        u'h4',
        u'h5',
        u'h6',
        u'pre',
        u'p',
        u'ul',
        u'ol',
        u'dl',
        u'li',
        u'dt',
        u'dd',
        u'div',
        u'blockquote',
        ]

    # tags that open a list
    liststarttags = [
        u'ul',
        u'ol',
        u'dl',
        ]

    # block tags that are left open when another block starts inside them
    cancontainflow = [
        u'div',
        u'li',
        u'dd',
        u'blockquote',
    ]

    def __init__(self,textwidth=70):
        """Set up an empty converter that wraps text at textwidth columns."""
        self.text = u''             # rendered output so far
        self.curdata = u''          # text collected for the open element
        self.textwidth = textwidth
        self.opentags = []          # currently open tags, innermost last
        self.indentlevel = 0        # current left indent in columns
        self.ignorenodata = False   # set after a <br> emitted a newline
        self.listcount = []         # next item number per open <ol>
        self.urls = []              # link targets, in document order
        self.images = {}            # substitution name -> {"url": ...}
        HTMLParser.__init__(self)

    def _to_unicode(self, value):
        """Return value as a unicode string.

        Byte strings are decoded as UTF-8, unicode strings are passed
        through unchanged and None (an attribute without a value) becomes
        the empty string.
        """
        if value is None:
            return u''
        if isinstance(value, type(u'')):
            return value
        return value.decode('utf-8')

    def handle_starttag(self, tag, attrs):
        """Record an opening tag, flushing pending text where needed."""
        tag_name = tag.lower()
        if tag_name in self.blockleveltags:
            # handle starting a new block - unless we're in a block element
            # that can contain other blocks, we'll assume that we want to close
            # the container
            if len(self.opentags) > 1 and self.opentags[-1] == u'li':
                self.handle_curdata()

            if tag_name == u'ol':
                self.handle_curdata()
                self.listcount.append(1)
                self.listlevel = len(self.listcount) - 1

            if tag_name == u'dl':
                self.indentlevel = self.indentlevel + 4

            if tag_name in self.liststarttags:
                # a list opened inside another list is indented by the
                # width of the enclosing list's marker
                smallist = self.opentags[-3:-1]
                smallist.reverse()
                for prev_listtag in smallist:
                    if prev_listtag in [u'dl', u'ol']:
                        self.indentlevel = self.indentlevel + 4
                        break
                    elif prev_listtag == u'ul':
                        self.indentlevel = self.indentlevel + 3
                        break

            if len(self.opentags) > 0:
                self.handle_curdata()
                if tag_name not in self.cancontainflow:
                    self.opentags.pop()
            self.opentags.append(tag_name)
        else:
            if tag_name == "span":
                return

            # NOTE(review): u'dd' and u'dt' are listed in blockleveltags, so
            # the first two branches below are never reached with the
            # default tag tables.
            if tag_name == u'dd' and len(self.opentags) > 1 \
                and self.opentags[-1] == u'dt':
                self.handle_curdata()
                self.opentags.pop()
            elif tag_name == u'dt' and len(self.opentags) > 1 \
                and self.opentags[-1] == u'dd':
                self.handle_curdata()
                self.opentags.pop()
            elif tag_name == u'a':
                for attr in attrs:
                    if attr[0].lower() == u'href':
                        self.urls.append(self._to_unicode(attr[1]))
                self.curdata = self.curdata + u'`'
                self.opentags.append(tag_name)
                return
            elif tag_name == u'img':
                self.handle_image(attrs)
                return
            elif tag_name == u'br':
                self.handle_br()
                return
            else:
                # we don't know the tag, so lets avoid handling it!
                return

    def handle_startendtag(self, tag, attrs):
        """Handle self-closing tags; only <br/> and <img/> are rendered."""
        if tag.lower() == u'br':
            self.handle_br()
        elif tag.lower() == u'img':
            self.handle_image(attrs)

    def handle_br(self):
        """Flush the pending text, then process a temporary u'br' entry."""
        self.handle_curdata()
        self.opentags.append(u'br')
        self.handle_curdata()
        self.opentags.pop()

    def handle_image(self, attrs):
        """Add a ``|name|`` substitution for an image to the pending text.

        The substitution is named after the alt text, or after the URL when
        there is no alt text.  Images without a src are ignored.  When the
        same alt text is used for different URLs, underscores are appended
        to the name until it is either unused or already refers to this
        URL, so a repeated image reuses its existing name.
        """
        alt = u''
        url = u''
        for attr in attrs:
            attr_name = attr[0].lower()
            if attr_name == 'alt':
                alt = self._to_unicode(attr[1])
            elif attr_name == 'src':
                url = self._to_unicode(attr[1])
        if not url:
            return

        if alt:
            key = alt
            while key in self.images and self.images[key]["url"] != url:
                key = key + u'_'
        else:
            key = url

        if key not in self.images:
            self.images[key] = {"url": url}
        self.curdata = self.curdata + u'|%s|' %(key,)

    def handle_curdata(self):
        """Render the pending text according to the innermost open tag.

        Nothing happens when no tag is open, when there is no pending text
        or, except for u'br', when the pending text is only whitespace.
        The pending text is cleared for block level tags and kept for
        inline ones.
        """

        if len(self.opentags) == 0:
            return

        tag_thats_done = self.opentags[-1]

        if len(self.curdata) == 0:
            return

        if tag_thats_done == u'br':
            if len(self.text) == 0 or self.text[-1] != '\n':
                self.text = self.text + '\n'
                self.ignorenodata = True
            return

        if len(self.curdata.strip()) == 0:
            return

        if tag_thats_done in self.blockleveltags:
            # separate this block from whatever was rendered before it
            newlinerequired = self.text != u''
            if self.ignorenodata:
                newlinerequired = False
            self.ignorenodata = False
            if newlinerequired:
                if tag_thats_done in [u'dt', u'dd', u'li'] \
                    and len(self.text) > 1 \
                    and self.text[-1] != u'\n':
                    self.text = self.text + u'\n'
                elif len(self.text) > 2 \
                    and self.text[-1] != u'\n' \
                    and self.text[-2] != u'\n':
                    self.text = self.text + u'\n\n'

        if tag_thats_done in ["h1", "h2", "h3", "h4", "h5", "h6"]:
            underline = u''
            underlinechar = u'='
            headingtext = " ".join(self.curdata.split())
            seperator = u'\n' + u' '*self.indentlevel
            headingtext = seperator.join( \
                textwrap.wrap( \
                    headingtext, \
                    self.textwidth - self.indentlevel \
                    ) \
                )

            if tag_thats_done == u'h2':
                underlinechar = u'-'
            elif tag_thats_done != u'h1':
                underlinechar = u'~'

            # a wrapped heading is underlined across the full text width
            if u'\n' in headingtext:
                underline = u' ' * self.indentlevel \
                    + underlinechar * (self.textwidth - self.indentlevel)
            else:
                underline = u' ' * self.indentlevel \
                    + underlinechar * len(headingtext)
            self.text = self.text \
                + headingtext + u'\n' \
                + underline
        elif tag_thats_done in [u'p', u'div']:
            # whitespace is collapsed on the UTF-8 encoded bytes
            paragraph = unicode( \
                " ".join(self.curdata.strip().encode("utf-8").split()), \
                "utf-8")
            seperator = u'\n' + u' ' * self.indentlevel
            self.text = self.text \
                + u' ' * self.indentlevel \
                + seperator.join( \
                    textwrap.wrap( \
                        paragraph, self.textwidth - self.indentlevel))
        elif tag_thats_done == "pre":
            self.text = self.text + unicode( \
                self.curdata.encode("utf-8"), "utf-8")
        elif tag_thats_done == u'blockquote':
            quote = unicode( \
                " ".join(self.curdata.encode("utf-8").strip().split()), \
                "utf-8")
            seperator = u'\n' + u' ' * self.indentlevel + u'    '
            if len(self.text) > 0 and self.text[-1] != u'\n':
                self.text = self.text + u'\n'
            self.text = self.text \
                + u'    ' \
                + seperator.join( \
                    textwrap.wrap( \
                        quote, \
                        self.textwidth - self.indentlevel - 2 \
                    )
                )
            self.curdata = u''
        elif tag_thats_done == "li":
            item = unicode(self.curdata.encode("utf-8").strip(), "utf-8")
            if len(self.text) > 0 and self.text[-1] != u'\n':
                self.text = self.text + u'\n'
            # work out if we're in an ol rather than a ul
            latesttags = self.opentags[-4:]
            latesttags.reverse()
            isul = None
            for thing in latesttags:
                if thing == 'ul':
                    isul = True
                    break
                elif thing == 'ol':
                    isul = False
                    break

            listindent = 3
            if not isul:
                listindent = 4

            listmarker = u' * '
            if isul == False:
                listmarker = u' %2d. ' %(self.listcount[-1])
                self.listcount[-1] = self.listcount[-1] + 1

            seperator = u'\n' \
                + u' ' * self.indentlevel \
                + u' ' * listindent
            self.text = self.text \
                + u' ' * self.indentlevel \
                + listmarker \
                + seperator.join( \
                    textwrap.wrap( \
                        item, \
                        self.textwidth - self.indentlevel - listindent \
                    ) \
                )
            self.curdata = u''
        elif tag_thats_done == u'dt':
            definition = unicode(" ".join( \
                    self.curdata.encode("utf-8").strip().split()), \
                "utf-8")
            if len(self.text) > 0 and self.text[-1] != u'\n':
                self.text = self.text + u'\n\n'
            elif len(self.text) > 1 and self.text[-2] != u'\n':
                self.text = self.text + u'\n'
            definition = u' ' * (self.indentlevel - 4) + definition + "::"
            indentstring = u'\n' + u' ' * (self.indentlevel - 3)
            self.text = self.text \
                + indentstring.join(
                    textwrap.wrap(definition, \
                        self.textwidth - self.indentlevel - 4))
            self.curdata = u''
        elif tag_thats_done == u'dd':
            definition = unicode(" ".join( \
                    self.curdata.encode("utf-8").strip().split()),
                "utf-8")
            if len(definition) > 0:
                if len(self.text) > 0 and self.text[-1] != u'\n':
                    self.text = self.text + u'\n'
                indentstring = u'\n' + u' ' * self.indentlevel
                self.text = self.text \
                    + indentstring \
                    + indentstring.join( \
                        textwrap.wrap( \
                            definition, \
                            self.textwidth - self.indentlevel \
                            ) \
                        )
                self.curdata = u''
        elif tag_thats_done == u'a':
            # close the link text opened with a backtick in handle_starttag
            self.curdata = self.curdata + u'`__'
        elif tag_thats_done in self.liststarttags:
            pass

        if tag_thats_done in self.blockleveltags:
            self.curdata = u''

        self.ignorenodata = False

    def handle_endtag(self, tag):
        """Close a tag, flushing the pending text and unwinding indents.

        End tags that do not match any open tag are ignored.  The tag name
        is lower-cased before it is looked up, as open tags are stored in
        lower case.
        """
        self.ignorenodata = False
        tag = tag.lower()
        if tag == "span":
            return

        try:
            tagindex = self.opentags.index(tag)
        except ValueError:
            return

        if tag in [u'br', u'img']:
            return

        if tag == u'dl':
            self.indentlevel = self.indentlevel - 4

        if tag in self.liststarttags:
            if tag in [u'ol', u'dl', u'ul', u'dd']:
                self.handle_curdata()
                # find if there was a previous list level
                smalllist = self.opentags[:-1]
                smalllist.reverse()
                for prev_listtag in smalllist:
                    if prev_listtag in [u'ol', u'dl']:
                        self.indentlevel = self.indentlevel - 4
                        break
                    elif prev_listtag == u'ul':
                        self.indentlevel = self.indentlevel - 3
                        break

        if tag == u'ol':
            self.listcount = self.listcount[:-1]

        # move on to the innermost (last) open occurrence of this tag
        while tag in self.opentags[tagindex + 1:]:
            tagindex = self.opentags.index(tag, tagindex + 1)

        if tagindex != len(self.opentags) - 1:
            # Assuming the data was for the last opened tag first
            self.handle_curdata()
            # Drop everything opened after this tag.
            # NOTE(review): the slice keeps the tag itself on the stack -
            # confirm whether [:tagindex] was intended.
            self.opentags = self.opentags[:tagindex + 1]
        else:
            self.handle_curdata()
            if self.opentags[-1] == tag:
                self.opentags.pop()

    def handle_data(self, data):
        """Collect character data; text outside any tag opens a paragraph."""
        if len(self.opentags) == 0:
            self.opentags.append(u'p')
        self.curdata = self.curdata + self._to_unicode(data)

    def handle_charref(self, name):
        """Append the character for a numeric reference (&#NNN; or &#xHH;).

        References that cannot be converted are kept as literal text.
        """
        try:
            if name[:1] in ('x', 'X'):
                entity = unichr(int(name[1:], 16))
            else:
                entity = unichr(int(name))
        except (ValueError, OverflowError):
            entity = u'&#%s;' %(name,)
        self.curdata = self.curdata + entity

    def handle_entityref(self, name):
        """Append the character for a named reference such as &amp;.

        Names missing from the entities table are kept as literal text.
        """
        if name in HTML2Text.entities:
            entity = HTML2Text.entities[name]
        else:
            entity = u'&' + name + u';'

        self.curdata = self.curdata + entity

    def gettext(self):
        """Flush everything pending and return the rendered text.

        The text ends with exactly one newline, followed by the collected
        link targets and image definitions, if any.  The open tags, link
        targets and images are reset afterwards.
        """
        self.handle_curdata()
        if len(self.text) == 0 or self.text[-1] != u'\n':
            self.text = self.text + u'\n'
        self.opentags = []
        if len(self.text) > 0:
            while len(self.text) > 1 and self.text[-1] == u'\n':
                self.text = self.text[:-1]
            self.text = self.text + u'\n'
        if len(self.urls) > 0:
            self.text = self.text + u'\n__ ' + u'\n__ '.join(self.urls) + u'\n'
            self.urls = []
        if len(self.images.keys()) > 0:
            self.text = self.text + u'\n.. ' \
                + u'\n.. '.join( \
                    ["|%s| image:: %s" %(a, self.images[a]["url"]) \
                for a in self.images.keys()]) + u'\n'
            self.images = {}
        return self.text
604
def open_url(method, url):
    """Issue an HTTP request and follow redirects.

    method is the HTTP method to use (the callers in this file use "HEAD"
    and "GET") and url an absolute http or https URL.

    Returns the httplib response object of the first request answered
    with status 200, or None when the URL has no host part or when no
    such response was obtained within three attempts.  Redirects
    (301, 302, 303, 307), other statuses and connection errors each use
    up one attempt.
    """
    redirectcount = 0
    while redirectcount < 3:
        (scheme, rest) = urllib.splittype(url)
        (host, path) = urllib.splithost(rest)
        if not host:
            # not an absolute URL, there is nothing to connect to
            return None
        (host, port) = urllib.splitport(host)
        if scheme == "https":
            connection_class = httplib.HTTPSConnection
            default_port = 443
        else:
            connection_class = httplib.HTTPConnection
            default_port = 80
        if port is None:
            port = default_port
        if not path:
            # "http://host" has no path, but the request line needs one
            path = "/"
        try:
            conn = connection_class("%s:%s" %(host, port))
            conn.request(method, path)
            response = conn.getresponse()
            if response.status == 200:
                return response
            if response.status in [301, 302, 303, 307]:
                # NOTE(review): a relative Location value is used as is
                for header in response.getheaders():
                    if header[0] == "location":
                        url = header[1]
            conn.close()
        except (httplib.HTTPException, socket.error):
            # treat a failed request as one used up attempt
            pass
        redirectcount = redirectcount + 1
    return None
628
def _feed_changed(headers, stored):
    """Tell whether a feed's HTTP headers differ from the stored ones.

    headers is the list of (name, value) pairs from a HEAD request, or
    None when that request failed; stored is the cgi.parse_qs() result of
    the record saved for the feed.  Only content-length, etag,
    last-modified and content-md5 are compared.  Missing headers or a
    missing stored value count as changed.
    """
    try:
        for header in headers:
            if header[0] in \
                ["content-length", "etag", "last-modified", "content-md5"]:
                if header[1] != stored[header[0]][0]:
                    return True
    except (TypeError, KeyError, IndexError):
        return True
    return False

def _deliver_item(maildir, url, item, db):
    """Write one feed item into the maildir unless it was seen already.

    db is the open "seen" database.  It holds one record per item link
    and, for items that have one, per item guid; each record stores the
    message id(s), the creation date and the md5 sum of the content.  An
    item whose content md5 matches its stored record is skipped.  An item
    seen before with different content is delivered again with a
    References header naming the earlier message id(s).
    """
    # need to work out what the content is first...
    if item.has_key("content"):
        content = item["content"][0]["value"]
    elif item.has_key("description"):
        content = item["description"]
    else:
        content = u''

    md5sum = md5.md5(content.encode("utf-8")).hexdigest()

    prevmessageid = None

    db_link_key = (url + u'|' + item["link"]).encode("utf-8")

    # check if there's a guid too - if that exists and we match the md5,
    # return
    db_guid_key = None
    if item.has_key("guid"):
        db_guid_key = (url + u'|' + item["guid"]).encode("utf-8")
        if db.has_key(db_guid_key):
            data = cgi.parse_qs(db[db_guid_key])
            if data["contentmd5"][0] == md5sum:
                return

    if db.has_key(db_link_key):
        data = cgi.parse_qs(db[db_link_key])
        if data.has_key("message-id"):
            prevmessageid = data["message-id"][0]
        if data["contentmd5"][0] == md5sum:
            return

    try:
        author = item["author"]
    except KeyError:
        author = url

    # create a basic email message
    msg = MIMEMultipart("alternative")
    messageid = "<" \
        + datetime.datetime.now().strftime("%Y%m%d%H%M") \
        + "." \
        + "".join( \
            [random.choice( \
                string.ascii_letters + string.digits \
                ) for a in range(0,6) \
            ]) + "@" + socket.gethostname() + ">"
    msg.add_header("Message-ID", messageid)
    msg.set_unixfrom("\"%s\" <rss2maildir@localhost>" %(url))
    msg.add_header("From", "\"%s\" <rss2maildir@localhost>" %(author))
    msg.add_header("To", "\"%s\" <rss2maildir@localhost>" %(url))
    if prevmessageid:
        msg.add_header("References", prevmessageid)
    # use the item's own timestamp when it has a usable one, else now
    createddate = datetime.datetime.now() \
        .strftime("%a, %e %b %Y %T -0000")
    try:
        createddate = datetime.datetime(*item["updated_parsed"][0:6]) \
            .strftime("%a, %e %b %Y %T -0000")
    except (KeyError, TypeError, ValueError):
        pass
    msg.add_header("Date", createddate)
    subj_gen = HTML2Text()
    title = item["title"]
    title = re.sub(u'<', u'&lt;', title)
    title = re.sub(u'>', u'&gt;', title)
    subj_gen.feed(title.encode("utf-8"))
    msg.add_header("Subject", subj_gen.gettext())
    msg.set_default_type("text/plain")

    htmlcontent = "%s\n\n<p>Item URL: <a href='%s'>%s</a></p>" %( \
        content, \
        item["link"], \
        item["link"] )
    htmlpart = MIMEText(htmlcontent.encode("utf-8"), "html", "utf-8")
    textparser = HTML2Text()
    textparser.feed(content.encode("utf-8"))
    textcontent = textparser.gettext()
    textcontent = "%s\n\nItem URL: %s" %( \
        textcontent, \
        item["link"] )
    textpart = MIMEText(textcontent.encode("utf-8"), "plain", "utf-8")
    msg.attach(textpart)
    msg.attach(htmlpart)

    # start by working out the filename we should be writting to, we do
    # this following the normal maildir style rules
    fname = str(os.getpid()) \
        + "." + socket.gethostname() \
        + "." + "".join( \
            [random.choice( \
                string.ascii_letters + string.digits \
                ) for a in range(0,10) \
            ]) + "." \
        + datetime.datetime.now().strftime('%s')
    fn = os.path.join(maildir, "tmp", fname)
    fh = open(fn, "w")
    try:
        fh.write(msg.as_string())
    finally:
        fh.close()
    # now move it in to the new directory
    newfn = os.path.join(maildir, "new", fname)
    os.link(fn, newfn)
    os.unlink(fn)

    # now add to the database about the item
    if prevmessageid:
        messageid = prevmessageid + " " + messageid
    record = urllib.urlencode(( \
        ("message-id", messageid), \
        ("created", createddate), \
        ("contentmd5", md5sum) \
        ))
    if db_guid_key is not None and item["guid"] != item["link"]:
        db[db_guid_key] = record
        # an existing link record keeps its own created date and md5 and
        # only gets the new message id; otherwise it becomes a copy of
        # the guid record
        try:
            linkdata = cgi.parse_qs(db[db_link_key])
            record = urllib.urlencode(( \
                ("message-id", messageid), \
                ("created", linkdata["created"][0]), \
                ("contentmd5", linkdata["contentmd5"][0]) \
                ))
        except (KeyError, IndexError):
            pass
    db[db_link_key] = record

def parse_and_deliver(maildir, url, statedir):
    """Fetch the feed at url and deliver its unseen items to maildir.

    statedir holds two dbm databases: "feeds", with the HTTP headers last
    seen for each feed URL, and "seen", with a record for every delivered
    item.  A feed that is already known is first checked with a HEAD
    request and skipped when none of the stored headers changed.  A
    failed download is reported on stderr.  Both databases are closed
    again on every way out of the function.
    """
    feeddb = dbm.open(os.path.join(statedir, "feeds"), "c")
    try:
        # first check if we know about this feed already
        if feeddb.has_key(url):
            stored = cgi.parse_qs(feeddb[url])
            response = open_url("HEAD", url)
            headers = None
            if response:
                headers = response.getheaders()
            if not _feed_changed(headers, stored):
                return # don't need to do anything, nothings changed.

        response = open_url("GET", url)
        if response is None:
            sys.stderr.write("Failed to fetch feed: %s\n" %(url))
            return
        headers = response.getheaders()

        fp = feedparser.parse(response)
        db = dbm.open(os.path.join(statedir, "seen"), "c")
        try:
            for item in fp["items"]:
                _deliver_item(maildir, url, item, db)
        finally:
            db.close()

        # remember the headers used for the change check on the next run
        data = []
        for header in headers:
            if header[0] in \
                ["content-md5", "etag", "last-modified", "content-length"]:
                data.append((header[0], header[1]))
        if len(data) > 0:
            feeddb[url] = urllib.urlencode(data)
    finally:
        feeddb.close()
828
def ensure_directory(path):
    """Make sure ``path`` is a directory, creating it when it is missing.

    Returns one of three status strings:

    * ``"ok"``     - ``path`` already is a directory, or was just created
    * ``"notdir"`` - something that is not a directory already lives there
    * ``"failed"`` - ``path`` could not be stat'ed and could not be created

    Only OSError is handled, so a sys.exit() or KeyboardInterrupt raised
    while this runs is never swallowed.
    """
    try:
        mode = os.stat(path)[stat.ST_MODE]
    except OSError:
        # nothing we can stat there - try to make the directory
        try:
            os.mkdir(path)
        except OSError:
            return "failed"
        return "ok"
    if stat.S_ISDIR(mode):
        return "ok"
    return "notdir"


if __name__ == "__main__":
    # This only gets executed if we really called the program
    # first off, parse the command line arguments

    oparser = OptionParser()
    oparser.add_option(
        "-c", "--conf", dest="conf",
        help="location of config file"
        )
    oparser.add_option(
        "-s", "--statedir", dest="statedir",
        help="location of directory to store state in"
        )

    (options, args) = oparser.parse_args()

    # check for the configfile

    configfile = None

    if options.conf is not None:
        # an explicitly requested config file has to exist
        if os.path.exists(options.conf):
            configfile = options.conf
        else:
            sys.stderr.write(
                "Config file %s does not exist. Exiting.\n" % (options.conf,))
            sys.exit(2)
    else:
        # check through the default locations: the per-user file first (only
        # when HOME is set), then the system wide one
        candidates = []
        if "HOME" in os.environ:
            candidates.append(
                "%s/.rss2maildir.conf" % (os.environ["HOME"],))
        candidates.append("/etc/rss2maildir.conf")

        for candidate in candidates:
            if os.path.exists(candidate):
                configfile = candidate
                break

        if configfile is None:
            sys.stderr.write("No config file found. Exiting.\n")
            sys.exit(2)

    # Right - if we've got this far, we've got a config file, now for the hard
    # bits...

    scp = SafeConfigParser()
    scp.read(configfile)

    maildir_root = "RSSMaildir"
    state_dir = "state"

    # Work out which state directory to use: the command line wins over the
    # config file, which wins over the built in default.  Each source keeps
    # its own wording for the two possible error messages.
    if options.statedir is not None:
        state_dir = options.statedir
        notdir_msg = "State directory (%s) is not a directory\n"
        failed_msg = "Couldn't create statedir %s\n"
    elif scp.has_option("general", "state_dir"):
        state_dir = scp.get("general", "state_dir")
        notdir_msg = "State directory (%s) is not a directory\n"
        failed_msg = "Couldn't create state directory %s\n"
    else:
        notdir_msg = "State directory %s is not a directory\n"
        failed_msg = "State directory %s could not be created\n"

    status = ensure_directory(state_dir)
    if status == "notdir":
        sys.stderr.write(notdir_msg % (state_dir,))
        sys.exit(1)
    elif status == "failed":
        sys.stderr.write(failed_msg % (state_dir,))
        sys.exit(1)

    if scp.has_option("general", "maildir_root"):
        maildir_root = scp.get("general", "maildir_root")

    status = ensure_directory(maildir_root)
    if status == "notdir":
        sys.stderr.write(
            "Maildir Root %s is not a directory\n" % (maildir_root,))
        sys.exit(1)
    elif status == "failed":
        sys.stderr.write(
            "Couldn't create Maildir Root %s\n" % (maildir_root,))
        sys.exit(1)

    # every section apart from "general" describes one feed
    feeds = scp.sections()
    if "general" in feeds:
        feeds.remove("general")

    for section in feeds:
        # the maildir name defaults to the section name when the section
        # has no usable "maildir" option
        maildir = None
        try:
            maildir = scp.get(section, "maildir")
        except Exception:
            maildir = section

        # url-encode the name so it is usable as a single path component
        maildir = urllib.urlencode(((section, maildir),)).split("=")[1]
        maildir = os.path.join(maildir_root, maildir)

        # check if the directory exists, creating it if need be
        status = ensure_directory(maildir)
        if status == "failed":
            sys.stderr.write(
                "Couldn't create root maildir %s\n" % (maildir,))
            sys.exit(1)
        elif status == "notdir":
            # only a warning, delivery is still attempted below
            sys.stderr.write("Broken maildir: %s\n" % (maildir,))
        else:
            # check that there's a cur, tmp and new directory
            for subdir in ("cur", "tmp", "new"):
                status = ensure_directory(os.path.join(maildir, subdir))
                if status == "notdir":
                    sys.stderr.write("Broken maildir: %s\n" % (maildir,))
                elif status == "failed":
                    sys.stderr.write(
                        "Couldn't create required maildir directories "
                        "for %s\n" % (section,))
                    sys.exit(1)

        # right - we've got the directories, we've got the section, we know the
        # url... lets play!

        parse_and_deliver(maildir, section, state_dir)