a1e8819f8fcb067eff4cb892c70a460bc6b294c7
[rss2maildir.git] / rss2maildir.py
1 #!/usr/bin/python
2 # coding=utf-8
3
4 # rss2maildir.py - RSS feeds to Maildir 1 email per item
5 # Copyright (C) 2007  Brett Parker <iDunno@sommitrealweird.co.uk>
6
7 # This program is free software: you can redistribute it and/or modify
8 # it under the terms of the GNU General Public License as published by
9 # the Free Software Foundation, either version 3 of the License, or
10 # (at your option) any later version.
11
12 # This program is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 # GNU General Public License for more details.
16
17 # You should have received a copy of the GNU General Public License
18 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
19
20 import sys
21 import os
22 import stat
23 import httplib
24 import urllib
25
26 import feedparser
27
28 from email.MIMEMultipart import MIMEMultipart
29 from email.MIMEText import MIMEText
30
31 import datetime
32 import random
33 import string
34 import textwrap
35
36 import socket
37
38 from optparse import OptionParser
39 from ConfigParser import SafeConfigParser
40
41 from base64 import b64encode
42
43 if sys.version_info[0] == 2 and sys.version_info[1] >= 6:
44     import hashlib as md5
45 else:
46     import md5
47
48 import cgi
49 import dbm
50
51 import re
52
53 from HTMLParser import HTMLParser
54
55 class HTML2Text(HTMLParser):
56     entities = {
57         u'amp': u'&',
58         u'lt': u'<',
59         u'gt': u'>',
60         u'pound': u'£',
61         u'copy': u'©',
62         u'apos': u'\'',
63         u'quot': u'"',
64         u'nbsp': u' ',
65         u'ldquo': u'“',
66         u'rdquo': u'”',
67         u'lsquo': u'‘',
68         u'rsquo': u'’',
69         u'laquo': u'«',
70         u'raquo': u'»',
71         u'lsaquo': u'‹',
72         u'rsaquo': u'›',
73         u'bull': u'•',
74         u'middot': u'·',
75         u'deg': u'°',
76         u'helip': u'…',
77         u'trade': u'™',
78         u'reg': u'®',
79         u'agrave': u'à',
80         u'Agrave': u'À',
81         u'egrave': u'è',
82         u'Egrave': u'È',
83         u'igrave': u'ì',
84         u'Igrave': u'Ì',
85         u'ograve': u'ò',
86         u'Ograve': u'Ò',
87         u'ugrave': u'ù',
88         u'Ugrave': u'Ù',
89         u'aacute': u'á',
90         u'Aacute': u'Á',
91         u'eacute': u'é',
92         u'Eacute': u'É',
93         u'iacute': u'í',
94         u'Iacute': u'Í',
95         u'oacute': u'ó',
96         u'Oacute': u'Ó',
97         u'uacute': u'ú',
98         u'Uacute': u'Ú',
99         u'yactue': u'ý',
100         u'Yacute': u'Ý',
101         u'acirc': u'â',
102         u'Acirc': u'Â',
103         u'ecirc': u'ê',
104         u'Ecirc': u'Ê',
105         u'icirc': u'î',
106         u'Icirc': u'Î',
107         u'ocirc': u'ô',
108         u'Ocirc': u'Ô',
109         u'ucirc': u'û',
110         u'Ucirc': u'Û',
111         u'atilde': u'ã',
112         u'Atilde': u'Ã',
113         u'ntilde': u'ñ',
114         u'Ntilde': u'Ñ',
115         u'otilde': u'õ',
116         u'Otilde': u'Õ',
117         u'auml': u'ä',
118         u'Auml': u'Ä',
119         u'euml': u'ë',
120         u'Euml': u'Ë',
121         u'iuml': u'ï',
122         u'Iuml': u'Ï',
123         u'ouml': u'ö',
124         u'Ouml': u'Ö',
125         u'uuml': u'ü',
126         u'Uuml': u'Ü',
127         u'yuml': u'ÿ',
128         u'Yuml': u'Ÿ',
129         u'iexcl': u'¡',
130         u'iquest': u'¿',
131         u'ccedil': u'ç',
132         u'Ccedil': u'Ç',
133         u'oelig': u'œ',
134         u'OElig': u'Œ',
135         u'szlig': u'ß',
136         u'oslash': u'ø',
137         u'Oslash': u'Ø',
138         u'aring': u'å',
139         u'Aring': u'Å',
140         u'aelig': u'æ',
141         u'AElig': u'Æ',
142         u'thorn': u'þ',
143         u'THORN': u'Þ',
144         u'eth': u'ð',
145         u'ETH': u'Ð',
146         u'mdash': u'—',
147         u'ndash': u'–',
148         u'sect': u'§',
149         u'para': u'¶',
150         u'uarr': u'↑',
151         u'darr': u'↓',
152         u'larr': u'←',
153         u'rarr': u'→',
154         u'dagger': u'†',
155         u'Dagger': u'‡',
156         u'permil': u'‰',
157         u'prod': u'∏',
158         u'infin': u'∞',
159         u'radic': u'√',
160         u'there4': u'∴',
161         u'int': u'∫',
162         u'asymp': u'≈',
163         u'ne': u'≠',
164         u'equiv': '≡',
165         u'le': u'≤',
166         u'ge': u'≥',
167         u'loz': u'⋄',
168         u'sum': u'∑',
169         u'part': u'∂',
170         u'prime': u'′',
171         u'Prime': u'″',
172         u'harr': u'↔',
173         u'micro': u'µ',
174         u'not': u'¬',
175         u'plusmn': u'±',
176         u'divide': u'÷',
177         u'cent': u'¢',
178         u'euro': u'€',
179         }
180
181     blockleveltags = [
182         u'h1',
183         u'h2',
184         u'h3',
185         u'h4',
186         u'h5',
187         u'h6',
188         u'pre',
189         u'p',
190         u'ul',
191         u'ol',
192         u'dl',
193         u'li',
194         u'dt',
195         u'dd',
196         u'div',
197         u'blockquote',
198         ]
199
200     liststarttags = [
201         u'ul',
202         u'ol',
203         u'dl',
204         ]
205
206     cancontainflow = [
207         u'div',
208         u'li',
209         u'dd',
210         u'blockquote',
211     ]
212
213     def __init__(self,textwidth=70):
214         self.text = u''
215         self.curdata = u''
216         self.textwidth = textwidth
217         self.opentags = []
218         self.indentlevel = 0
219         self.ignorenodata = False
220         self.listcount = []
221         self.urls = []
222         self.images = {}
223         HTMLParser.__init__(self)
224
225     def handle_starttag(self, tag, attrs):
226         tag_name = tag.lower()
227         if tag_name in self.blockleveltags:
228             # handle starting a new block - unless we're in a block element
229             # that can contain other blocks, we'll assume that we want to close
230             # the container
231             if len(self.opentags) > 1 and self.opentags[-1] == u'li':
232                 self.handle_curdata()
233
234             if tag_name == u'ol':
235                 self.handle_curdata()
236                 self.listcount.append(1)
237                 self.listlevel = len(self.listcount) - 1
238
239             if tag_name == u'dl':
240                 self.indentlevel = self.indentlevel + 4
241
242             if tag_name in self.liststarttags:
243                 smallist = self.opentags[-3:-1]
244                 smallist.reverse()
245                 for prev_listtag in smallist:
246                     if prev_listtag in [u'dl', u'ol']:
247                         self.indentlevel = self.indentlevel + 4
248                         break
249                     elif prev_listtag == u'ul':
250                         self.indentlevel = self.indentlevel + 3
251                         break
252
253             if len(self.opentags) > 0:
254                 self.handle_curdata()
255                 if tag_name not in self.cancontainflow:
256                     self.opentags.pop()
257             self.opentags.append(tag_name)
258         else:
259             if tag_name == "span":
260                 return
261             listcount = 0
262             try:
263                 listcount = self.listcount[-1]
264             except:
265                 pass
266
267             if tag_name == u'dd' and len(self.opentags) > 1 \
268                 and self.opentags[-1] == u'dt':
269                 self.handle_curdata()
270                 self.opentags.pop()
271             elif tag_name == u'dt' and len(self.opentags) > 1 \
272                 and self.opentags[-1] == u'dd':
273                 self.handle_curdata()
274                 self.opentags.pop()
275             elif tag_name == u'a':
276                 for attr in attrs:
277                     if attr[0].lower() == u'href':
278                         self.urls.append(attr[1].decode('utf-8'))
279                 self.curdata = self.curdata + u'`'
280                 self.opentags.append(tag_name)
281                 return
282             elif tag_name == u'img':
283                 self.handle_image(attrs)
284                 return
285             elif tag_name == u'br':
286                 self.handle_br()
287                 return
288             else:
289                 # we don't know the tag, so lets avoid handling it!
290                 return 
291
292     def handle_startendtag(self, tag, attrs):
293         if tag.lower() == u'br':
294             self.handle_br()
295         elif tag.lower() == u'img':
296             self.handle_image(attrs)
297             return
298
299     def handle_br(self):
300             self.handle_curdata()
301             self.opentags.append(u'br')
302             self.handle_curdata()
303             self.opentags.pop()
304
305     def handle_image(self, attrs):
306         alt = u''
307         url = u''
308         for attr in attrs:
309             if attr[0] == 'alt':
310                 alt = attr[1].decode('utf-8')
311             elif attr[0] == 'src':
312                 url = attr[1].decode('utf-8')
313         if url:
314             if alt:
315                 if self.images.has_key(alt):
316                     if self.images[alt]["url"] == url:
317                         self.curdata = self.curdata \
318                             + u'|%s|' %(alt,)
319                     else:
320                         while self.images.has_key(alt):
321                             alt = alt + "_"
322                         self.images[alt] = {"url": url}
323                         self.curdata = self.curdata \
324                             + u'|%s|' %(alt,)
325                 else:
326                     self.images[alt] = {"url": url}
327                     self.curdata = self.curdata \
328                         + u'|%s|' %(alt,)
329             else:
330                 if self.images.has_key(url):
331                     self.curdata = self.curdata \
332                         + u'|%s|' %(url,)
333                 else:
334                     self.images[url] = {}
335                     self.images[url]["url"] =url
336                     self.curdata = self.curdata \
337                         + u'|%s|' %(url,)
338
    def handle_curdata(self):
        """Render the pending character data for the innermost open tag.

        The text collected in `self.curdata` is formatted according to
        `self.opentags[-1]` and appended to `self.text`.  Nothing happens
        when no tag is open or when there is no pending data.  For
        block-level tags the pending data is cleared afterwards; for other
        tags (such as `a`) it is kept so that it becomes part of the
        enclosing block.
        """

        if len(self.opentags) == 0:
            return

        tag_thats_done = self.opentags[-1]

        if len(self.curdata) == 0:
            return

        # A line break: make sure the output ends in a newline and remember
        # (ignorenodata) that the separation has already been emitted.  The
        # pending data itself is left untouched.
        if tag_thats_done == u'br':
            if len(self.text) == 0 or self.text[-1] != '\n':
                self.text = self.text + '\n'
                self.ignorenodata = True
            return

        # whitespace-only data never produces output
        if len(self.curdata.strip()) == 0:
            return

        # Separate this block from the output produced so far: a single
        # newline before list and definition items, a blank line otherwise.
        # No separator is added when the output is empty or a <br> has just
        # emitted one.
        if tag_thats_done in self.blockleveltags:
            newlinerequired = self.text != u''
            if self.ignorenodata:
                newlinerequired = False
            self.ignorenodata = False
            if newlinerequired:
                if tag_thats_done in [u'dt', u'dd', u'li'] \
                    and len(self.text) > 1 \
                    and self.text[-1] != u'\n':
                        self.text = self.text + u'\n'
                elif len(self.text) > 2 \
                    and self.text[-1] != u'\n' \
                    and self.text[-2] != u'\n':
                    self.text = self.text + u'\n\n'

        if tag_thats_done in ["h1", "h2", "h3", "h4", "h5", "h6"]:
            # Headings: whitespace collapsed, wrapped, then underlined with
            # '=' (h1), '-' (h2) or '~' (h3-h6).
            underline = u''
            underlinechar = u'='
            headingtext = " ".join(self.curdata.split())
            seperator = u'\n' + u' '*self.indentlevel
            headingtext = seperator.join( \
                textwrap.wrap( \
                    headingtext, \
                    self.textwidth - self.indentlevel \
                    ) \
                )

            if tag_thats_done == u'h2':
                underlinechar = u'-'
            elif tag_thats_done != u'h1':
                underlinechar = u'~'

            # a wrapped heading is underlined across the whole available
            # width, a single-line heading only as far as its text
            if u'\n' in headingtext:
                underline = u' ' * self.indentlevel \
                    + underlinechar * (self.textwidth - self.indentlevel)
            else:
                underline = u' ' * self.indentlevel \
                    + underlinechar * len(headingtext)
            self.text = self.text \
                + headingtext + u'\n' \
                + underline
        elif tag_thats_done in [u'p', u'div']:
            # Paragraphs: whitespace collapsed (splitting is done on the
            # UTF-8 encoded bytes), then wrapped and indented.
            paragraph = unicode( \
                " ".join(self.curdata.strip().encode("utf-8").split()), \
                "utf-8")
            seperator = u'\n' + u' ' * self.indentlevel
            self.text = self.text \
                + u' ' * self.indentlevel \
                + seperator.join( \
                    textwrap.wrap( \
                        paragraph, self.textwidth - self.indentlevel))
        elif tag_thats_done == "pre":
            # preformatted text is copied through unchanged
            self.text = self.text + unicode( \
                self.curdata.encode("utf-8"), "utf-8")
        elif tag_thats_done == u'blockquote':
            # Quotes start on a fresh line; every line gets four extra
            # spaces, the first line without the current indent.
            quote = unicode( \
                " ".join(self.curdata.encode("utf-8").strip().split()), \
                "utf-8")
            seperator = u'\n' + u' ' * self.indentlevel + u'    '
            if len(self.text) > 0 and self.text[-1] != u'\n':
                self.text = self.text + u'\n'
            self.text = self.text \
                + u'    ' \
                + seperator.join( \
                    textwrap.wrap( \
                        quote, \
                        self.textwidth - self.indentlevel - 2 \
                    )
                )
            self.curdata = u''
        elif tag_thats_done == "li":
            item = unicode(self.curdata.encode("utf-8").strip(), "utf-8")
            if len(self.text) > 0 and self.text[-1] != u'\n':
                self.text = self.text + u'\n'
            # work out if we're in an ol rather than a ul
            # (only the four innermost open tags are inspected; isul stays
            # None when neither list tag is found among them)
            latesttags = self.opentags[-4:]
            latesttags.reverse()
            isul = None
            for thing in latesttags:
                if thing == 'ul':
                    isul = True
                    break
                elif thing == 'ol':
                    isul = False
                    break

            # continuation lines are indented by 3 columns for an unordered
            # list and by 4 otherwise
            listindent = 3
            if not isul:
                listindent = 4

            # ordered items are numbered from the counter of the innermost
            # <ol>, which is advanced for the next item
            listmarker = u' * '
            if isul == False:
                listmarker = u' %2d. ' %(self.listcount[-1])
                self.listcount[-1] = self.listcount[-1] + 1

            seperator = u'\n' \
                + u' ' * self.indentlevel \
                + u' ' * listindent
            self.text = self.text \
                + u' ' * self.indentlevel \
                + listmarker \
                + seperator.join( \
                    textwrap.wrap( \
                        item, \
                        self.textwidth - self.indentlevel - listindent \
                    ) \
                )
            self.curdata = u''
        elif tag_thats_done == u'dt':
            # Definition terms: rendered one indent step (4 columns) to the
            # left of the current indent and terminated with "::".
            definition = unicode(" ".join( \
                    self.curdata.encode("utf-8").strip().split()), \
                "utf-8")
            if len(self.text) > 0 and self.text[-1] != u'\n':
                self.text = self.text + u'\n\n'
            elif len(self.text) > 1 and self.text[-2] != u'\n':
                self.text = self.text + u'\n'
            definition = u' ' * (self.indentlevel - 4) + definition + "::"
            indentstring = u'\n' + u' ' * (self.indentlevel - 3)
            self.text = self.text \
                + indentstring.join(
                    textwrap.wrap(definition, \
                        self.textwidth - self.indentlevel - 4))
            self.curdata = u''
        elif tag_thats_done == u'dd':
            # Definition bodies: start on a new line at the current indent.
            definition = unicode(" ".join( \
                    self.curdata.encode("utf-8").strip().split()),
                "utf-8")
            if len(definition) > 0:
                if len(self.text) > 0 and self.text[-1] != u'\n':
                    self.text = self.text + u'\n'
                indentstring = u'\n' + u' ' * self.indentlevel
                self.text = self.text \
                    + indentstring \
                    + indentstring.join( \
                        textwrap.wrap( \
                            definition, \
                            self.textwidth - self.indentlevel \
                            ) \
                        )
                self.curdata = u''
        elif tag_thats_done == u'a':
            # Links: close the markup opened by handle_starttag.  The data
            # stays in curdata ('a' is not a block-level tag) so it is
            # rendered with the enclosing block.
            self.curdata = self.curdata + u'`__'
            pass
        elif tag_thats_done in self.liststarttags:
            pass

        # block-level data has now been rendered
        if tag_thats_done in self.blockleveltags:
            self.curdata = u''

        self.ignorenodata = False
508
509     def handle_endtag(self, tag):
510         self.ignorenodata = False
511         if tag == "span":
512             return
513
514         try:
515             tagindex = self.opentags.index(tag)
516         except:
517             return
518         tag = tag.lower()
519
520         if tag in [u'br', u'img']:
521             return
522
523         if tag == u'dl':
524             self.indentlevel = self.indentlevel - 4
525
526         if tag in self.liststarttags:
527             if tag in [u'ol', u'dl', u'ul', u'dd']:
528                 self.handle_curdata()
529                 # find if there was a previous list level
530                 smalllist = self.opentags[:-1]
531                 smalllist.reverse()
532                 for prev_listtag in smalllist:
533                     if prev_listtag in [u'ol', u'dl']:
534                         self.indentlevel = self.indentlevel - 4
535                         break
536                     elif prev_listtag == u'ul':
537                         self.indentlevel = self.indentlevel - 3
538                         break
539
540         if tag == u'ol':
541             self.listcount = self.listcount[:-1]
542
543         while tagindex < len(self.opentags) \
544             and tag in self.opentags[tagindex+1:]:
545             try:
546                 tagindex = self.opentags.index(tag, tagindex+1)
547             except:
548                 # well, we don't want to do that then
549                 pass
550         if tagindex != len(self.opentags) - 1:
551             # Assuming the data was for the last opened tag first
552             self.handle_curdata()
553             # Now kill the list to be a slice before this tag was opened
554             self.opentags = self.opentags[:tagindex + 1]
555         else:
556             self.handle_curdata()
557             if self.opentags[-1] == tag:
558                 self.opentags.pop()
559
560     def handle_data(self, data):
561         if len(self.opentags) == 0:
562             self.opentags.append(u'p')
563         self.curdata = self.curdata + data.decode("utf-8")
564
565     def handle_charref(self, name):
566         try:
567             entity = unichr(int(name))
568         except:
569             if name[0] == 'x':
570                 try:
571                     entity = unichr(int('0%s' %(name,), 16))
572                 except:
573                     entity = u'#%s' %(name,)
574             else:
575                 entity = u'#%s' %(name,)
576         self.curdata = self.curdata + unicode(entity.encode('utf-8'), \
577             "utf-8")
578
579     def handle_entityref(self, name):
580         entity = name
581         if HTML2Text.entities.has_key(name):
582             entity = HTML2Text.entities[name]
583         else:
584             entity = "&" + name + ";"
585
586         self.curdata = self.curdata + unicode(entity.encode('utf-8'), \
587             "utf-8")
588
589     def gettext(self):
590         self.handle_curdata()
591         if len(self.text) == 0 or self.text[-1] != u'\n':
592             self.text = self.text + u'\n'
593         self.opentags = []
594         if len(self.text) > 0:
595             while len(self.text) > 1 and self.text[-1] == u'\n':
596                 self.text = self.text[:-1]
597             self.text = self.text + u'\n'
598         if len(self.urls) > 0:
599             self.text = self.text + u'\n__ ' + u'\n__ '.join(self.urls) + u'\n'
600             self.urls = []
601         if len(self.images.keys()) > 0:
602             self.text = self.text + u'\n.. ' \
603                 + u'\n.. '.join( \
604                     ["|%s| image:: %s" %(a, self.images[a]["url"]) \
605                 for a in self.images.keys()]) + u'\n'
606             self.images = {}
607         return self.text
608
def open_url(method, url):
    """Issue an HTTP(S) request and return the response, or None on failure.

    `method` is the HTTP verb and `url` an absolute http:// or https:// URL.
    At most three requests are made.  A 301, 302, 303 or 307 answer makes
    the next attempt go to the URL named in its Location header; any other
    non-200 answer, or a network error, is simply retried.  The first
    response with status 200 is returned with its connection still open so
    that the caller can read the body.
    """
    import urlparse  # only needed here, to resolve relative redirects

    attempts = 0
    while attempts < 3:
        attempts = attempts + 1

        (scheme, rest) = urllib.splittype(url)
        (host, path) = urllib.splithost(rest)
        if not host:
            # not an absolute URL - there is nothing to connect to
            return None
        (host, port) = urllib.splitport(host)

        if scheme == "https":
            # absent when Python was built without SSL support
            connection_class = getattr(httplib, "HTTPSConnection", None)
            if connection_class is None:
                return None
        else:
            connection_class = httplib.HTTPConnection

        conn = None
        try:
            # without an explicit port the connection class supplies the
            # default for its scheme (80 or 443)
            if port is None:
                conn = connection_class(host)
            else:
                conn = connection_class(host, int(port))
            # "http://host" has an empty path; request the root instead
            conn.request(method, path or "/")
            response = conn.getresponse()
            if response.status == 200:
                return response
            if response.status in [301, 302, 303, 307]:
                location = response.getheader("location")
                if location:
                    # Location may be relative to the URL just requested
                    url = urlparse.urljoin(url, location)
            conn.close()
        except (httplib.HTTPException, socket.error):
            # best effort: drop this connection and try again
            if conn is not None:
                conn.close()
    return None
632
# Response headers used to decide whether a feed changed since the last run.
_FEED_VALIDATORS = ["content-md5", "etag", "last-modified", "content-length"]


def _random_token(length):
    """Return `length` random ASCII letters and digits."""
    alphabet = string.ascii_letters + string.digits
    return "".join([random.choice(alphabet) for unused in range(0, length)])


def _feed_changed(cached, headers):
    """Tell whether a feed has to be fetched again.

    `cached` is the parsed validator record of the previous run and
    `headers` the (name, value) list of a fresh HEAD response, or None when
    that request failed.  The feed counts as unchanged only when at least
    one validator header is present and every one present matches.
    """
    if headers is None:
        return True
    compared = False
    for (name, value) in headers:
        if name not in _FEED_VALIDATORS:
            continue
        if name not in cached or cached[name][0] != value:
            return True
        compared = True
    # no validator at all: we cannot tell, so fetch to be safe (already
    # delivered items are filtered out through the "seen" database)
    return not compared


def _deliver_item(maildir, url, item, db):
    """Write one feed item into `maildir` unless it was delivered before.

    `db` is the open "seen" database.  It maps "<feed url>|<item link>" and
    "<feed url>|<item guid>" to a url-encoded record holding the message-id,
    the creation date and the md5 of the content that was delivered.
    """
    # need to work out what the content is first...
    if item.has_key("content"):
        content = item["content"][0]["value"]
    elif item.has_key("description"):
        content = item["description"]
    else:
        content = u''

    md5sum = md5.md5(content.encode("utf-8")).hexdigest()

    prevmessageid = None

    db_guid_key = None
    db_link_key = (url + u'|' + item["link"]).encode("utf-8")

    # check if there's a guid too - if that exists and we match the md5,
    # the item has been delivered already
    if item.has_key("guid"):
        db_guid_key = (url + u'|' + item["guid"]).encode("utf-8")
        if db.has_key(db_guid_key):
            data = cgi.parse_qs(db[db_guid_key])
            if data["contentmd5"][0] == md5sum:
                return

    if db.has_key(db_link_key):
        data = cgi.parse_qs(db[db_link_key])
        if data.has_key("message-id"):
            prevmessageid = data["message-id"][0]
        if data["contentmd5"][0] == md5sum:
            return

    try:
        author = item["author"]
    except KeyError:
        author = url

    # create a basic email message
    msg = MIMEMultipart("alternative")
    messageid = "<" \
        + datetime.datetime.now().strftime("%Y%m%d%H%M") \
        + "." + _random_token(6) \
        + "@" + socket.gethostname() + ">"
    msg.add_header("Message-ID", messageid)
    msg.set_unixfrom("\"%s\" <rss2maildir@localhost>" %(url))
    msg.add_header("From", "\"%s\" <rss2maildir@localhost>" %(author))
    msg.add_header("To", "\"%s\" <rss2maildir@localhost>" %(url))
    if prevmessageid:
        # thread an updated item below the message sent for it earlier
        msg.add_header("References", prevmessageid)

    rundate = datetime.datetime.now().strftime("%a, %e %b %Y %T -0000")
    createddate = rundate
    try:
        createddate = datetime.datetime(*item["updated_parsed"][0:6]) \
            .strftime("%a, %e %b %Y %T -0000")
    except (KeyError, TypeError, ValueError):
        # the item carries no usable date - keep the delivery time
        pass
    msg.add_header("Date", createddate)
    msg.add_header("X-rss2maildir-rundate", rundate)

    subj_gen = HTML2Text()
    title = item["title"]
    title = re.sub(u'<', u'&lt;', title)
    title = re.sub(u'>', u'&gt;', title)
    subj_gen.feed(title.encode("utf-8"))
    # gettext() wraps long titles and always ends in a newline; a header
    # value has to be a single line
    msg.add_header("Subject", u" ".join(subj_gen.gettext().split()))
    msg.set_default_type("text/plain")

    # the link comes from the feed, so escape it before building markup
    escapedlink = cgi.escape(item["link"], True)
    htmlcontent = u"%s\n\n<p>Item URL: <a href=\"%s\">%s</a></p>" %( \
        content, \
        escapedlink, \
        escapedlink )
    htmlpart = MIMEText(htmlcontent.encode("utf-8"), "html", "utf-8")
    textparser = HTML2Text()
    textparser.feed(content.encode("utf-8"))
    textcontent = textparser.gettext()
    textcontent = "%s\n\nItem URL: %s" %( \
        textcontent, \
        item["link"] )
    textpart = MIMEText(textcontent.encode("utf-8"), "plain", "utf-8")
    msg.attach(textpart)
    msg.attach(htmlpart)

    # start by working out the filename we should be writing to, we do
    # this following the normal maildir style rules
    fname = str(os.getpid()) \
        + "." + socket.gethostname() \
        + "." + _random_token(10) \
        + "." + datetime.datetime.now().strftime('%s')
    fn = os.path.join(maildir, "tmp", fname)
    fh = open(fn, "w")
    try:
        fh.write(msg.as_string())
    finally:
        fh.close()
    # now move it in to the new directory
    newfn = os.path.join(maildir, "new", fname)
    os.link(fn, newfn)
    os.unlink(fn)

    # now add to the database about the item
    if prevmessageid:
        messageid = prevmessageid + " " + messageid
    record = urllib.urlencode(( \
        ("message-id", messageid), \
        ("created", createddate), \
        ("contentmd5", md5sum) \
        ))
    if item.has_key("guid") and item["guid"] != item["link"]:
        db[db_guid_key] = record
        # an existing link record keeps its creation date and checksum and
        # only gets the new message-id
        link_record = record
        if db.has_key(db_link_key):
            previous = cgi.parse_qs(db[db_link_key])
            try:
                link_record = urllib.urlencode(( \
                    ("message-id", messageid), \
                    ("created", previous["created"][0]), \
                    ("contentmd5", previous["contentmd5"][0]) \
                    ))
            except KeyError:
                # incomplete old record: replace it with the fresh one
                pass
        db[db_link_key] = link_record
    else:
        db[db_link_key] = record


def parse_and_deliver(maildir, url, statedir):
    """Fetch the feed at `url` and deliver its new items into `maildir`.

    `statedir` holds two dbm databases: "feeds" remembers the validator
    headers of every feed so that an unchanged feed is not downloaded
    again, and "seen" remembers which items have been delivered.  Nothing
    is returned; a feed that cannot be fetched is reported on stderr.
    """
    feeddb = dbm.open(os.path.join(statedir, "feeds"), "c")
    try:
        # first check if we know about this feed already
        if feeddb.has_key(url):
            cached = cgi.parse_qs(feeddb[url])
            response = open_url("HEAD", url)
            headers = None
            if response is not None:
                headers = response.getheaders()
            if not _feed_changed(cached, headers):
                return # don't need to do anything, nothings changed.

        response = open_url("GET", url)
        if response is None:
            sys.stderr.write("Failed to fetch feed: %s\n" %(url))
            return
        headers = response.getheaders()

        fp = feedparser.parse(response)
        db = dbm.open(os.path.join(statedir, "seen"), "c")
        try:
            for item in fp["items"]:
                _deliver_item(maildir, url, item, db)
        finally:
            db.close()

        # remember the validators for the next run
        if headers:
            data = []
            for header in headers:
                if header[0] in _FEED_VALIDATORS:
                    data.append((header[0], header[1]))
            if len(data) > 0:
                feeddb[url] = urllib.urlencode(data)
    finally:
        feeddb.close()
835
def _find_config_file(requested):
    """Return the path of the configuration file to use, or exit.

    requested is the value of the -c/--conf option (None when the option
    was not given).  An explicitly requested file must exist.  Otherwise
    $HOME/.rss2maildir.conf is tried first (skipped when HOME is unset),
    then /etc/rss2maildir.conf.  When no file is found a message is written
    to stderr and the process exits with status 2.
    """
    if requested is not None:
        if os.path.exists(requested):
            return requested
        sys.stderr.write(
            "Config file %s does not exist. Exiting.\n" % (requested,))
        sys.exit(2)

    # check through the default locations
    candidates = []
    home = os.environ.get("HOME")
    if home is not None:
        candidates.append("%s/.rss2maildir.conf" % (home,))
    candidates.append("/etc/rss2maildir.conf")

    for candidate in candidates:
        if os.path.exists(candidate):
            return candidate

    sys.stderr.write("No config file found. Exiting.\n")
    sys.exit(2)


def _ensure_directory(path, not_dir_format, create_failed_format):
    """Make sure path is a directory, creating it when it is missing.

    not_dir_format and create_failed_format are "%s"-style templates that
    receive path.  If path exists but is not a directory, or if it is
    missing and cannot be created, the matching message is written to
    stderr and the process exits with status 1.
    """
    try:
        mode = os.stat(path)[stat.ST_MODE]
    except OSError:
        # Nothing there (or not reachable): try to make the directory.
        try:
            os.mkdir(path)
        except OSError:
            sys.stderr.write(create_failed_format % (path,))
            sys.exit(1)
        return

    # Checked outside the try block, and only OSError is caught above, so
    # this exit can never be swallowed and turned into a pointless mkdir.
    if not stat.S_ISDIR(mode):
        sys.stderr.write(not_dir_format % (path,))
        sys.exit(1)


def _prepare_maildir(maildir, section):
    """Create maildir and its new/cur/tmp sub-directories where missing.

    section is only used in error messages.  Failing to create a directory
    is fatal (exit status 1).  Finding something that is not a directory
    where one is required is only reported as "Broken maildir" on stderr;
    the caller carries on, as it always has.
    """
    try:
        mode = os.stat(maildir)[stat.ST_MODE]
    except OSError:
        try:
            os.mkdir(maildir)
        except OSError:
            sys.stderr.write(
                "Couldn't create root maildir %s\n" % (maildir,))
            sys.exit(1)
    else:
        if not stat.S_ISDIR(mode):
            # Nothing can be created underneath a non-directory.
            sys.stderr.write("Broken maildir: %s\n" % (maildir,))
            return

    # check if there's a new, cur and tmp directory
    for subdir in ("new", "cur", "tmp"):
        path = os.path.join(maildir, subdir)
        try:
            mode = os.stat(path)[stat.ST_MODE]
        except OSError:
            try:
                os.mkdir(path)
            except OSError:
                sys.stderr.write(
                    "Couldn't create required maildir directories for %s\n"
                    % (section,))
                sys.exit(1)
        else:
            # The entry exists: it has to be a directory to be usable.
            if not stat.S_ISDIR(mode):
                sys.stderr.write("Broken maildir: %s\n" % (maildir,))


if __name__ == "__main__":
    # This only gets executed if we really called the program
    # first off, parse the command line arguments

    oparser = OptionParser()
    oparser.add_option(
        "-c", "--conf", dest="conf",
        help="location of config file"
        )
    oparser.add_option(
        "-s", "--statedir", dest="statedir",
        help="location of directory to store state in"
        )

    (options, args) = oparser.parse_args()

    # check for the configfile (exits with status 2 when there is none)
    configfile = _find_config_file(options.conf)

    # Right - if we've got this far, we've got a config file, now for the hard
    # bits...

    scp = SafeConfigParser()
    scp.read(configfile)

    maildir_root = "RSSMaildir"
    state_dir = "state"

    # The state directory comes from, in order of preference: the command
    # line, the [general] section of the config file, the built-in default.
    if options.statedir is not None:
        state_dir = options.statedir
        _ensure_directory(
            state_dir,
            "State directory (%s) is not a directory\n",
            "Couldn't create statedir %s\n")
    elif scp.has_option("general", "state_dir"):
        state_dir = scp.get("general", "state_dir")
        _ensure_directory(
            state_dir,
            "State directory (%s) is not a directory\n",
            "Couldn't create state directory %s\n")
    else:
        _ensure_directory(
            state_dir,
            "State directory %s is not a directory\n",
            "State directory %s could not be created\n")

    if scp.has_option("general", "maildir_root"):
        maildir_root = scp.get("general", "maildir_root")

    _ensure_directory(
        maildir_root,
        "Maildir Root %s is not a directory\n",
        "Couldn't create Maildir Root %s\n")

    # Every section except [general] describes one feed.
    feeds = scp.sections()
    if "general" in feeds:
        feeds.remove("general")

    for section in feeds:
        # The folder name defaults to the section name itself.
        if scp.has_option(section, "maildir"):
            maildir = scp.get(section, "maildir")
        else:
            maildir = section

        # urlencode() quotes both key and value, so the only literal "="
        # left is the separator: field 1 is the quoted folder name, which
        # is safe to use as a single path component.
        maildir = urllib.urlencode(((section, maildir),)).split("=")[1]
        maildir = os.path.join(maildir_root, maildir)

        _prepare_maildir(maildir, section)

        # right - we've got the directories, we've got the section, we know the
        # url... lets play!

        parse_and_deliver(maildir, section, state_dir)