b4acdec086d924bb9175c51dca551caf9ab07913
[rss2maildir.git] / rss2maildir.py
1 #!/usr/bin/python
2 # coding=utf-8
3
4 # rss2maildir.py - RSS feeds to Maildir 1 email per item
5 # Copyright (C) 2007  Brett Parker <iDunno@sommitrealweird.co.uk>
6
7 # This program is free software: you can redistribute it and/or modify
8 # it under the terms of the GNU General Public License as published by
9 # the Free Software Foundation, either version 3 of the License, or
10 # (at your option) any later version.
11
12 # This program is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 # GNU General Public License for more details.
16
17 # You should have received a copy of the GNU General Public License
18 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
19
20 import sys
21 import os
22 import stat
23 import httplib
24 import urllib
25
26 import feedparser
27
28 from email.MIMEMultipart import MIMEMultipart
29 from email.MIMEText import MIMEText
30
31 import datetime
32 import random
33 import string
34 import textwrap
35
36 import socket
37
38 from optparse import OptionParser
39 from ConfigParser import SafeConfigParser
40
41 from base64 import b64encode
42
43 if sys.version_info[0] == 2 and sys.version_info[1] >= 6:
44     import hashlib as md5
45 else:
46     import md5
47
48 import cgi
49 import dbm
50
51 import re
52
53 from HTMLParser import HTMLParser
54
class HTML2Text(HTMLParser):
    """Render an HTML fragment as wrapped plain text.

    Feed markup with ``feed()`` and collect the result with ``gettext()``.
    Block-level tags are laid out as headings, paragraphs, lists and
    definition lists wrapped to ``textwidth`` columns.  Links are written as
    ```text`__`` with their targets listed after the body, and images as
    ``|alt|`` substitutions with matching ``image::`` lines.
    """

    # Named character references understood by handle_entityref().  Every
    # value must be a unicode string: the replacement text is concatenated
    # onto the unicode data buffer.
    entities = {
        u'amp': u'&',
        u'lt': u'<',
        u'gt': u'>',
        u'pound': u'£',
        u'copy': u'©',
        u'apos': u'\'',
        u'quot': u'"',
        u'nbsp': u' ',
        u'ldquo': u'“',
        u'rdquo': u'”',
        u'lsquo': u'‘',
        u'rsquo': u'’',
        u'laquo': u'«',
        u'raquo': u'»',
        u'lsaquo': u'‹',
        u'rsaquo': u'›',
        u'bull': u'•',
        u'middot': u'·',
        u'deg': u'°',
        u'hellip': u'…',
        u'trade': u'™',
        u'reg': u'®',
        u'agrave': u'à',
        u'Agrave': u'À',
        u'egrave': u'è',
        u'Egrave': u'È',
        u'igrave': u'ì',
        u'Igrave': u'Ì',
        u'ograve': u'ò',
        u'Ograve': u'Ò',
        u'ugrave': u'ù',
        u'Ugrave': u'Ù',
        u'aacute': u'á',
        u'Aacute': u'Á',
        u'eacute': u'é',
        u'Eacute': u'É',
        u'iacute': u'í',
        u'Iacute': u'Í',
        u'oacute': u'ó',
        u'Oacute': u'Ó',
        u'uacute': u'ú',
        u'Uacute': u'Ú',
        u'yacute': u'ý',
        u'Yacute': u'Ý',
        u'acirc': u'â',
        u'Acirc': u'Â',
        u'ecirc': u'ê',
        u'Ecirc': u'Ê',
        u'icirc': u'î',
        u'Icirc': u'Î',
        u'ocirc': u'ô',
        u'Ocirc': u'Ô',
        u'ucirc': u'û',
        u'Ucirc': u'Û',
        u'atilde': u'ã',
        u'Atilde': u'Ã',
        u'ntilde': u'ñ',
        u'Ntilde': u'Ñ',
        u'otilde': u'õ',
        u'Otilde': u'Õ',
        u'auml': u'ä',
        u'Auml': u'Ä',
        u'euml': u'ë',
        u'Euml': u'Ë',
        u'iuml': u'ï',
        u'Iuml': u'Ï',
        u'ouml': u'ö',
        u'Ouml': u'Ö',
        u'uuml': u'ü',
        u'Uuml': u'Ü',
        u'yuml': u'ÿ',
        u'Yuml': u'Ÿ',
        u'iexcl': u'¡',
        u'iquest': u'¿',
        u'ccedil': u'ç',
        u'Ccedil': u'Ç',
        u'oelig': u'œ',
        u'OElig': u'Œ',
        u'szlig': u'ß',
        u'oslash': u'ø',
        u'Oslash': u'Ø',
        u'aring': u'å',
        u'Aring': u'Å',
        u'aelig': u'æ',
        u'AElig': u'Æ',
        u'thorn': u'þ',
        u'THORN': u'Þ',
        u'eth': u'ð',
        u'ETH': u'Ð',
        u'mdash': u'—',
        u'ndash': u'–',
        u'sect': u'§',
        u'para': u'¶',
        u'uarr': u'↑',
        u'darr': u'↓',
        u'larr': u'←',
        u'rarr': u'→',
        u'dagger': u'†',
        u'Dagger': u'‡',
        u'permil': u'‰',
        u'prod': u'∏',
        u'infin': u'∞',
        u'radic': u'√',
        u'there4': u'∴',
        u'int': u'∫',
        u'asymp': u'≈',
        u'ne': u'≠',
        u'equiv': u'≡',
        u'le': u'≤',
        u'ge': u'≥',
        u'loz': u'⋄',
        u'sum': u'∑',
        u'part': u'∂',
        u'prime': u'′',
        u'Prime': u'″',
        u'harr': u'↔',
        u'micro': u'µ',
        u'not': u'¬',
        u'plusmn': u'±',
        u'divide': u'÷',
        u'cent': u'¢',
        u'euro': u'€',
        }
180
181     blockleveltags = [
182         u'h1',
183         u'h2',
184         u'h3',
185         u'h4',
186         u'h5',
187         u'h6',
188         u'pre',
189         u'p',
190         u'ul',
191         u'ol',
192         u'dl',
193         u'li',
194         u'dt',
195         u'dd',
196         u'div',
197         u'blockquote',
198         ]
199
200     liststarttags = [
201         u'ul',
202         u'ol',
203         u'dl',
204         ]
205
206     cancontainflow = [
207         u'div',
208         u'li',
209         u'dd',
210         u'blockquote',
211     ]
212
213     def __init__(self,textwidth=70):
214         self.text = u''
215         self.curdata = u''
216         self.textwidth = textwidth
217         self.opentags = []
218         self.indentlevel = 0
219         self.ignorenodata = False
220         self.listcount = []
221         self.urls = []
222         self.images = {}
223         HTMLParser.__init__(self)
224
225     def handle_starttag(self, tag, attrs):
226         tag_name = tag.lower()
227         if tag_name in self.blockleveltags:
228             # handle starting a new block - unless we're in a block element
229             # that can contain other blocks, we'll assume that we want to close
230             # the container
231             if len(self.opentags) > 1 and self.opentags[-1] == u'li':
232                 self.handle_curdata()
233
234             if tag_name == u'ol':
235                 self.handle_curdata()
236                 self.listcount.append(1)
237                 self.listlevel = len(self.listcount) - 1
238
239             if tag_name == u'dl':
240                 self.indentlevel = self.indentlevel + 4
241
242             if tag_name in self.liststarttags:
243                 smallist = self.opentags[-3:-1]
244                 smallist.reverse()
245                 for prev_listtag in smallist:
246                     if prev_listtag in [u'dl', u'ol']:
247                         self.indentlevel = self.indentlevel + 4
248                         break
249                     elif prev_listtag == u'ul':
250                         self.indentlevel = self.indentlevel + 3
251                         break
252
253             if len(self.opentags) > 0:
254                 self.handle_curdata()
255                 if tag_name not in self.cancontainflow:
256                     self.opentags.pop()
257             self.opentags.append(tag_name)
258         else:
259             if tag_name == "span":
260                 return
261             listcount = 0
262             try:
263                 listcount = self.listcount[-1]
264             except:
265                 pass
266
267             if tag_name == u'dd' and len(self.opentags) > 1 \
268                 and self.opentags[-1] == u'dt':
269                 self.handle_curdata()
270                 self.opentags.pop()
271             elif tag_name == u'dt' and len(self.opentags) > 1 \
272                 and self.opentags[-1] == u'dd':
273                 self.handle_curdata()
274                 self.opentags.pop()
275             elif tag_name == u'a':
276                 for attr in attrs:
277                     if attr[0].lower() == u'href':
278                         self.urls.append(attr[1].decode('utf-8'))
279                 self.curdata = self.curdata + u'`'
280                 self.opentags.append(tag_name)
281                 return
282             elif tag_name == u'img':
283                 self.handle_image(attrs)
284                 return
285             elif tag_name == u'br':
286                 self.handle_br()
287                 return
288             else:
289                 # we don't know the tag, so lets avoid handling it!
290                 return 
291
292     def handle_startendtag(self, tag, attrs):
293         if tag.lower() == u'br':
294             self.handle_br()
295         elif tag.lower() == u'img':
296             self.handle_image(attrs)
297             return
298
299     def handle_br(self):
300             self.handle_curdata()
301             self.opentags.append(u'br')
302             self.handle_curdata()
303             self.opentags.pop()
304
305     def handle_image(self, attrs):
306         alt = u''
307         url = u''
308         for attr in attrs:
309             if attr[0] == 'alt':
310                 alt = attr[1].decode('utf-8')
311             elif attr[0] == 'src':
312                 url = attr[1].decode('utf-8')
313         if url:
314             if alt:
315                 if self.images.has_key(alt):
316                     if self.images[alt]["url"] == url:
317                         self.curdata = self.curdata \
318                             + u'|%s|' %(alt,)
319                     else:
320                         while self.images.has_key(alt):
321                             alt = alt + "_"
322                         self.images[alt] = {"url": url}
323                         self.curdata = self.curdata \
324                             + u'|%s|' %(alt,)
325                 else:
326                     self.images[alt] = {"url": url}
327                     self.curdata = self.curdata \
328                         + u'|%s|' %(alt,)
329             else:
330                 if self.images.has_key(url):
331                     self.curdata = self.curdata \
332                         + u'|%s|' %(url,)
333                 else:
334                     self.images[url] = {}
335                     self.images[url]["url"] =url
336                     self.curdata = self.curdata \
337                         + u'|%s|' %(url,)
338
339     def handle_curdata(self):
340
341         if len(self.opentags) == 0:
342             return
343
344         tag_thats_done = self.opentags[-1]
345
346         if len(self.curdata) == 0:
347             return
348
349         if tag_thats_done == u'br':
350             if len(self.text) == 0 or self.text[-1] != '\n':
351                 self.text = self.text + '\n'
352                 self.ignorenodata = True
353             return
354
355         if len(self.curdata.strip()) == 0:
356             return
357
358         if tag_thats_done in self.blockleveltags:
359             newlinerequired = self.text != u''
360             if self.ignorenodata:
361                 newlinerequired = False
362             self.ignorenodata = False
363             if newlinerequired:
364                 if tag_thats_done in [u'dt', u'dd', u'li'] \
365                     and len(self.text) > 1 \
366                     and self.text[-1] != u'\n':
367                         self.text = self.text + u'\n'
368                 elif len(self.text) > 2 \
369                     and self.text[-1] != u'\n' \
370                     and self.text[-2] != u'\n':
371                     self.text = self.text + u'\n\n'
372
373         if tag_thats_done in ["h1", "h2", "h3", "h4", "h5", "h6"]:
374             underline = u''
375             underlinechar = u'='
376             headingtext = " ".join(self.curdata.split())
377             seperator = u'\n' + u' '*self.indentlevel
378             headingtext = seperator.join( \
379                 textwrap.wrap( \
380                     headingtext, \
381                     self.textwidth - self.indentlevel \
382                     ) \
383                 )
384
385             if tag_thats_done == u'h2':
386                 underlinechar = u'-'
387             elif tag_thats_done != u'h1':
388                 underlinechar = u'~'
389
390             if u'\n' in headingtext:
391                 underline = u' ' * self.indentlevel \
392                     + underlinechar * (self.textwidth - self.indentlevel)
393             else:
394                 underline = u' ' * self.indentlevel \
395                     + underlinechar * len(headingtext)
396             self.text = self.text \
397                 + headingtext + u'\n' \
398                 + underline
399         elif tag_thats_done in [u'p', u'div']:
400             paragraph = unicode( \
401                 " ".join(self.curdata.strip().encode("utf-8").split()), \
402                 "utf-8")
403             seperator = u'\n' + u' ' * self.indentlevel
404             self.text = self.text \
405                 + u' ' * self.indentlevel \
406                 + seperator.join( \
407                     textwrap.wrap( \
408                         paragraph, self.textwidth - self.indentlevel))
409         elif tag_thats_done == "pre":
410             self.text = self.text + unicode( \
411                 self.curdata.encode("utf-8"), "utf-8")
412         elif tag_thats_done == u'blockquote':
413             quote = unicode( \
414                 " ".join(self.curdata.encode("utf-8").strip().split()), \
415                 "utf-8")
416             seperator = u'\n' + u' ' * self.indentlevel + u'    '
417             if len(self.text) > 0 and self.text[-1] != u'\n':
418                 self.text = self.text + u'\n'
419             self.text = self.text \
420                 + u'    ' \
421                 + seperator.join( \
422                     textwrap.wrap( \
423                         quote, \
424                         self.textwidth - self.indentlevel - 2 \
425                     )
426                 )
427             self.curdata = u''
428         elif tag_thats_done == "li":
429             item = unicode(self.curdata.encode("utf-8").strip(), "utf-8")
430             if len(self.text) > 0 and self.text[-1] != u'\n':
431                 self.text = self.text + u'\n'
432             # work out if we're in an ol rather than a ul
433             latesttags = self.opentags[-4:]
434             latesttags.reverse()
435             isul = None
436             for thing in latesttags:
437                 if thing == 'ul':
438                     isul = True
439                     break
440                 elif thing == 'ol':
441                     isul = False
442                     break
443
444             listindent = 3
445             if not isul:
446                 listindent = 4
447
448             listmarker = u' * '
449             if isul == False:
450                 listmarker = u' %2d. ' %(self.listcount[-1])
451                 self.listcount[-1] = self.listcount[-1] + 1
452
453             seperator = u'\n' \
454                 + u' ' * self.indentlevel \
455                 + u' ' * listindent
456             self.text = self.text \
457                 + u' ' * self.indentlevel \
458                 + listmarker \
459                 + seperator.join( \
460                     textwrap.wrap( \
461                         item, \
462                         self.textwidth - self.indentlevel - listindent \
463                     ) \
464                 )
465             self.curdata = u''
466         elif tag_thats_done == u'dt':
467             definition = unicode(" ".join( \
468                     self.curdata.encode("utf-8").strip().split()), \
469                 "utf-8")
470             if len(self.text) > 0 and self.text[-1] != u'\n':
471                 self.text = self.text + u'\n\n'
472             elif len(self.text) > 1 and self.text[-2] != u'\n':
473                 self.text = self.text + u'\n'
474             definition = u' ' * (self.indentlevel - 4) + definition + "::"
475             indentstring = u'\n' + u' ' * (self.indentlevel - 3)
476             self.text = self.text \
477                 + indentstring.join(
478                     textwrap.wrap(definition, \
479                         self.textwidth - self.indentlevel - 4))
480             self.curdata = u''
481         elif tag_thats_done == u'dd':
482             definition = unicode(" ".join( \
483                     self.curdata.encode("utf-8").strip().split()),
484                 "utf-8")
485             if len(definition) > 0:
486                 if len(self.text) > 0 and self.text[-1] != u'\n':
487                     self.text = self.text + u'\n'
488                 indentstring = u'\n' + u' ' * self.indentlevel
489                 self.text = self.text \
490                     + indentstring \
491                     + indentstring.join( \
492                         textwrap.wrap( \
493                             definition, \
494                             self.textwidth - self.indentlevel \
495                             ) \
496                         )
497                 self.curdata = u''
498         elif tag_thats_done == u'a':
499             self.curdata = self.curdata + u'`__'
500             pass
501         elif tag_thats_done in self.liststarttags:
502             pass
503
504         if tag_thats_done in self.blockleveltags:
505             self.curdata = u''
506
507         self.ignorenodata = False
508
509     def handle_endtag(self, tag):
510         self.ignorenodata = False
511         if tag == "span":
512             return
513
514         try:
515             tagindex = self.opentags.index(tag)
516         except:
517             return
518         tag = tag.lower()
519
520         if tag in [u'br', u'img']:
521             return
522
523         if tag == u'dl':
524             self.indentlevel = self.indentlevel - 4
525
526         if tag in self.liststarttags:
527             if tag in [u'ol', u'dl', u'ul', u'dd']:
528                 self.handle_curdata()
529                 # find if there was a previous list level
530                 smalllist = self.opentags[:-1]
531                 smalllist.reverse()
532                 for prev_listtag in smalllist:
533                     if prev_listtag in [u'ol', u'dl']:
534                         self.indentlevel = self.indentlevel - 4
535                         break
536                     elif prev_listtag == u'ul':
537                         self.indentlevel = self.indentlevel - 3
538                         break
539
540         if tag == u'ol':
541             self.listcount = self.listcount[:-1]
542
543         while tagindex < len(self.opentags) \
544             and tag in self.opentags[tagindex+1:]:
545             try:
546                 tagindex = self.opentags.index(tag, tagindex+1)
547             except:
548                 # well, we don't want to do that then
549                 pass
550         if tagindex != len(self.opentags) - 1:
551             # Assuming the data was for the last opened tag first
552             self.handle_curdata()
553             # Now kill the list to be a slice before this tag was opened
554             self.opentags = self.opentags[:tagindex + 1]
555         else:
556             self.handle_curdata()
557             if self.opentags[-1] == tag:
558                 self.opentags.pop()
559
560     def handle_data(self, data):
561         if len(self.opentags) == 0:
562             self.opentags.append(u'p')
563         self.curdata = self.curdata + data.decode("utf-8")
564
565     def handle_charref(self, name):
566         try:
567             entity = unichr(int(name))
568         except:
569             if name[0] == 'x':
570                 try:
571                     entity = unichr(int('0%s' %(name,), 16))
572                 except:
573                     entity = u'#%s' %(name,)
574             else:
575                 entity = u'#%s' %(name,)
576         self.curdata = self.curdata + unicode(entity.encode('utf-8'), \
577             "utf-8")
578
579     def handle_entityref(self, name):
580         entity = name
581         if HTML2Text.entities.has_key(name):
582             entity = HTML2Text.entities[name]
583         else:
584             entity = "&" + name + ";"
585
586         self.curdata = self.curdata + unicode(entity.encode('utf-8'), \
587             "utf-8")
588
589     def gettext(self):
590         self.handle_curdata()
591         if len(self.text) == 0 or self.text[-1] != u'\n':
592             self.text = self.text + u'\n'
593         self.opentags = []
594         if len(self.text) > 0:
595             while len(self.text) > 1 and self.text[-1] == u'\n':
596                 self.text = self.text[:-1]
597             self.text = self.text + u'\n'
598         if len(self.urls) > 0:
599             self.text = self.text + u'\n__ ' + u'\n__ '.join(self.urls) + u'\n'
600             self.urls = []
601         if len(self.images.keys()) > 0:
602             self.text = self.text + u'\n.. ' \
603                 + u'\n.. '.join( \
604                     ["|%s| image:: %s" %(a, self.images[a]["url"]) \
605                 for a in self.images.keys()]) + u'\n'
606             self.images = {}
607         return self.text
608
def open_url(method, url):
    """Request ``url`` with ``method`` and return the ``200 OK`` response.

    At most three requests are made in total; redirects (301, 302, 303, 307)
    and failed attempts both count towards that limit.  A redirect target
    given relative to the current URL is resolved against it.

    Returns the ``httplib`` response object on success.  Returns ``None``
    when the URL is not an http/https URL with a host, or when no attempt
    produced a 200 response.
    """
    import urlparse  # only needed here, to resolve redirect targets

    for _attempt in range(3):
        (scheme, rest) = urllib.splittype(url)
        if scheme not in ("http", "https"):
            return None
        (host, path) = urllib.splithost(rest)
        if not host:
            return None
        (host, port) = urllib.splitport(host)
        if port is None:
            if scheme == "https":
                port = 443
            else:
                port = 80

        conn = None
        try:
            if scheme == "http":
                conn = httplib.HTTPConnection("%s:%s" %(host, port))
            else:
                conn = httplib.HTTPSConnection("%s:%s" %(host, port))
            # "http://host" has an empty path, which is not a valid
            # request target.
            conn.request(method, path or "/")
            response = conn.getresponse()
        except (httplib.HTTPException, socket.error):
            # Best effort: a failed attempt is simply retried.
            if conn is not None:
                conn.close()
            continue

        if response.status == 200:
            # The connection has to stay open for the caller to read from.
            return response

        if response.status in [301, 302, 303, 307]:
            location = response.getheader("location")
            if location:
                url = urlparse.urljoin(url, location)
        conn.close()

    return None
639
# Response headers remembered per feed to detect an unchanged feed.
_CACHE_HEADERS = ["content-md5", "etag", "last-modified", "content-length"]

# Date header layout; the times written with it are UTC.
_DATE_FORMAT = "%a, %e %b %Y %T -0000"


def _feed_has_changed(headers, stored):
    """Tell whether HEAD ``headers`` differ from the ``stored`` header values.

    ``headers`` is a list of ``(name, value)`` pairs or ``None`` when the
    HEAD request failed; ``stored`` is the ``cgi.parse_qs`` form of the
    values saved on the previous run.  A failed request or a header without
    a stored counterpart counts as a change.
    """
    if headers is None:
        return True
    for (name, value) in headers:
        if name not in _CACHE_HEADERS:
            continue
        try:
            if value != stored[name][0]:
                return True
        except (KeyError, IndexError):
            return True
    return False


def _random_token(length):
    """Return ``length`` random ASCII letters and digits."""
    alphabet = string.ascii_letters + string.digits
    return "".join([random.choice(alphabet) for _ in range(length)])


def _deliver_item(maildir, url, item, db):
    """Write one feed item to the Maildir unless it was delivered before.

    ``db`` is the open "seen" database.  An item is skipped when the stored
    record for its guid or for its link carries the MD5 of the current
    content.  A changed item is delivered again with a ``References`` header
    naming the earlier message ids.
    """
    if item.has_key("content"):
        content = item["content"][0]["value"]
    elif item.has_key("description"):
        content = item["description"]
    else:
        content = u''

    md5sum = md5.md5(content.encode("utf-8")).hexdigest()

    prevmessageid = None

    db_guid_key = None
    db_link_key = (url + u'|' + item["link"]).encode("utf-8")

    # An unchanged item is recognised by its guid first ...
    if item.has_key("guid"):
        db_guid_key = (url + u'|' + item["guid"]).encode("utf-8")
        if db.has_key(db_guid_key):
            seen = cgi.parse_qs(db[db_guid_key])
            if seen["contentmd5"][0] == md5sum:
                return

    # ... and then by its link, which also yields the ids to reference.
    if db.has_key(db_link_key):
        seen = cgi.parse_qs(db[db_link_key])
        if "message-id" in seen:
            prevmessageid = seen["message-id"][0]
        if seen["contentmd5"][0] == md5sum:
            return

    try:
        author = item["author"]
    except KeyError:
        author = url

    # create a basic email message
    msg = MIMEMultipart("alternative")
    messageid = "<" \
        + datetime.datetime.now().strftime("%Y%m%d%H%M") \
        + "." + _random_token(6) \
        + "@" + socket.gethostname() + ">"
    msg.add_header("Message-ID", messageid)
    msg.set_unixfrom("\"%s\" <rss2maildir@localhost>" %(url))
    msg.add_header("From", "\"%s\" <rss2maildir@localhost>" %(author))
    msg.add_header("To", "\"%s\" <rss2maildir@localhost>" %(url))
    if prevmessageid:
        msg.add_header("References", prevmessageid)

    # The dates are labelled -0000, so the current time must be taken in
    # UTC as well.
    rundate = datetime.datetime.utcnow().strftime(_DATE_FORMAT)
    createddate = rundate
    try:
        createddate = datetime.datetime(*item["updated_parsed"][0:6]) \
            .strftime(_DATE_FORMAT)
    except (KeyError, TypeError, ValueError):
        # No usable update time in the item: keep the current time.
        pass
    msg.add_header("Date", createddate)
    msg.add_header("X-rss2maildir-rundate", rundate)

    subj_gen = HTML2Text()
    title = item["title"]
    title = re.sub(u'<', u'&lt;', title)
    title = re.sub(u'>', u'&gt;', title)
    subj_gen.feed(title.encode("utf-8"))
    # The converter wraps long titles and ends its output with a newline;
    # a header value has to be a single line.
    msg.add_header("Subject", u" ".join(subj_gen.gettext().split()))
    msg.set_default_type("text/plain")

    htmlcontent = "%s\n\n<p>Item URL: <a href='%s'>%s</a></p>" %( \
        content, \
        item["link"], \
        item["link"] )
    htmlpart = MIMEText(htmlcontent.encode("utf-8"), "html", "utf-8")
    textparser = HTML2Text()
    textparser.feed(content.encode("utf-8"))
    textcontent = textparser.gettext()
    textcontent = "%s\n\nItem URL: %s" %( \
        textcontent, \
        item["link"] )
    textpart = MIMEText(textcontent.encode("utf-8"), "plain", "utf-8")
    msg.attach(textpart)
    msg.attach(htmlpart)

    # Work out the file name following the usual Maildir naming scheme,
    # write the message below tmp/ and then move it into new/.
    fname = str(os.getpid()) \
        + "." + socket.gethostname() \
        + "." + _random_token(10) \
        + "." + datetime.datetime.now().strftime('%s')
    fn = os.path.join(maildir, "tmp", fname)
    fh = open(fn, "w")
    try:
        fh.write(msg.as_string())
    finally:
        fh.close()
    newfn = os.path.join(maildir, "new", fname)
    os.link(fn, newfn)
    os.unlink(fn)

    # now add to the database about the item
    if prevmessageid:
        messageid = prevmessageid + " " + messageid
    record = urllib.urlencode(( \
        ("message-id", messageid), \
        ("created", createddate), \
        ("contentmd5", md5sum) \
        ))
    if item.has_key("guid") and item["guid"] != item["link"]:
        db[db_guid_key] = record
        # The link record only learns the new message ids; its creation
        # date and checksum are kept when it already exists.
        link_record = record
        if db.has_key(db_link_key):
            old = cgi.parse_qs(db[db_link_key])
            try:
                link_record = urllib.urlencode(( \
                    ("message-id", messageid), \
                    ("created", old["created"][0]), \
                    ("contentmd5", old["contentmd5"][0]) \
                    ))
            except (KeyError, IndexError):
                # Incomplete old record: replace it with the new one.
                pass
        db[db_link_key] = link_record
    else:
        db[db_link_key] = record


def parse_and_deliver(maildir, url, statedir):
    """Fetch the feed at ``url`` and deliver its new items into ``maildir``.

    State lives in two dbm files below ``statedir``: "feeds" keeps selected
    response headers per feed URL and "seen" keeps one record per delivered
    item.  For a feed that is already known, a HEAD request is made first
    and the function returns early when none of the remembered headers
    changed.  A failed download is reported on stderr.  Both databases are
    closed again on every path out of the function.
    """
    feeddb = dbm.open(os.path.join(statedir, "feeds"), "c")
    try:
        # first check if we know about this feed already
        if feeddb.has_key(url):
            stored = cgi.parse_qs(feeddb[url])
            response = open_url("HEAD", url)
            head_headers = None
            if response:
                head_headers = response.getheaders()
            if not _feed_has_changed(head_headers, stored):
                return # don't need to do anything, nothings changed.

        response = open_url("GET", url)
        if response is None:
            sys.stderr.write("Failed to fetch feed: %s\n" %(url))
            return
        headers = response.getheaders()

        fp = feedparser.parse(response)
        db = dbm.open(os.path.join(statedir, "seen"), "c")
        try:
            for item in fp["items"]:
                _deliver_item(maildir, url, item, db)
        finally:
            db.close()

        # Remember the headers the next run compares against.
        if headers:
            data = []
            for header in headers:
                if header[0] in _CACHE_HEADERS:
                    data.append((header[0], header[1]))
            if len(data) > 0:
                feeddb[url] = urllib.urlencode(data)
    finally:
        feeddb.close()
842
def _find_config_file(requested):
    """Return the path of the configuration file to use.

    If `requested` is not None it must exist; otherwise the default
    locations ($HOME/.rss2maildir.conf, then /etc/rss2maildir.conf) are
    tried in order.  When no usable file is found a message is written to
    stderr and the program exits with status 2.
    """
    if requested is not None:
        if os.path.exists(requested):
            return requested
        sys.stderr.write( \
            "Config file %s does not exist. Exiting.\n" %(requested,))
        sys.exit(2)

    candidates = []
    # HOME may be unset (e.g. when run from a stripped-down environment);
    # in that case only the system wide location is considered
    home = os.environ.get("HOME")
    if home is not None:
        candidates.append("%s/.rss2maildir.conf" %(home,))
    candidates.append("/etc/rss2maildir.conf")

    for candidate in candidates:
        if os.path.exists(candidate):
            return candidate

    sys.stderr.write("No config file found. Exiting.\n")
    sys.exit(2)


def _ensure_directory(path, not_a_directory_msg, create_failed_msg):
    """Make sure `path` is an existing directory, creating it if missing.

    Both message arguments are format strings taking the path as their
    single %s argument.  If `path` exists but is not a directory, or if it
    is missing and cannot be created, the matching message is written to
    stderr and the program exits with status 1.
    """
    try:
        mode = os.stat(path)[stat.ST_MODE]
    except OSError:
        # nothing there (or not reachable) - try to make the directory
        try:
            os.mkdir(path)
        except OSError:
            sys.stderr.write(create_failed_msg %(path,))
            sys.exit(1)
        return

    # checked outside of the try block so that the SystemExit raised by
    # sys.exit() can never be mistaken for a failed stat
    if not stat.S_ISDIR(mode):
        sys.stderr.write(not_a_directory_msg %(path,))
        sys.exit(1)


def _ensure_maildir(maildir, section):
    """Make sure `maildir` exists and has new, cur and tmp subdirectories.

    Missing directories are created; if that fails a message is written to
    stderr and the program exits with status 1.  If `maildir` or one of
    its subdirectories exists but is not a directory, a "Broken maildir"
    warning is written to stderr and the function returns normally, so the
    caller still attempts delivery.
    """
    try:
        mode = os.stat(maildir)[stat.ST_MODE]
    except OSError:
        try:
            os.mkdir(maildir)
        except OSError:
            sys.stderr.write("Couldn't create root maildir %s\n" \
                %(maildir,))
            sys.exit(1)
    else:
        if not stat.S_ISDIR(mode):
            sys.stderr.write("Broken maildir: %s\n" %(maildir,))
            return

    for subdir in ("new", "cur", "tmp"):
        path = os.path.join(maildir, subdir)
        try:
            mode = os.stat(path)[stat.ST_MODE]
        except OSError:
            try:
                os.mkdir(path)
            except OSError:
                sys.stderr.write( \
                    "Couldn't create required maildir directories for %s\n" \
                    %(section,))
                sys.exit(1)
        else:
            # the entry exists - it has to be a directory to be usable
            if not stat.S_ISDIR(mode):
                sys.stderr.write("Broken maildir: %s\n" %(maildir,))


if __name__ == "__main__":
    # This only gets executed if we really called the program
    # first off, parse the command line arguments

    oparser = OptionParser()
    oparser.add_option(
        "-c", "--conf", dest="conf",
        help="location of config file"
        )
    oparser.add_option(
        "-s", "--statedir", dest="statedir",
        help="location of directory to store state in"
        )

    (options, args) = oparser.parse_args()

    # check for the configfile (exits if there isn't one)

    configfile = _find_config_file(options.conf)

    # Right - if we've got this far, we've got a config file, now for the hard
    # bits...

    scp = SafeConfigParser()
    scp.read(configfile)

    maildir_root = "RSSMaildir"
    state_dir = "state"

    # the state directory is taken from the command line first, then from
    # the [general] section of the config file, then from the default above
    if options.statedir is not None:
        state_dir = options.statedir
        _ensure_directory(state_dir, \
            "State directory (%s) is not a directory\n", \
            "Couldn't create statedir %s\n")
    elif scp.has_option("general", "state_dir"):
        state_dir = scp.get("general", "state_dir")
        _ensure_directory(state_dir, \
            "State directory (%s) is not a directory\n", \
            "Couldn't create state directory %s\n")
    else:
        _ensure_directory(state_dir, \
            "State directory %s is not a directory\n", \
            "State directory %s could not be created\n")

    if scp.has_option("general", "maildir_root"):
        maildir_root = scp.get("general", "maildir_root")

    _ensure_directory(maildir_root, \
        "Maildir Root %s is not a directory\n", \
        "Couldn't create Maildir Root %s\n")

    # every section apart from [general] describes a feed
    feeds = scp.sections()
    if "general" in feeds:
        feeds.remove("general")

    for section in feeds:
        # the maildir name defaults to the section name
        if scp.has_option(section, "maildir"):
            maildir = scp.get(section, "maildir")
        else:
            maildir = section

        # url-quote the name so that it is a single path component
        maildir = urllib.urlencode(((section, maildir),)).split("=")[1]
        maildir = os.path.join(maildir_root, maildir)

        # check that the maildir and its new, cur and tmp directories exist
        _ensure_maildir(maildir, section)

        # right - we've got the directories, we've got the section, we know the
        # url... lets play!

        parse_and_deliver(maildir, section, state_dir)