Patch from MJ Ray for items without link
[rss2maildir.git] / rss2maildir.py
1 #!/usr/bin/python
2 # coding=utf-8
3
4 # rss2maildir.py - RSS feeds to Maildir 1 email per item
5 # Copyright (C) 2007  Brett Parker <iDunno@sommitrealweird.co.uk>
6
7 # This program is free software: you can redistribute it and/or modify
8 # it under the terms of the GNU General Public License as published by
9 # the Free Software Foundation, either version 3 of the License, or
10 # (at your option) any later version.
11
12 # This program is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 # GNU General Public License for more details.
16
17 # You should have received a copy of the GNU General Public License
18 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
19
20 import sys
21 import os
22 import stat
23 import httplib
24 import urllib
25
26 import feedparser
27
28 from email.MIMEMultipart import MIMEMultipart
29 from email.MIMEText import MIMEText
30
31 import datetime
32 import random
33 import string
34 import textwrap
35
36 import socket
37
38 from optparse import OptionParser
39 from ConfigParser import SafeConfigParser
40
41 from base64 import b64encode
42
43 import chardet
44
45 if sys.version_info[0] == 2 and sys.version_info[1] >= 6:
46     import hashlib as md5
47 else:
48     import md5
49
50 import cgi
51 import dbm
52
53 import re
54
55 from HTMLParser import HTMLParser
56
class HTML2Text(HTMLParser):
    """Render an HTML fragment as reStructuredText-flavoured plain text.

    Feed markup with feed() and collect the result with gettext().  Text
    inside block-level elements is re-wrapped to ``textwidth`` columns.
    Anchors are written as `text`__ and their targets are listed as
    ``__ url`` lines at the end of the output; images are written as
    |alt| substitutions with matching ``.. |alt| image:: url`` lines.

    This class is Python 2 code: it relies on ``unicode`` and ``unichr``.
    """

    # Named character references understood by handle_entityref().
    entities = {
        u'amp': u'&',
        u'lt': u'<',
        u'gt': u'>',
        u'pound': u'£',
        u'copy': u'©',
        u'apos': u'\'',
        u'quot': u'"',
        u'nbsp': u' ',
        u'ldquo': u'“',
        u'rdquo': u'”',
        u'lsquo': u'‘',
        u'rsquo': u'’',
        u'laquo': u'«',
        u'raquo': u'»',
        u'lsaquo': u'‹',
        u'rsaquo': u'›',
        u'bull': u'•',
        u'middot': u'·',
        u'deg': u'°',
        u'hellip': u'…',
        u'trade': u'™',
        u'reg': u'®',
        u'agrave': u'à',
        u'Agrave': u'À',
        u'egrave': u'è',
        u'Egrave': u'È',
        u'igrave': u'ì',
        u'Igrave': u'Ì',
        u'ograve': u'ò',
        u'Ograve': u'Ò',
        u'ugrave': u'ù',
        u'Ugrave': u'Ù',
        u'aacute': u'á',
        u'Aacute': u'Á',
        u'eacute': u'é',
        u'Eacute': u'É',
        u'iacute': u'í',
        u'Iacute': u'Í',
        u'oacute': u'ó',
        u'Oacute': u'Ó',
        u'uacute': u'ú',
        u'Uacute': u'Ú',
        u'yacute': u'ý',
        u'Yacute': u'Ý',
        u'acirc': u'â',
        u'Acirc': u'Â',
        u'ecirc': u'ê',
        u'Ecirc': u'Ê',
        u'icirc': u'î',
        u'Icirc': u'Î',
        u'ocirc': u'ô',
        u'Ocirc': u'Ô',
        u'ucirc': u'û',
        u'Ucirc': u'Û',
        u'atilde': u'ã',
        u'Atilde': u'Ã',
        u'ntilde': u'ñ',
        u'Ntilde': u'Ñ',
        u'otilde': u'õ',
        u'Otilde': u'Õ',
        u'auml': u'ä',
        u'Auml': u'Ä',
        u'euml': u'ë',
        u'Euml': u'Ë',
        u'iuml': u'ï',
        u'Iuml': u'Ï',
        u'ouml': u'ö',
        u'Ouml': u'Ö',
        u'uuml': u'ü',
        u'Uuml': u'Ü',
        u'yuml': u'ÿ',
        u'Yuml': u'Ÿ',
        u'iexcl': u'¡',
        u'iquest': u'¿',
        u'ccedil': u'ç',
        u'Ccedil': u'Ç',
        u'oelig': u'œ',
        u'OElig': u'Œ',
        u'szlig': u'ß',
        u'oslash': u'ø',
        u'Oslash': u'Ø',
        u'aring': u'å',
        u'Aring': u'Å',
        u'aelig': u'æ',
        u'AElig': u'Æ',
        u'thorn': u'þ',
        u'THORN': u'Þ',
        u'eth': u'ð',
        u'ETH': u'Ð',
        u'mdash': u'—',
        u'ndash': u'–',
        u'sect': u'§',
        u'para': u'¶',
        u'uarr': u'↑',
        u'darr': u'↓',
        u'larr': u'←',
        u'rarr': u'→',
        u'dagger': u'†',
        u'Dagger': u'‡',
        u'permil': u'‰',
        u'prod': u'∏',
        u'infin': u'∞',
        u'radic': u'√',
        u'there4': u'∴',
        u'int': u'∫',
        u'asymp': u'≈',
        u'ne': u'≠',
        u'equiv': u'≡',
        u'le': u'≤',
        u'ge': u'≥',
        u'loz': u'⋄',
        u'sum': u'∑',
        u'part': u'∂',
        u'prime': u'′',
        u'Prime': u'″',
        u'harr': u'↔',
        u'micro': u'µ',
        u'not': u'¬',
        u'plusmn': u'±',
        u'divide': u'÷',
        u'cent': u'¢',
        u'euro': u'€',
        }

    # Elements whose text is flushed and laid out as a block.
    blockleveltags = [
        u'h1',
        u'h2',
        u'h3',
        u'h4',
        u'h5',
        u'h6',
        u'pre',
        u'p',
        u'ul',
        u'ol',
        u'dl',
        u'li',
        u'dt',
        u'dd',
        u'div',
        u'blockquote',
        ]

    # Elements that open a list and therefore affect indentation.
    liststarttags = [
        u'ul',
        u'ol',
        u'dl',
        ]

    # Block elements that do not close the currently open element when
    # they start.
    cancontainflow = [
        u'div',
        u'li',
        u'dd',
        u'blockquote',
    ]

    def __init__(self, textwidth=70):
        """Create a converter wrapping its output at ``textwidth`` columns."""
        self.text = u''             # finished output
        self.curdata = u''          # text collected for the open element
        self.textwidth = textwidth
        self.opentags = []          # stack of currently open elements
        self.indentlevel = 0        # current left indent, in columns
        self.ignorenodata = False   # set after <br> wrote a line break
        self.listcount = []         # next item number per open <ol>
        self.urls = []              # link targets, in document order
        self.images = {}            # substitution name -> {"url": ...}
        HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        """Track an opening tag, flushing pending text for block elements."""
        tag_name = tag.lower()
        if tag_name in self.blockleveltags:
            # handle starting a new block - unless we're in a block element
            # that can contain other blocks, we'll assume that we want to close
            # the container
            if len(self.opentags) > 1 and self.opentags[-1] == u'li':
                self.handle_curdata()

            if tag_name == u'ol':
                self.handle_curdata()
                self.listcount.append(1)
                self.listlevel = len(self.listcount) - 1

            if tag_name == u'dl':
                self.indentlevel = self.indentlevel + 4

            if tag_name in self.liststarttags:
                # A list nested in another list is indented by the width
                # of the enclosing list's marker.
                smallist = self.opentags[-3:-1]
                smallist.reverse()
                for prev_listtag in smallist:
                    if prev_listtag in [u'dl', u'ol']:
                        self.indentlevel = self.indentlevel + 4
                        break
                    elif prev_listtag == u'ul':
                        self.indentlevel = self.indentlevel + 3
                        break

            if len(self.opentags) > 0:
                self.handle_curdata()
                if tag_name not in self.cancontainflow:
                    self.opentags.pop()
            self.opentags.append(tag_name)
            return

        # Inline elements.  (dd/dt are block level and never get here.)
        if tag_name == u'a':
            for attr in attrs:
                if attr[0].lower() == u'href':
                    self.urls.append(attr[1])
            self.curdata = self.curdata + u'`'
            self.opentags.append(tag_name)
        elif tag_name == u'img':
            self.handle_image(attrs)
        elif tag_name == u'br':
            self.handle_br()
        # any other tag (including span) is ignored

    def handle_startendtag(self, tag, attrs):
        """Handle self-closing <br/> and <img/>; other tags are ignored."""
        tag_name = tag.lower()
        if tag_name == u'br':
            self.handle_br()
        elif tag_name == u'img':
            self.handle_image(attrs)

    def handle_br(self):
        """Flush pending text, then emit a line break."""
        self.handle_curdata()
        # handle_curdata() acts on the top of the tag stack, so push a
        # temporary 'br' marker to select its line-break branch.
        self.opentags.append(u'br')
        self.handle_curdata()
        self.opentags.pop()

    def handle_image(self, attrs):
        """Append a |name| substitution for an image to the pending text.

        The name is the alt text when present, otherwise the image URL.
        When an alt text is already registered for a different URL,
        underscores are appended until the name is unused.  Images
        without a src are ignored.
        """
        alt = u''
        url = u''
        for attr in attrs:
            attr_name = attr[0].lower()
            if attr_name == 'alt':
                if isinstance(attr[1], str):
                    alt = u'%s' %(attr[1])
                else:
                    alt = attr[1]
            elif attr_name == 'src':
                if isinstance(attr[1], str):
                    url = u'%s' %(attr[1])
                else:
                    url = attr[1]
        if not url:
            return

        if alt:
            if alt in self.images and self.images[alt]["url"] != url:
                while alt in self.images:
                    alt = alt + "_"
            name = alt
        else:
            name = url
        if name not in self.images:
            self.images[name] = {"url": url}
        self.curdata = self.curdata + u'|%s|' %(name,)

    def handle_curdata(self):
        """Lay out the pending text for the innermost open element.

        The text in ``self.curdata`` is formatted according to the tag on
        top of ``self.opentags`` and appended to ``self.text``.  For block
        level tags the pending text is cleared afterwards; for anchors
        the closing `__ marker is appended to the pending text instead.
        """
        if len(self.opentags) == 0:
            return

        tag_thats_done = self.opentags[-1]

        if len(self.curdata) == 0:
            return

        if tag_thats_done == u'br':
            if len(self.text) == 0 or self.text[-1] != '\n':
                self.text = self.text + '\n'
                self.ignorenodata = True
            return

        if len(self.curdata.strip()) == 0:
            return

        if tag_thats_done in self.blockleveltags:
            # Separate this block from the previous output, unless a <br>
            # has just written the line break.
            newlinerequired = self.text != u''
            if self.ignorenodata:
                newlinerequired = False
            self.ignorenodata = False
            if newlinerequired:
                if tag_thats_done in [u'dt', u'dd', u'li'] \
                    and len(self.text) > 1 \
                    and self.text[-1] != u'\n':
                    self.text = self.text + u'\n'
                elif len(self.text) > 2 \
                    and self.text[-1] != u'\n' \
                    and self.text[-2] != u'\n':
                    self.text = self.text + u'\n\n'

        if tag_thats_done in ["h1", "h2", "h3", "h4", "h5", "h6"]:
            underline = u''
            underlinechar = u'='
            headingtext = " ".join(self.curdata.split())
            separator = u'\n' + u' '*self.indentlevel
            headingtext = separator.join(
                textwrap.wrap(
                    headingtext,
                    self.textwidth - self.indentlevel
                    )
                )

            # h1 is underlined with '=', h2 with '-', h3..h6 with '~'
            if tag_thats_done == u'h2':
                underlinechar = u'-'
            elif tag_thats_done != u'h1':
                underlinechar = u'~'

            if u'\n' in headingtext:
                # a wrapped heading gets a full-width underline
                underline = u' ' * self.indentlevel \
                    + underlinechar * (self.textwidth - self.indentlevel)
            else:
                underline = u' ' * self.indentlevel \
                    + underlinechar * len(headingtext)
            self.text = self.text \
                + headingtext + u'\n' \
                + underline
        elif tag_thats_done in [u'p', u'div']:
            # The text is split while UTF-8 encoded, i.e. as a byte string.
            paragraph = unicode(
                " ".join(self.curdata.strip().encode("utf-8").split()),
                "utf-8")
            separator = u'\n' + u' ' * self.indentlevel
            self.text = self.text \
                + u' ' * self.indentlevel \
                + separator.join(
                    textwrap.wrap(
                        paragraph, self.textwidth - self.indentlevel))
        elif tag_thats_done == "pre":
            self.text = self.text + unicode(
                self.curdata.encode("utf-8"), "utf-8")
        elif tag_thats_done == u'blockquote':
            quote = unicode(
                " ".join(self.curdata.encode("utf-8").strip().split()),
                "utf-8")
            separator = u'\n' + u' ' * self.indentlevel + u'    '
            if len(self.text) > 0 and self.text[-1] != u'\n':
                self.text = self.text + u'\n'
            self.text = self.text \
                + u'    ' \
                + separator.join(
                    textwrap.wrap(
                        quote,
                        self.textwidth - self.indentlevel - 2
                    )
                )
            self.curdata = u''
        elif tag_thats_done == "li":
            item = unicode(self.curdata.encode("utf-8").strip(), "utf-8")
            if len(self.text) > 0 and self.text[-1] != u'\n':
                self.text = self.text + u'\n'
            # work out if we're in an ol rather than a ul
            latesttags = self.opentags[-4:]
            latesttags.reverse()
            isul = None
            for thing in latesttags:
                if thing == 'ul':
                    isul = True
                    break
                elif thing == 'ol':
                    isul = False
                    break

            listindent = 3
            if not isul:
                listindent = 4

            listmarker = u' * '
            if isul == False:
                listmarker = u' %2d. ' %(self.listcount[-1])
                self.listcount[-1] = self.listcount[-1] + 1

            separator = u'\n' \
                + u' ' * self.indentlevel \
                + u' ' * listindent
            self.text = self.text \
                + u' ' * self.indentlevel \
                + listmarker \
                + separator.join(
                    textwrap.wrap(
                        item,
                        self.textwidth - self.indentlevel - listindent
                    )
                )
            self.curdata = u''
        elif tag_thats_done == u'dt':
            definition = unicode(" ".join(
                    self.curdata.encode("utf-8").strip().split()),
                "utf-8")
            if len(self.text) > 0 and self.text[-1] != u'\n':
                self.text = self.text + u'\n\n'
            elif len(self.text) > 1 and self.text[-2] != u'\n':
                self.text = self.text + u'\n'
            definition = u' ' * (self.indentlevel - 4) + definition + "::"
            indentstring = u'\n' + u' ' * (self.indentlevel - 3)
            self.text = self.text \
                + indentstring.join(
                    textwrap.wrap(definition,
                        self.textwidth - self.indentlevel - 4))
            self.curdata = u''
        elif tag_thats_done == u'dd':
            definition = unicode(" ".join(
                    self.curdata.encode("utf-8").strip().split()),
                "utf-8")
            if len(definition) > 0:
                if len(self.text) > 0 and self.text[-1] != u'\n':
                    self.text = self.text + u'\n'
                indentstring = u'\n' + u' ' * self.indentlevel
                self.text = self.text \
                    + indentstring \
                    + indentstring.join(
                        textwrap.wrap(
                            definition,
                            self.textwidth - self.indentlevel
                            )
                        )
                self.curdata = u''
        elif tag_thats_done == u'a':
            # close the `text`__ anonymous hyperlink reference
            self.curdata = self.curdata + u'`__'
        elif tag_thats_done in self.liststarttags:
            pass

        if tag_thats_done in self.blockleveltags:
            self.curdata = u''

        self.ignorenodata = False

    def handle_endtag(self, tag):
        """Close ``tag``, flushing pending text and unwinding the tag stack.

        End tags for elements that are not currently open are ignored.
        """
        self.ignorenodata = False
        # Normalise before any comparison or lookup: opentags only ever
        # holds lower-case names.
        tag = tag.lower()
        if tag == "span":
            return

        try:
            tagindex = self.opentags.index(tag)
        except ValueError:
            return

        if tag in [u'br', u'img']:
            return

        if tag == u'dl':
            self.indentlevel = self.indentlevel - 4

        if tag in self.liststarttags:
            self.handle_curdata()
            # find if there was a previous list level
            smalllist = self.opentags[:-1]
            smalllist.reverse()
            for prev_listtag in smalllist:
                if prev_listtag in [u'ol', u'dl']:
                    self.indentlevel = self.indentlevel - 4
                    break
                elif prev_listtag == u'ul':
                    self.indentlevel = self.indentlevel - 3
                    break

        if tag == u'ol':
            self.listcount = self.listcount[:-1]

        # Advance to the innermost (last) open occurrence of the tag.
        while tagindex < len(self.opentags) \
            and tag in self.opentags[tagindex+1:]:
            tagindex = self.opentags.index(tag, tagindex+1)

        if tagindex != len(self.opentags) - 1:
            # Assuming the data was for the last opened tag first
            self.handle_curdata()
            # Now kill the list to be a slice before this tag was opened
            self.opentags = self.opentags[:tagindex + 1]
        else:
            self.handle_curdata()
            if self.opentags[-1] == tag:
                self.opentags.pop()

    def handle_data(self, data):
        """Collect character data; bare text is treated as a paragraph."""
        if len(self.opentags) == 0:
            self.opentags.append(u'p')
        self.curdata = "%s%s" %(self.curdata, data)

    def handle_charref(self, name):
        """Append the character for a numeric reference (&#NN; / &#xHH;).

        References that cannot be converted are kept as the literal text
        ``#name``.
        """
        try:
            entity = unichr(int(name))
        except (ValueError, OverflowError):
            entity = u'#%s' %(name,)
            if name[:1] in ('x', 'X'):
                try:
                    entity = unichr(int(name[1:], 16))
                except (ValueError, OverflowError):
                    pass
        self.curdata = self.curdata + entity

    def handle_entityref(self, name):
        """Append the character for a named reference such as &amp;.

        Unknown names are kept as the literal text ``&name;``.
        """
        if name in HTML2Text.entities:
            entity = HTML2Text.entities[name]
        else:
            entity = u'&' + name + u';'
        self.curdata = self.curdata + entity

    def gettext(self):
        """Flush pending text and return the finished document.

        The link targets and image definitions collected so far are
        appended to the text and then forgotten.
        """
        self.handle_curdata()
        if len(self.text) == 0 or self.text[-1] != u'\n':
            self.text = self.text + u'\n'
        self.opentags = []
        if len(self.text) > 0:
            # collapse any run of trailing newlines down to one
            while len(self.text) > 1 and self.text[-1] == u'\n':
                self.text = self.text[:-1]
            self.text = self.text + u'\n'
        if len(self.urls) > 0:
            self.text = self.text + u'\n__ ' + u'\n__ '.join(self.urls) + u'\n'
            self.urls = []
        if len(self.images.keys()) > 0:
            self.text = self.text + u'\n.. ' \
                + u'\n.. '.join(
                    ["|%s| image:: %s" %(a, self.images[a]["url"])
                for a in self.images.keys()]) + u'\n'
            self.images = {}
        return self.text
616
def open_url(method, url):
    """Request ``url`` over HTTP(S) and return the response on status 200.

    ``method`` is the HTTP method to send (the callers in this file use
    "HEAD" and "GET").  Redirects (301, 302, 303 and 307) are followed,
    including ones whose Location is relative.  At most three requests
    are made in total; redirects, failed connections and other statuses
    all count towards that limit.

    Returns the httplib response object, or None when no request ended
    in a 200, or when the URL is not a usable http/https URL.
    """
    # Local import: only this function needs it (Python 2 module name).
    import urlparse

    for _attempt in range(3):
        (scheme, rest) = urllib.splittype(url)
        if scheme not in ("http", "https"):
            return None
        (host, path) = urllib.splithost(rest)
        if not host:
            return None
        (host, port) = urllib.splitport(host)
        if not port:
            if scheme == "https":
                port = 443
            else:
                port = 80

        if scheme == "https":
            connection_class = httplib.HTTPSConnection
        else:
            connection_class = httplib.HTTPConnection

        conn = None
        try:
            conn = connection_class("%s:%s" %(host, port))
            # "http://host" has an empty path; the request line needs "/"
            conn.request(method, path or "/")
            response = conn.getresponse()
            if response.status == 200:
                # the caller reads from the response, so leave it open
                return response
            if response.status in [301, 302, 303, 307]:
                location = response.getheader("location")
                if location:
                    url = urlparse.urljoin(url, location)
            conn.close()
        except (httplib.HTTPException, socket.error):
            # best effort: count the failure as an attempt and go round again
            if conn is not None:
                conn.close()
    return None
647
648 def parse_and_deliver(maildir, url, statedir):
649     feedhandle = None
650     headers = None
651     # first check if we know about this feed already
652     feeddb = dbm.open(os.path.join(statedir, "feeds"), "c")
653     if feeddb.has_key(url):
654         data = feeddb[url]
655         data = cgi.parse_qs(data)
656         response = open_url("HEAD", url)
657         headers = None
658         if response:
659             headers = response.getheaders()
660         ischanged = False
661         try:
662             for header in headers:
663                 if header[0] == "content-length":
664                     if header[1] != data["content-length"][0]:
665                         ischanged = True
666                 elif header[0] == "etag":
667                     if header[1] != data["etag"][0]:
668                         ischanged = True
669                 elif header[0] == "last-modified":
670                     if header[1] != data["last-modified"][0]:
671                         ischanged = True
672                 elif header[0] == "content-md5":
673                     if header[1] != data["content-md5"][0]:
674                         ischanged = True
675         except:
676             ischanged = True
677         if ischanged:
678             response = open_url("GET", url)
679             if response != None:
680                 headers = response.getheaders()
681                 feedhandle = response
682             else:
683                 sys.stderr.write("Failed to fetch feed: %s\n" %(url))
684                 return
685         else:
686             return # don't need to do anything, nothings changed.
687     else:
688         response = open_url("GET", url)
689         if response != None:
690             headers = response.getheaders()
691             feedhandle = response
692         else:
693             sys.stderr.write("Failed to fetch feed: %s\n" %(url))
694             return
695
696     fp = feedparser.parse(feedhandle)
697     db = dbm.open(os.path.join(statedir, "seen"), "c")
698     for item in fp["items"]:
699         # have we seen it before?
700         # need to work out what the content is first...
701
702         if item.has_key("content"):
703             content = item["content"][0]["value"]
704         else:
705             if item.has_key("description"):
706                 content = item["description"]
707             else:
708                 content = u''
709
710         md5sum = md5.md5(content.encode("utf-8")).hexdigest()
711
712         # make sure content is unicode encoded
713         if not isinstance(content, unicode):
714             cd_res = chardet.detect(content)
715             chrset = cd_res['encoding']
716             print "detected charset %s for item %s" %(chrset, item["link"])
717             content = content.decode(chrset)
718
719         prevmessageid = None
720
721         db_guid_key = None
722         if not item.has_key("link"):
723             item["link"] = u'#' + md5sum
724         db_link_key = (url + u'|' + item["link"]).encode("utf-8")
725
726         # check if there's a guid too - if that exists and we match the md5,
727         # return
728         if item.has_key("guid"):
729             db_guid_key = (url + u'|' + item["guid"]).encode("utf-8")
730             if db.has_key(db_guid_key):
731                 data = db[db_guid_key]
732                 data = cgi.parse_qs(data)
733                 if data["contentmd5"][0] == md5sum:
734                     continue
735
736         if db.has_key(db_link_key):
737             data = db[db_link_key]
738             data = cgi.parse_qs(data)
739             if data.has_key("message-id"):
740                 prevmessageid = data["message-id"][0]
741             if data["contentmd5"][0] == md5sum:
742                 continue
743
744         try:
745             author = item["author"]
746         except:
747             author = url
748
749         # create a basic email message
750         msg = MIMEMultipart("alternative")
751         messageid = "<" \
752             + datetime.datetime.now().strftime("%Y%m%d%H%M") \
753             + "." \
754             + "".join( \
755                 [random.choice( \
756                     string.ascii_letters + string.digits \
757                     ) for a in range(0,6) \
758                 ]) + "@" + socket.gethostname() + ">"
759         msg.add_header("Message-ID", messageid)
760         msg.set_unixfrom("\"%s\" <rss2maildir@localhost>" %(url))
761         msg.add_header("From", "\"%s\" <rss2maildir@localhost>" %(author.encode("utf-8")))
762         msg.add_header("To", "\"%s\" <rss2maildir@localhost>" %(url.encode("utf-8")))
763         if prevmessageid:
764             msg.add_header("References", prevmessageid)
765         createddate = datetime.datetime.now() \
766             .strftime("%a, %e %b %Y %T -0000")
767         try:
768             createddate = datetime.datetime(*item["updated_parsed"][0:6]) \
769                 .strftime("%a, %e %b %Y %T -0000")
770         except:
771             pass
772         msg.add_header("Date", createddate)
773         msg.add_header("X-rss2maildir-rundate", datetime.datetime.now() \
774             .strftime("%a, %e %b %Y %T -0000"))
775         subj_gen = HTML2Text()
776         title = item["title"]
777         title = re.sub(u'<', u'&lt;', title)
778         title = re.sub(u'>', u'&gt;', title)
779         subj_gen.feed(title)
780         msg.add_header("Subject", subj_gen.gettext())
781         msg.set_default_type("text/plain")
782
783         htmlcontent = content.encode("utf-8")
784         htmlcontent = "%s\n\n<p>Item URL: <a href='%s'>%s</a></p>" %( \
785             content, \
786             item["link"], \
787             item["link"] )
788         htmlpart = MIMEText(htmlcontent.encode("utf-8"), "html", "utf-8")
789         textparser = HTML2Text()
790         textparser.feed(content)
791         textcontent = textparser.gettext()
792         textcontent = "%s\n\nItem URL: %s" %( \
793             textcontent, \
794             item["link"] )
795         textpart = MIMEText(textcontent.encode("utf-8"), "plain", "utf-8")
796         msg.attach(textpart)
797         msg.attach(htmlpart)
798
799         # start by working out the filename we should be writting to, we do
800         # this following the normal maildir style rules
801         fname = str(os.getpid()) \
802             + "." + socket.gethostname() \
803             + "." + "".join( \
804                 [random.choice( \
805                     string.ascii_letters + string.digits \
806                     ) for a in range(0,10) \
807                 ]) + "." \
808             + datetime.datetime.now().strftime('%s')
809         fn = os.path.join(maildir, "tmp", fname)
810         fh = open(fn, "w")
811         fh.write(msg.as_string())
812         fh.close()
813         # now move it in to the new directory
814         newfn = os.path.join(maildir, "new", fname)
815         os.link(fn, newfn)
816         os.unlink(fn)
817
818         # now add to the database about the item
819         if prevmessageid:
820             messageid = prevmessageid + " " + messageid
821         if item.has_key("guid") and item["guid"] != item["link"]:
822             data = urllib.urlencode(( \
823                 ("message-id", messageid), \
824                 ("created", createddate), \
825                 ("contentmd5", md5sum) \
826                 ))
827             db[db_guid_key] = data
828             try:
829                 data = db[db_link_key]
830                 data = cgi.parse_qs(data)
831                 newdata = urllib.urlencode(( \
832                     ("message-id", messageid), \
833                     ("created", data["created"][0]), \
834                     ("contentmd5", data["contentmd5"][0]) \
835                     ))
836                 db[db_link_key] = newdata
837             except:
838                 db[db_link_key] = data
839         else:
840             data = urllib.urlencode(( \
841                 ("message-id", messageid), \
842                 ("created", createddate), \
843                 ("contentmd5", md5sum) \
844                 ))
845             db[db_link_key] = data
846
847     if headers:
848         data = []
849         for header in headers:
850             if header[0] in \
851                 ["content-md5", "etag", "last-modified", "content-length"]:
852                 data.append((header[0], header[1]))
853         if len(data) > 0:
854             data = urllib.urlencode(data)
855             feeddb[url] = data
856
857     db.close()
858     feeddb.close()
859
def ensure_directory(path):
    """Make sure ``path`` is a directory, creating it when it is missing.

    Returns a true value when ``path`` is a directory once the call has
    finished (it was already there, or it has just been created) and a
    false value when something that is not a directory exists at ``path``.

    Raises OSError when ``path`` cannot be examined and cannot be created
    either.
    """
    try:
        mode = os.stat(path)[stat.ST_MODE]
    except OSError:
        # nothing usable there - try to create it, and leave a failure of
        # the mkdir for the caller to report
        os.mkdir(path)
        return True
    return stat.S_ISDIR(mode)


def require_directory(path, not_a_directory_message, creation_failed_message):
    """Make sure ``path`` is a directory or terminate the program.

    Both messages are format strings taking the path as their only
    argument. The matching one is written to stderr before exiting with
    status 1: ``not_a_directory_message`` when ``path`` exists but is not a
    directory, ``creation_failed_message`` when it is missing and could not
    be created.
    """
    try:
        if ensure_directory(path):
            return
        message = not_a_directory_message
    except OSError:
        message = creation_failed_message
    # the exit happens outside of the try block so that nothing can
    # intercept the SystemExit it raises
    sys.stderr.write(message %(path,))
    sys.exit(1)


if __name__ == "__main__":
    # This only gets executed if we really called the program
    # first off, parse the command line arguments

    oparser = OptionParser()
    oparser.add_option(
        "-c", "--conf", dest="conf",
        help="location of config file"
        )
    oparser.add_option(
        "-s", "--statedir", dest="statedir",
        help="location of directory to store state in"
        )

    (options, args) = oparser.parse_args()

    # check for the configfile

    configfile = None

    if options.conf is not None:
        # an explicitly requested config file has to exist
        if not os.path.exists(options.conf):
            sys.stderr.write( \
                "Config file %s does not exist. Exiting.\n" %(options.conf,))
            sys.exit(2)
        configfile = options.conf
    else:
        # check through the default locations: the per-user file first
        # (skipped when HOME is not set), then the system wide one
        candidates = []
        home = os.environ.get("HOME")
        if home is not None:
            candidates.append("%s/.rss2maildir.conf" %(home,))
        candidates.append("/etc/rss2maildir.conf")

        for candidate in candidates:
            if os.path.exists(candidate):
                configfile = candidate
                break
        else:
            sys.stderr.write("No config file found. Exiting.\n")
            sys.exit(2)

    # Right - if we've got this far, we've got a config file, now for the hard
    # bits...

    scp = SafeConfigParser()
    scp.read(configfile)

    maildir_root = "RSSMaildir"
    state_dir = "state"

    # the state directory comes from the command line, else from the
    # [general] section of the config file, else the default is used
    if options.statedir is not None:
        state_dir = options.statedir
        require_directory(
            state_dir,
            "State directory (%s) is not a directory\n",
            "Couldn't create statedir %s\n")
    elif scp.has_option("general", "state_dir"):
        state_dir = scp.get("general", "state_dir")
        require_directory(
            state_dir,
            "State directory (%s) is not a directory\n",
            "Couldn't create state directory %s\n")
    else:
        require_directory(
            state_dir,
            "State directory %s is not a directory\n",
            "State directory %s could not be created\n")

    if scp.has_option("general", "maildir_root"):
        maildir_root = scp.get("general", "maildir_root")

    require_directory(
        maildir_root,
        "Maildir Root %s is not a directory\n",
        "Couldn't create Maildir Root %s\n")

    # every section apart from [general] describes a feed
    feeds = [section for section in scp.sections() if section != "general"]

    for section in feeds:
        # the maildir defaults to the section name when the section has no
        # usable "maildir" option
        try:
            maildir = scp.get(section, "maildir")
        except Exception:
            maildir = section

        # url-quote the name (the part after the "=" of the encoded pair)
        # before using it as a directory name below the maildir root
        maildir = urllib.urlencode(((section, maildir),)).split("=")[1]
        maildir = os.path.join(maildir_root, maildir)

        try:
            maildir_is_directory = ensure_directory(maildir)
        except OSError:
            sys.stderr.write("Couldn't create root maildir %s\n" \
                %(maildir,))
            sys.exit(1)

        if not maildir_is_directory:
            sys.stderr.write("Broken maildir: %s\n" %(maildir,))
        else:
            # check that there's a new, cur and tmp directory, creating
            # whichever is missing and complaining about any that exists
            # without being a directory
            try:
                for subdir in ("new", "cur", "tmp"):
                    if not ensure_directory(os.path.join(maildir, subdir)):
                        sys.stderr.write("Broken maildir: %s\n" %(maildir,))
            except OSError:
                sys.stderr.write( \
                    "Couldn't create required maildir directories for %s\n" \
                    %(section,))
                sys.exit(1)

        # right - we've got the directories, we've got the section, we know the
        # url... lets play!
        # (a broken maildir has only been warned about above - delivery is
        # still attempted)

        parse_and_deliver(maildir, section, state_dir)