]> git.sommitrealweird.co.uk Git - rss2maildir.git/blob - rss2maildir.py
Make sure that we feed the parser unicode data
[rss2maildir.git] / rss2maildir.py
1 #!/usr/bin/python
2 # coding=utf-8
3
4 # rss2maildir.py - RSS feeds to Maildir 1 email per item
5 # Copyright (C) 2007  Brett Parker <iDunno@sommitrealweird.co.uk>
6
7 # This program is free software: you can redistribute it and/or modify
8 # it under the terms of the GNU General Public License as published by
9 # the Free Software Foundation, either version 3 of the License, or
10 # (at your option) any later version.
11
12 # This program is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 # GNU General Public License for more details.
16
17 # You should have received a copy of the GNU General Public License
18 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
19
20 import sys
21 import os
22 import stat
23 import httplib
24 import urllib
25
26 import feedparser
27
28 from email.MIMEMultipart import MIMEMultipart
29 from email.MIMEText import MIMEText
30
31 import datetime
32 import random
33 import string
34 import textwrap
35
36 import socket
37
38 from optparse import OptionParser
39 from ConfigParser import SafeConfigParser
40
41 from base64 import b64encode
42
43 import chardet
44
45 if sys.version_info[0] == 2 and sys.version_info[1] >= 6:
46     import hashlib as md5
47 else:
48     import md5
49
50 import cgi
51 import dbm
52
53 import re
54
55 from HTMLParser import HTMLParser
56
class HTML2Text(HTMLParser):
    """HTMLParser subclass that accumulates a plain-text rendering of HTML.

    The class-level tables below drive the handlers: ``entities`` maps
    named character references to their unicode replacement, and the tag
    lists classify elements by how they are laid out.
    """

    # Named character references understood by handle_entityref().
    # Every value must be a unicode string: a byte string containing
    # non-ASCII characters cannot be concatenated onto the unicode buffer.
    entities = {
        u'amp': u'&',
        u'lt': u'<',
        u'gt': u'>',
        u'pound': u'£',
        u'copy': u'©',
        u'apos': u'\'',
        u'quot': u'"',
        u'nbsp': u' ',
        u'ldquo': u'“',
        u'rdquo': u'”',
        u'lsquo': u'‘',
        u'rsquo': u'’',
        u'laquo': u'«',
        u'raquo': u'»',
        u'lsaquo': u'‹',
        u'rsaquo': u'›',
        u'bull': u'•',
        u'middot': u'·',
        u'deg': u'°',
        u'hellip': u'…',
        u'trade': u'™',
        u'reg': u'®',
        u'agrave': u'à',
        u'Agrave': u'À',
        u'egrave': u'è',
        u'Egrave': u'È',
        u'igrave': u'ì',
        u'Igrave': u'Ì',
        u'ograve': u'ò',
        u'Ograve': u'Ò',
        u'ugrave': u'ù',
        u'Ugrave': u'Ù',
        u'aacute': u'á',
        u'Aacute': u'Á',
        u'eacute': u'é',
        u'Eacute': u'É',
        u'iacute': u'í',
        u'Iacute': u'Í',
        u'oacute': u'ó',
        u'Oacute': u'Ó',
        u'uacute': u'ú',
        u'Uacute': u'Ú',
        u'yacute': u'ý',
        u'Yacute': u'Ý',
        u'acirc': u'â',
        u'Acirc': u'Â',
        u'ecirc': u'ê',
        u'Ecirc': u'Ê',
        u'icirc': u'î',
        u'Icirc': u'Î',
        u'ocirc': u'ô',
        u'Ocirc': u'Ô',
        u'ucirc': u'û',
        u'Ucirc': u'Û',
        u'atilde': u'ã',
        u'Atilde': u'Ã',
        u'ntilde': u'ñ',
        u'Ntilde': u'Ñ',
        u'otilde': u'õ',
        u'Otilde': u'Õ',
        u'auml': u'ä',
        u'Auml': u'Ä',
        u'euml': u'ë',
        u'Euml': u'Ë',
        u'iuml': u'ï',
        u'Iuml': u'Ï',
        u'ouml': u'ö',
        u'Ouml': u'Ö',
        u'uuml': u'ü',
        u'Uuml': u'Ü',
        u'yuml': u'ÿ',
        u'Yuml': u'Ÿ',
        u'iexcl': u'¡',
        u'iquest': u'¿',
        u'ccedil': u'ç',
        u'Ccedil': u'Ç',
        u'oelig': u'œ',
        u'OElig': u'Œ',
        u'szlig': u'ß',
        u'oslash': u'ø',
        u'Oslash': u'Ø',
        u'aring': u'å',
        u'Aring': u'Å',
        u'aelig': u'æ',
        u'AElig': u'Æ',
        u'thorn': u'þ',
        u'THORN': u'Þ',
        u'eth': u'ð',
        u'ETH': u'Ð',
        u'mdash': u'—',
        u'ndash': u'–',
        u'sect': u'§',
        u'para': u'¶',
        u'uarr': u'↑',
        u'darr': u'↓',
        u'larr': u'←',
        u'rarr': u'→',
        u'dagger': u'†',
        u'Dagger': u'‡',
        u'permil': u'‰',
        u'prod': u'∏',
        u'infin': u'∞',
        u'radic': u'√',
        u'there4': u'∴',
        u'int': u'∫',
        u'asymp': u'≈',
        u'ne': u'≠',
        u'equiv': u'≡',
        u'le': u'≤',
        u'ge': u'≥',
        u'loz': u'◊',
        u'sum': u'∑',
        u'part': u'∂',
        u'prime': u'′',
        u'Prime': u'″',
        u'harr': u'↔',
        u'micro': u'µ',
        u'not': u'¬',
        u'plusmn': u'±',
        u'divide': u'÷',
        u'cent': u'¢',
        u'euro': u'€',
        }

    # Tags that start a new block of output text.
    blockleveltags = [
        u'h1',
        u'h2',
        u'h3',
        u'h4',
        u'h5',
        u'h6',
        u'pre',
        u'p',
        u'ul',
        u'ol',
        u'dl',
        u'li',
        u'dt',
        u'dd',
        u'div',
        u'blockquote',
        ]

    # Tags that open a list; used for indent bookkeeping.
    liststarttags = [
        u'ul',
        u'ol',
        u'dl',
        ]

    # Block tags that do not cause the previously open tag to be popped
    # when they start (see handle_starttag).
    cancontainflow = [
        u'div',
        u'li',
        u'dd',
        u'blockquote',
    ]
214
215     def __init__(self,textwidth=70):
216         self.text = u''
217         self.curdata = u''
218         self.textwidth = textwidth
219         self.opentags = []
220         self.indentlevel = 0
221         self.ignorenodata = False
222         self.listcount = []
223         self.urls = []
224         self.images = {}
225         HTMLParser.__init__(self)
226
227     def handle_starttag(self, tag, attrs):
228         tag_name = tag.lower()
229         if tag_name in self.blockleveltags:
230             # handle starting a new block - unless we're in a block element
231             # that can contain other blocks, we'll assume that we want to close
232             # the container
233             if len(self.opentags) > 1 and self.opentags[-1] == u'li':
234                 self.handle_curdata()
235
236             if tag_name == u'ol':
237                 self.handle_curdata()
238                 self.listcount.append(1)
239                 self.listlevel = len(self.listcount) - 1
240
241             if tag_name == u'dl':
242                 self.indentlevel = self.indentlevel + 4
243
244             if tag_name in self.liststarttags:
245                 smallist = self.opentags[-3:-1]
246                 smallist.reverse()
247                 for prev_listtag in smallist:
248                     if prev_listtag in [u'dl', u'ol']:
249                         self.indentlevel = self.indentlevel + 4
250                         break
251                     elif prev_listtag == u'ul':
252                         self.indentlevel = self.indentlevel + 3
253                         break
254
255             if len(self.opentags) > 0:
256                 self.handle_curdata()
257                 if tag_name not in self.cancontainflow:
258                     self.opentags.pop()
259             self.opentags.append(tag_name)
260         else:
261             if tag_name == "span":
262                 return
263             listcount = 0
264             try:
265                 listcount = self.listcount[-1]
266             except:
267                 pass
268
269             if tag_name == u'dd' and len(self.opentags) > 1 \
270                 and self.opentags[-1] == u'dt':
271                 self.handle_curdata()
272                 self.opentags.pop()
273             elif tag_name == u'dt' and len(self.opentags) > 1 \
274                 and self.opentags[-1] == u'dd':
275                 self.handle_curdata()
276                 self.opentags.pop()
277             elif tag_name == u'a':
278                 for attr in attrs:
279                     if attr[0].lower() == u'href':
280                         self.urls.append(attr[1])
281                 self.curdata = self.curdata + u'`'
282                 self.opentags.append(tag_name)
283                 return
284             elif tag_name == u'img':
285                 self.handle_image(attrs)
286                 return
287             elif tag_name == u'br':
288                 self.handle_br()
289                 return
290             else:
291                 # we don't know the tag, so lets avoid handling it!
292                 return 
293
294     def handle_startendtag(self, tag, attrs):
295         if tag.lower() == u'br':
296             self.handle_br()
297         elif tag.lower() == u'img':
298             self.handle_image(attrs)
299             return
300
301     def handle_br(self):
302             self.handle_curdata()
303             self.opentags.append(u'br')
304             self.handle_curdata()
305             self.opentags.pop()
306
307     def handle_image(self, attrs):
308         alt = u''
309         url = u''
310         for attr in attrs:
311             if attr[0] == 'alt':
312                 if isinstance(attr[1], str):
313                     alt = u'%s' %(attr[1])
314                 else:
315                     alt = attr[1]
316             elif attr[0] == 'src':
317                 if isinstance(attr[1], str):
318                     url = u'%s' %(attr[1])
319                 else:
320                     url = attr[1]
321         if url:
322             if alt:
323                 if self.images.has_key(alt):
324                     if self.images[alt]["url"] == url:
325                         self.curdata = self.curdata \
326                             + u'|%s|' %(alt,)
327                     else:
328                         while self.images.has_key(alt):
329                             alt = alt + "_"
330                         self.images[alt] = {"url": url}
331                         self.curdata = self.curdata \
332                             + u'|%s|' %(alt,)
333                 else:
334                     self.images[alt] = {"url": url}
335                     self.curdata = self.curdata \
336                         + u'|%s|' %(alt,)
337             else:
338                 if self.images.has_key(url):
339                     self.curdata = self.curdata \
340                         + u'|%s|' %(url,)
341                 else:
342                     self.images[url] = {}
343                     self.images[url]["url"] =url
344                     self.curdata = self.curdata \
345                         + u'|%s|' %(url,)
346
    def handle_curdata(self):
        """Flush ``self.curdata`` into ``self.text`` for the innermost open tag.

        The tag on top of ``self.opentags`` decides how the pending text
        is laid out (heading, paragraph, preformatted text, block quote,
        list item, definition term/description).  Nothing happens when no
        tag is open or when there is no pending text.  For an open ``a``
        the link is only closed inside ``curdata``; nothing is written to
        ``self.text``.
        """

        if len(self.opentags) == 0:
            return

        tag_thats_done = self.opentags[-1]

        if len(self.curdata) == 0:
            return

        # A line break adds at most one newline; the pending text is left
        # in curdata, and ignorenodata is set when a newline was added.
        if tag_thats_done == u'br':
            if len(self.text) == 0 or self.text[-1] != '\n':
                self.text = self.text + '\n'
                self.ignorenodata = True
            return

        # whitespace-only data is not worth emitting
        if len(self.curdata.strip()) == 0:
            return

        # Separate this block from the text before it: a single newline
        # before dt/dd/li, a blank line before other blocks.  No separator
        # is added right after a <br> produced a newline (ignorenodata).
        if tag_thats_done in self.blockleveltags:
            newlinerequired = self.text != u''
            if self.ignorenodata:
                newlinerequired = False
            self.ignorenodata = False
            if newlinerequired:
                if tag_thats_done in [u'dt', u'dd', u'li'] \
                    and len(self.text) > 1 \
                    and self.text[-1] != u'\n':
                        self.text = self.text + u'\n'
                elif len(self.text) > 2 \
                    and self.text[-1] != u'\n' \
                    and self.text[-2] != u'\n':
                    self.text = self.text + u'\n\n'

        if tag_thats_done in ["h1", "h2", "h3", "h4", "h5", "h6"]:
            # Headings are underlined: '=' for h1, '-' for h2, '~' for the
            # rest.  A heading that wrapped is underlined across the whole
            # available width, otherwise only as wide as its text.
            underline = u''
            underlinechar = u'='
            headingtext = " ".join(self.curdata.split())
            seperator = u'\n' + u' '*self.indentlevel
            headingtext = seperator.join( \
                textwrap.wrap( \
                    headingtext, \
                    self.textwidth - self.indentlevel \
                    ) \
                )

            if tag_thats_done == u'h2':
                underlinechar = u'-'
            elif tag_thats_done != u'h1':
                underlinechar = u'~'

            if u'\n' in headingtext:
                underline = u' ' * self.indentlevel \
                    + underlinechar * (self.textwidth - self.indentlevel)
            else:
                underline = u' ' * self.indentlevel \
                    + underlinechar * len(headingtext)
            self.text = self.text \
                + headingtext + u'\n' \
                + underline
        elif tag_thats_done in [u'p', u'div']:
            # collapse runs of whitespace, then wrap at the current indent
            paragraph = unicode( \
                " ".join(self.curdata.strip().encode("utf-8").split()), \
                "utf-8")
            seperator = u'\n' + u' ' * self.indentlevel
            self.text = self.text \
                + u' ' * self.indentlevel \
                + seperator.join( \
                    textwrap.wrap( \
                        paragraph, self.textwidth - self.indentlevel))
        elif tag_thats_done == "pre":
            # preformatted text is copied through unchanged
            self.text = self.text + unicode( \
                self.curdata.encode("utf-8"), "utf-8")
        elif tag_thats_done == u'blockquote':
            # quotes get four extra spaces of indent on every line
            quote = unicode( \
                " ".join(self.curdata.encode("utf-8").strip().split()), \
                "utf-8")
            seperator = u'\n' + u' ' * self.indentlevel + u'    '
            if len(self.text) > 0 and self.text[-1] != u'\n':
                self.text = self.text + u'\n'
            self.text = self.text \
                + u'    ' \
                + seperator.join( \
                    textwrap.wrap( \
                        quote, \
                        self.textwidth - self.indentlevel - 2 \
                    )
                )
            self.curdata = u''
        elif tag_thats_done == "li":
            item = unicode(self.curdata.encode("utf-8").strip(), "utf-8")
            if len(self.text) > 0 and self.text[-1] != u'\n':
                self.text = self.text + u'\n'
            # work out if we're in an ol rather than a ul
            # (only the four innermost open tags are inspected; isul stays
            # None when neither list type is found among them)
            latesttags = self.opentags[-4:]
            latesttags.reverse()
            isul = None
            for thing in latesttags:
                if thing == 'ul':
                    isul = True
                    break
                elif thing == 'ol':
                    isul = False
                    break

            listindent = 3
            if not isul:
                listindent = 4

            # bullets for ul (and for the undetermined case), a running
            # number taken from the innermost counter for ol
            listmarker = u' * '
            if isul == False:
                listmarker = u' %2d. ' %(self.listcount[-1])
                self.listcount[-1] = self.listcount[-1] + 1

            seperator = u'\n' \
                + u' ' * self.indentlevel \
                + u' ' * listindent
            self.text = self.text \
                + u' ' * self.indentlevel \
                + listmarker \
                + seperator.join( \
                    textwrap.wrap( \
                        item, \
                        self.textwidth - self.indentlevel - listindent \
                    ) \
                )
            self.curdata = u''
        elif tag_thats_done == u'dt':
            # definition terms are written four columns left of the
            # current indent and suffixed with "::"
            definition = unicode(" ".join( \
                    self.curdata.encode("utf-8").strip().split()), \
                "utf-8")
            if len(self.text) > 0 and self.text[-1] != u'\n':
                self.text = self.text + u'\n\n'
            elif len(self.text) > 1 and self.text[-2] != u'\n':
                self.text = self.text + u'\n'
            definition = u' ' * (self.indentlevel - 4) + definition + "::"
            indentstring = u'\n' + u' ' * (self.indentlevel - 3)
            self.text = self.text \
                + indentstring.join(
                    textwrap.wrap(definition, \
                        self.textwidth - self.indentlevel - 4))
            self.curdata = u''
        elif tag_thats_done == u'dd':
            # descriptions start on a fresh line at the current indent
            definition = unicode(" ".join( \
                    self.curdata.encode("utf-8").strip().split()),
                "utf-8")
            if len(definition) > 0:
                if len(self.text) > 0 and self.text[-1] != u'\n':
                    self.text = self.text + u'\n'
                indentstring = u'\n' + u' ' * self.indentlevel
                self.text = self.text \
                    + indentstring \
                    + indentstring.join( \
                        textwrap.wrap( \
                            definition, \
                            self.textwidth - self.indentlevel \
                            ) \
                        )
                self.curdata = u''
        elif tag_thats_done == u'a':
            # close the back-quoted link opened by handle_starttag; the
            # text stays in curdata for the enclosing block to emit
            self.curdata = self.curdata + u'`__'
            pass
        elif tag_thats_done in self.liststarttags:
            pass

        # text of block-level tags has been consumed by now
        if tag_thats_done in self.blockleveltags:
            self.curdata = u''

        self.ignorenodata = False
516
517     def handle_endtag(self, tag):
518         self.ignorenodata = False
519         if tag == "span":
520             return
521
522         try:
523             tagindex = self.opentags.index(tag)
524         except:
525             return
526         tag = tag.lower()
527
528         if tag in [u'br', u'img']:
529             return
530
531         if tag == u'dl':
532             self.indentlevel = self.indentlevel - 4
533
534         if tag in self.liststarttags:
535             if tag in [u'ol', u'dl', u'ul', u'dd']:
536                 self.handle_curdata()
537                 # find if there was a previous list level
538                 smalllist = self.opentags[:-1]
539                 smalllist.reverse()
540                 for prev_listtag in smalllist:
541                     if prev_listtag in [u'ol', u'dl']:
542                         self.indentlevel = self.indentlevel - 4
543                         break
544                     elif prev_listtag == u'ul':
545                         self.indentlevel = self.indentlevel - 3
546                         break
547
548         if tag == u'ol':
549             self.listcount = self.listcount[:-1]
550
551         while tagindex < len(self.opentags) \
552             and tag in self.opentags[tagindex+1:]:
553             try:
554                 tagindex = self.opentags.index(tag, tagindex+1)
555             except:
556                 # well, we don't want to do that then
557                 pass
558         if tagindex != len(self.opentags) - 1:
559             # Assuming the data was for the last opened tag first
560             self.handle_curdata()
561             # Now kill the list to be a slice before this tag was opened
562             self.opentags = self.opentags[:tagindex + 1]
563         else:
564             self.handle_curdata()
565             if self.opentags[-1] == tag:
566                 self.opentags.pop()
567
568     def handle_data(self, data):
569         if len(self.opentags) == 0:
570             self.opentags.append(u'p')
571         self.curdata = "%s%s" %(self.curdata, data)
572
573     def handle_charref(self, name):
574         try:
575             entity = unichr(int(name))
576         except:
577             if name[0] == 'x':
578                 try:
579                     entity = unichr(int('0%s' %(name,), 16))
580                 except:
581                     entity = u'#%s' %(name,)
582             else:
583                 entity = u'#%s' %(name,)
584         self.curdata = self.curdata + unicode(entity.encode('utf-8'), \
585             "utf-8")
586
587     def handle_entityref(self, name):
588         entity = name
589         if HTML2Text.entities.has_key(name):
590             entity = HTML2Text.entities[name]
591         else:
592             entity = "&" + name + ";"
593
594         self.curdata = self.curdata + unicode(entity.encode('utf-8'), \
595             "utf-8")
596
597     def gettext(self):
598         self.handle_curdata()
599         if len(self.text) == 0 or self.text[-1] != u'\n':
600             self.text = self.text + u'\n'
601         self.opentags = []
602         if len(self.text) > 0:
603             while len(self.text) > 1 and self.text[-1] == u'\n':
604                 self.text = self.text[:-1]
605             self.text = self.text + u'\n'
606         if len(self.urls) > 0:
607             self.text = self.text + u'\n__ ' + u'\n__ '.join(self.urls) + u'\n'
608             self.urls = []
609         if len(self.images.keys()) > 0:
610             self.text = self.text + u'\n.. ' \
611                 + u'\n.. '.join( \
612                     ["|%s| image:: %s" %(a, self.images[a]["url"]) \
613                 for a in self.images.keys()]) + u'\n'
614             self.images = {}
615         return self.text
616
def open_url(method, url):
    """Send an HTTP(S) request for *url* and return the response.

    *method* is the HTTP verb (the callers in this file use "HEAD" and
    "GET").  At most three requests are made in total; each 301/302/303/307
    answer replaces the URL with its ``Location`` header before the next
    attempt.  Returns the ``httplib`` response object for a 200 answer, or
    None when no attempt produced one.
    """
    # local import: only needed to resolve relative redirect targets
    import urlparse

    attempts = 0
    while attempts < 3:
        (scheme, rest) = urllib.splittype(url)
        (host, path) = urllib.splithost(rest)
        (host, port) = urllib.splitport(host)
        if port is None:
            if scheme == "https":
                port = 443
            else:
                port = 80

        conn = None
        try:
            if scheme == "http":
                conn = httplib.HTTPConnection("%s:%s" %(host, port))
            else:
                conn = httplib.HTTPSConnection("%s:%s" %(host, port))
            # a URL without a path ("http://host") must request "/"
            conn.request(method, path or "/")
            response = conn.getresponse()
            if response.status in [301, 302, 303, 307]:
                for header in response.getheaders():
                    if header[0] == "location":
                        # Location may be relative to the current URL
                        url = urlparse.urljoin(url, header[1])
            elif response.status == 200:
                # the caller reads from the response, so the connection
                # is deliberately left open here
                return response
        except (httplib.HTTPException, socket.error):
            # network trouble counts as a failed attempt
            pass

        # this attempt did not yield a usable response; release the socket
        if conn is not None:
            conn.close()
        attempts = attempts + 1
    return None
647
def parse_and_deliver(maildir, url, statedir):
    """Fetch the feed at *url* and write one mail per new item to *maildir*.

    Two dbm databases under *statedir* hold the state: ``feeds`` stores
    the validator headers (content-md5, etag, last-modified,
    content-length) of the last fetch of each feed, ``seen`` stores
    message-id, creation date and content md5 per item, keyed by
    ``url|link`` and ``url|guid``.

    A known feed is probed with a HEAD request first and skipped when
    none of the stored validators changed.  Items whose content md5 is
    already recorded are skipped; a changed item is delivered again with
    a ``References`` header naming the earlier message ids.  Messages are
    written to ``tmp`` and then linked into ``new``.  Both databases are
    closed on every exit path.
    """
    headers = None
    db = None
    feeddb = dbm.open(os.path.join(statedir, "feeds"), "c")
    try:
        # first check if we know about this feed already
        # (has_key is kept for dbm handles and feedparser items)
        if feeddb.has_key(url):
            data = cgi.parse_qs(feeddb[url])
            response = open_url("HEAD", url)
            ischanged = False
            if response:
                for header in response.getheaders():
                    if header[0] in ("content-length", "etag", \
                        "last-modified", "content-md5"):
                        try:
                            if header[1] != data[header[0]][0]:
                                ischanged = True
                        except KeyError:
                            # validator we have not stored before
                            ischanged = True
            else:
                # could not probe the feed, so try a full fetch
                ischanged = True
            if not ischanged:
                return # don't need to do anything, nothings changed.

        response = open_url("GET", url)
        if response is None:
            sys.stderr.write("Failed to fetch feed: %s\n" %(url))
            return
        headers = response.getheaders()

        fp = feedparser.parse(response)
        db = dbm.open(os.path.join(statedir, "seen"), "c")
        for item in fp["items"]:
            # have we seen it before?
            # need to work out what the content is first...

            if item.has_key("content"):
                content = item["content"][0]["value"]
            elif item.has_key("description"):
                content = item["description"]
            else:
                content = u''

            # make sure content is unicode before it is hashed: encoding
            # a non-ASCII byte string would raise UnicodeDecodeError
            if not isinstance(content, unicode):
                chrset = chardet.detect(content)['encoding'] or "utf-8"
                sys.stderr.write("detected charset %s for item %s\n" \
                    %(chrset, item["link"]))
                content = content.decode(chrset, "replace")

            md5sum = md5.md5(content.encode("utf-8")).hexdigest()

            prevmessageid = None

            db_guid_key = None
            db_link_key = (url + u'|' + item["link"]).encode("utf-8")

            # check if there's a guid too - if that exists and we match
            # the md5, skip the item
            if item.has_key("guid"):
                db_guid_key = (url + u'|' + item["guid"]).encode("utf-8")
                if db.has_key(db_guid_key):
                    data = cgi.parse_qs(db[db_guid_key])
                    if data["contentmd5"][0] == md5sum:
                        continue

            if db.has_key(db_link_key):
                data = cgi.parse_qs(db[db_link_key])
                if "message-id" in data:
                    prevmessageid = data["message-id"][0]
                if data["contentmd5"][0] == md5sum:
                    continue

            try:
                author = item["author"]
            except KeyError:
                author = url

            # create a basic email message
            msg = MIMEMultipart("alternative")
            messageid = "<" \
                + datetime.datetime.now().strftime("%Y%m%d%H%M") \
                + "." \
                + "".join( \
                    [random.choice( \
                        string.ascii_letters + string.digits \
                        ) for a in range(0,6) \
                    ]) + "@" + socket.gethostname() + ">"
            msg.add_header("Message-ID", messageid)
            msg.set_unixfrom("\"%s\" <rss2maildir@localhost>" %(url))
            msg.add_header("From", "\"%s\" <rss2maildir@localhost>" \
                %(author.encode("utf-8")))
            msg.add_header("To", "\"%s\" <rss2maildir@localhost>" \
                %(url.encode("utf-8")))
            if prevmessageid:
                msg.add_header("References", prevmessageid)
            # fall back to "now" when the feed gives no usable date
            createddate = datetime.datetime.now() \
                .strftime("%a, %e %b %Y %T -0000")
            try:
                createddate = datetime.datetime( \
                    *item["updated_parsed"][0:6]) \
                    .strftime("%a, %e %b %Y %T -0000")
            except (KeyError, TypeError, ValueError):
                pass
            msg.add_header("Date", createddate)
            msg.add_header("X-rss2maildir-rundate", \
                datetime.datetime.now() \
                .strftime("%a, %e %b %Y %T -0000"))
            subj_gen = HTML2Text()
            title = item["title"]
            title = re.sub(u'<', u'&lt;', title)
            title = re.sub(u'>', u'&gt;', title)
            subj_gen.feed(title)
            msg.add_header("Subject", subj_gen.gettext())
            msg.set_default_type("text/plain")

            htmlcontent = "%s\n\n<p>Item URL: <a href='%s'>%s</a></p>" %( \
                content, \
                item["link"], \
                item["link"] )
            htmlpart = MIMEText(htmlcontent.encode("utf-8"), "html", "utf-8")
            textparser = HTML2Text()
            textparser.feed(content)
            textcontent = textparser.gettext()
            textcontent = "%s\n\nItem URL: %s" %( \
                textcontent, \
                item["link"] )
            textpart = MIMEText(textcontent.encode("utf-8"), "plain", "utf-8")
            msg.attach(textpart)
            msg.attach(htmlpart)

            # start by working out the filename we should be writing to,
            # we do this following the normal maildir style rules
            fname = str(os.getpid()) \
                + "." + socket.gethostname() \
                + "." + "".join( \
                    [random.choice( \
                        string.ascii_letters + string.digits \
                        ) for a in range(0,10) \
                    ]) + "." \
                + datetime.datetime.now().strftime('%s')
            fn = os.path.join(maildir, "tmp", fname)
            fh = open(fn, "w")
            try:
                fh.write(msg.as_string())
            finally:
                fh.close()
            # now move it in to the new directory
            newfn = os.path.join(maildir, "new", fname)
            os.link(fn, newfn)
            os.unlink(fn)

            # now add to the database about the item
            if prevmessageid:
                messageid = prevmessageid + " " + messageid
            data = urllib.urlencode(( \
                ("message-id", messageid), \
                ("created", createddate), \
                ("contentmd5", md5sum) \
                ))
            if item.has_key("guid") and item["guid"] != item["link"]:
                db[db_guid_key] = data
                # Keep the creation date and md5 already stored for the
                # link, updating only the message-id; without a usable
                # link record the new record is stored instead.
                linkdata = data
                if db.has_key(db_link_key):
                    try:
                        old = cgi.parse_qs(db[db_link_key])
                        linkdata = urllib.urlencode(( \
                            ("message-id", messageid), \
                            ("created", old["created"][0]), \
                            ("contentmd5", old["contentmd5"][0]) \
                            ))
                    except KeyError:
                        linkdata = data
                db[db_link_key] = linkdata
            else:
                db[db_link_key] = data

        # remember the validators of this fetch for the next HEAD probe
        if headers:
            data = []
            for header in headers:
                if header[0] in \
                    ["content-md5", "etag", "last-modified", "content-length"]:
                    data.append((header[0], header[1]))
            if len(data) > 0:
                data = urllib.urlencode(data)
                feeddb[url] = data
    finally:
        if db is not None:
            db.close()
        feeddb.close()
857
def _fatal(message, code=1):
    """Write message to stderr and end the program with exit status code."""
    sys.stderr.write(message)
    sys.exit(code)


def _find_config_file(requested):
    """Return the path of the configuration file to use.

    requested is the value of the -c/--conf option, or None when the option
    was not given.  An explicitly requested file must exist.  Without one,
    $HOME/.rss2maildir.conf and then /etc/rss2maildir.conf are tried in that
    order.  The program exits with status 2 when no file can be found.
    """
    if requested is not None:
        if not os.path.exists(requested):
            _fatal( \
                "Config file %s does not exist. Exiting.\n" %(requested,), 2)
        return requested

    candidates = []
    home = os.environ.get("HOME")
    if home is not None:
        # an unset HOME simply means there is no per-user config to look at
        candidates.append("%s/.rss2maildir.conf" %(home,))
    candidates.append("/etc/rss2maildir.conf")

    for candidate in candidates:
        if os.path.exists(candidate):
            return candidate

    _fatal("No config file found. Exiting.\n", 2)


def _require_directory(path, not_dir_message, create_failed_message):
    """Make sure path is a directory, creating it when it is missing.

    Both messages are format strings taking the path as their only argument.
    not_dir_message is used when path exists but is not a directory,
    create_failed_message when the directory could not be created.  Either
    failure ends the program with exit status 1.
    """
    if os.path.isdir(path):
        return
    if os.path.exists(path):
        _fatal(not_dir_message %(path,))
    try:
        os.mkdir(path)
    except OSError:
        _fatal(create_failed_message %(path,))


def _prepare_maildir(maildir, section):
    """Make sure maildir exists and holds "new", "cur" and "tmp" directories.

    A missing maildir, or a missing subdirectory, is created; failing to do
    so ends the program with exit status 1 (section is only used in that
    error message).  Anything that exists but is not a directory is reported
    on stderr as a broken maildir without stopping the program.
    """
    if os.path.exists(maildir):
        if not os.path.isdir(maildir):
            sys.stderr.write("Broken maildir: %s\n" %(maildir,))
            return
    else:
        try:
            os.mkdir(maildir)
        except OSError:
            _fatal("Couldn't create root maildir %s\n" %(maildir,))

    for name in ("new", "cur", "tmp"):
        subdir = os.path.join(maildir, name)
        if os.path.isdir(subdir):
            continue
        if os.path.exists(subdir):
            # present but not a directory: warn for every subdirectory alike
            sys.stderr.write("Broken maildir: %s\n" %(maildir,))
            continue
        try:
            os.mkdir(subdir)
        except OSError:
            _fatal( \
                "Couldn't create required maildir directories for %s\n" \
                %(section,))


if __name__ == "__main__":
    # This only gets executed if we really called the program.
    # The names bound below are deliberately left at module level, as code
    # elsewhere in the file may read them.

    # first off, parse the command line arguments
    oparser = OptionParser()
    oparser.add_option(
        "-c", "--conf", dest="conf",
        help="location of config file"
        )
    oparser.add_option(
        "-s", "--statedir", dest="statedir",
        help="location of directory to store state in"
        )

    (options, args) = oparser.parse_args()

    # locate the config file (exits with status 2 if there is none)
    configfile = _find_config_file(options.conf)

    # Right - if we've got this far, we've got a config file, now for the hard
    # bits...

    scp = SafeConfigParser()
    scp.read(configfile)

    maildir_root = "RSSMaildir"
    state_dir = "state"

    # the state directory comes from, in order of preference: the command
    # line, the [general] section of the config file, the built-in default
    if options.statedir is not None:
        state_dir = options.statedir
        _require_directory(
            state_dir,
            "State directory (%s) is not a directory\n",
            "Couldn't create statedir %s\n"
            )
    elif scp.has_option("general", "state_dir"):
        state_dir = scp.get("general", "state_dir")
        _require_directory(
            state_dir,
            "State directory (%s) is not a directory\n",
            "Couldn't create state directory %s\n"
            )
    else:
        _require_directory(
            state_dir,
            "State directory %s is not a directory\n",
            "State directory %s could not be created\n"
            )

    if scp.has_option("general", "maildir_root"):
        maildir_root = scp.get("general", "maildir_root")

    _require_directory(
        maildir_root,
        "Maildir Root %s is not a directory\n",
        "Couldn't create Maildir Root %s\n"
        )

    # every section apart from [general] describes one feed
    feeds = scp.sections()
    if "general" in feeds:
        feeds.remove("general")

    for section in feeds:
        # the folder name defaults to the section name when the section has
        # no usable "maildir" option
        try:
            maildir = scp.get(section, "maildir")
        except Exception:
            maildir = section

        # URL-quote the name so it is safe as a single path component
        maildir = urllib.quote_plus(str(maildir))
        maildir = os.path.join(maildir_root, maildir)

        # NOTE(review): a broken maildir is only reported, the feed is still
        # handed to parse_and_deliver - confirm that this is intended
        _prepare_maildir(maildir, section)

        # right - we've got the directories, we've got the section, we know the
        # url... lets play!

        parse_and_deliver(maildir, section, state_dir)