]> git.sommitrealweird.co.uk Git - rss2maildir.git/blob - rss2maildir.py
More utf-8 handling for images
[rss2maildir.git] / rss2maildir.py
1 #!/usr/bin/python
2 # coding=utf-8
3
4 # rss2maildir.py - RSS feeds to Maildir 1 email per item
5 # Copyright (C) 2007  Brett Parker <iDunno@sommitrealweird.co.uk>
6
7 # This program is free software: you can redistribute it and/or modify
8 # it under the terms of the GNU General Public License as published by
9 # the Free Software Foundation, either version 3 of the License, or
10 # (at your option) any later version.
11
12 # This program is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 # GNU General Public License for more details.
16
17 # You should have received a copy of the GNU General Public License
18 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
19
20 import sys
21 import os
22 import stat
23 import httplib
24 import urllib
25
26 import feedparser
27
28 from email.MIMEMultipart import MIMEMultipart
29 from email.MIMEText import MIMEText
30
31 import datetime
32 import random
33 import string
34 import textwrap
35
36 import socket
37
38 from optparse import OptionParser
39 from ConfigParser import SafeConfigParser
40
41 from base64 import b64encode
42
43 if sys.version_info[0] == 2 and sys.version_info[1] >= 6:
44     import hashlib as md5
45 else:
46     import md5
47
48 import cgi
49 import dbm
50
51 import re
52
53 from HTMLParser import HTMLParser
54
55 class HTML2Text(HTMLParser):
56     entities = {
57         u'amp': u'&',
58         u'lt': u'<',
59         u'gt': u'>',
60         u'pound': u'£',
61         u'copy': u'©',
62         u'apos': u'\'',
63         u'quot': u'"',
64         u'nbsp': u' ',
65         u'ldquo': u'“',
66         u'rdquo': u'”',
67         u'lsquo': u'‘',
68         u'rsquo': u'’',
69         u'laquo': u'«',
70         u'raquo': u'»',
71         u'lsaquo': u'‹',
72         u'rsaquo': u'›',
73         u'bull': u'•',
74         u'middot': u'·',
75         u'deg': u'°',
76         u'helip': u'…',
77         u'trade': u'™',
78         u'reg': u'®',
79         u'agrave': u'à',
80         u'Agrave': u'À',
81         u'egrave': u'è',
82         u'Egrave': u'È',
83         u'igrave': u'ì',
84         u'Igrave': u'Ì',
85         u'ograve': u'ò',
86         u'Ograve': u'Ò',
87         u'ugrave': u'ù',
88         u'Ugrave': u'Ù',
89         u'aacute': u'á',
90         u'Aacute': u'Á',
91         u'eacute': u'é',
92         u'Eacute': u'É',
93         u'iacute': u'í',
94         u'Iacute': u'Í',
95         u'oacute': u'ó',
96         u'Oacute': u'Ó',
97         u'uacute': u'ú',
98         u'Uacute': u'Ú',
99         u'yactue': u'ý',
100         u'Yacute': u'Ý',
101         u'acirc': u'â',
102         u'Acirc': u'Â',
103         u'ecirc': u'ê',
104         u'Ecirc': u'Ê',
105         u'icirc': u'î',
106         u'Icirc': u'Î',
107         u'ocirc': u'ô',
108         u'Ocirc': u'Ô',
109         u'ucirc': u'û',
110         u'Ucirc': u'Û',
111         u'atilde': u'ã',
112         u'Atilde': u'Ã',
113         u'ntilde': u'ñ',
114         u'Ntilde': u'Ñ',
115         u'otilde': u'õ',
116         u'Otilde': u'Õ',
117         u'auml': u'ä',
118         u'Auml': u'Ä',
119         u'euml': u'ë',
120         u'Euml': u'Ë',
121         u'iuml': u'ï',
122         u'Iuml': u'Ï',
123         u'ouml': u'ö',
124         u'Ouml': u'Ö',
125         u'uuml': u'ü',
126         u'Uuml': u'Ü',
127         u'yuml': u'ÿ',
128         u'Yuml': u'Ÿ',
129         u'iexcl': u'¡',
130         u'iquest': u'¿',
131         u'ccedil': u'ç',
132         u'Ccedil': u'Ç',
133         u'oelig': u'œ',
134         u'OElig': u'Œ',
135         u'szlig': u'ß',
136         u'oslash': u'ø',
137         u'Oslash': u'Ø',
138         u'aring': u'å',
139         u'Aring': u'Å',
140         u'aelig': u'æ',
141         u'AElig': u'Æ',
142         u'thorn': u'þ',
143         u'THORN': u'Þ',
144         u'eth': u'ð',
145         u'ETH': u'Ð',
146         u'mdash': u'—',
147         u'ndash': u'–',
148         u'sect': u'§',
149         u'para': u'¶',
150         u'uarr': u'↑',
151         u'darr': u'↓',
152         u'larr': u'←',
153         u'rarr': u'→',
154         u'dagger': u'†',
155         u'Dagger': u'‡',
156         u'permil': u'‰',
157         u'prod': u'∏',
158         u'infin': u'∞',
159         u'radic': u'√',
160         u'there4': u'∴',
161         u'int': u'∫',
162         u'asymp': u'≈',
163         u'ne': u'≠',
164         u'equiv': '≡',
165         u'le': u'≤',
166         u'ge': u'≥',
167         u'loz': u'⋄',
168         u'sum': u'∑',
169         u'part': u'∂',
170         u'prime': u'′',
171         u'Prime': u'″',
172         u'harr': u'↔',
173         u'micro': u'µ',
174         u'not': u'¬',
175         u'plusmn': u'±',
176         u'divide': u'÷',
177         u'cent': u'¢',
178         u'euro': u'€',
179         }
180
181     blockleveltags = [
182         u'h1',
183         u'h2',
184         u'h3',
185         u'h4',
186         u'h5',
187         u'h6',
188         u'pre',
189         u'p',
190         u'ul',
191         u'ol',
192         u'dl',
193         u'li',
194         u'dt',
195         u'dd',
196         u'div',
197         u'blockquote',
198         ]
199
200     liststarttags = [
201         u'ul',
202         u'ol',
203         u'dl',
204         ]
205
206     cancontainflow = [
207         u'div',
208         u'li',
209         u'dd',
210         u'blockquote',
211     ]
212
213     def __init__(self,textwidth=70):
214         self.text = u''
215         self.curdata = u''
216         self.textwidth = textwidth
217         self.opentags = []
218         self.indentlevel = 0
219         self.ignorenodata = False
220         self.listcount = []
221         self.urls = []
222         self.images = {}
223         HTMLParser.__init__(self)
224
225     def handle_starttag(self, tag, attrs):
226         tag_name = tag.lower()
227         if tag_name in self.blockleveltags:
228             # handle starting a new block - unless we're in a block element
229             # that can contain other blocks, we'll assume that we want to close
230             # the container
231             if len(self.opentags) > 1 and self.opentags[-1] == u'li':
232                 self.handle_curdata()
233
234             if tag_name == u'ol':
235                 self.handle_curdata()
236                 self.listcount.append(1)
237                 self.listlevel = len(self.listcount) - 1
238
239             if tag_name == u'dl':
240                 self.indentlevel = self.indentlevel + 4
241
242             if tag_name in self.liststarttags:
243                 smallist = self.opentags[-3:-1]
244                 smallist.reverse()
245                 for prev_listtag in smallist:
246                     if prev_listtag in [u'dl', u'ol']:
247                         self.indentlevel = self.indentlevel + 4
248                         break
249                     elif prev_listtag == u'ul':
250                         self.indentlevel = self.indentlevel + 3
251                         break
252
253             if len(self.opentags) > 0:
254                 self.handle_curdata()
255                 if tag_name not in self.cancontainflow:
256                     self.opentags.pop()
257             self.opentags.append(tag_name)
258         else:
259             if tag_name == "span":
260                 return
261             listcount = 0
262             try:
263                 listcount = self.listcount[-1]
264             except:
265                 pass
266
267             if tag_name == u'dd' and len(self.opentags) > 1 \
268                 and self.opentags[-1] == u'dt':
269                 self.handle_curdata()
270                 self.opentags.pop()
271             elif tag_name == u'dt' and len(self.opentags) > 1 \
272                 and self.opentags[-1] == u'dd':
273                 self.handle_curdata()
274                 self.opentags.pop()
275             elif tag_name == u'a':
276                 for attr in attrs:
277                     if attr[0].lower() == u'href':
278                         self.urls.append(attr[1].decode('utf-8'))
279                 self.curdata = self.curdata + u'`'
280                 self.opentags.append(tag_name)
281                 return
282             elif tag_name == u'img':
283                 self.handle_image(attrs)
284                 return
285             elif tag_name == u'br':
286                 self.handle_br()
287                 return
288             else:
289                 # we don't know the tag, so lets avoid handling it!
290                 return 
291
292     def handle_startendtag(self, tag, attrs):
293         if tag.lower() == u'br':
294             self.handle_br()
295         elif tag.lower() == u'img':
296             self.handle_image(attrs)
297             return
298
299     def handle_br(self):
300             self.handle_curdata()
301             self.opentags.append(u'br')
302             self.handle_curdata()
303             self.opentags.pop()
304
305     def handle_image(self, attrs):
306         alt = u''
307         url = u''
308         for attr in attrs:
309             if attr[0] == 'alt':
310                 if isinstance(attr[1], str):
311                     alt = u'%s' %(attr[1].decode("utf-8"))
312                 else:
313                     alt = attr[1]
314             elif attr[0] == 'src':
315                 if isinstance(attr[1], str):
316                     url = u'%s' %(attr[1].decode("utf-8"))
317                 else:
318                     url = attr[1]
319         if url:
320             if alt:
321                 if self.images.has_key(alt):
322                     if self.images[alt]["url"] == url:
323                         self.curdata = self.curdata \
324                             + u'|%s|' %(alt,)
325                     else:
326                         while self.images.has_key(alt):
327                             alt = alt + "_"
328                         self.images[alt] = {"url": url}
329                         self.curdata = self.curdata \
330                             + u'|%s|' %(alt,)
331                 else:
332                     self.images[alt] = {"url": url}
333                     self.curdata = self.curdata \
334                         + u'|%s|' %(alt,)
335             else:
336                 if self.images.has_key(url):
337                     self.curdata = self.curdata \
338                         + u'|%s|' %(url,)
339                 else:
340                     self.images[url] = {}
341                     self.images[url]["url"] =url
342                     self.curdata = self.curdata \
343                         + u'|%s|' %(url,)
344
345     def handle_curdata(self):
346
347         if len(self.opentags) == 0:
348             return
349
350         tag_thats_done = self.opentags[-1]
351
352         if len(self.curdata) == 0:
353             return
354
355         if tag_thats_done == u'br':
356             if len(self.text) == 0 or self.text[-1] != '\n':
357                 self.text = self.text + '\n'
358                 self.ignorenodata = True
359             return
360
361         if len(self.curdata.strip()) == 0:
362             return
363
364         if tag_thats_done in self.blockleveltags:
365             newlinerequired = self.text != u''
366             if self.ignorenodata:
367                 newlinerequired = False
368             self.ignorenodata = False
369             if newlinerequired:
370                 if tag_thats_done in [u'dt', u'dd', u'li'] \
371                     and len(self.text) > 1 \
372                     and self.text[-1] != u'\n':
373                         self.text = self.text + u'\n'
374                 elif len(self.text) > 2 \
375                     and self.text[-1] != u'\n' \
376                     and self.text[-2] != u'\n':
377                     self.text = self.text + u'\n\n'
378
379         if tag_thats_done in ["h1", "h2", "h3", "h4", "h5", "h6"]:
380             underline = u''
381             underlinechar = u'='
382             headingtext = " ".join(self.curdata.split())
383             seperator = u'\n' + u' '*self.indentlevel
384             headingtext = seperator.join( \
385                 textwrap.wrap( \
386                     headingtext, \
387                     self.textwidth - self.indentlevel \
388                     ) \
389                 )
390
391             if tag_thats_done == u'h2':
392                 underlinechar = u'-'
393             elif tag_thats_done != u'h1':
394                 underlinechar = u'~'
395
396             if u'\n' in headingtext:
397                 underline = u' ' * self.indentlevel \
398                     + underlinechar * (self.textwidth - self.indentlevel)
399             else:
400                 underline = u' ' * self.indentlevel \
401                     + underlinechar * len(headingtext)
402             self.text = self.text \
403                 + headingtext + u'\n' \
404                 + underline
405         elif tag_thats_done in [u'p', u'div']:
406             paragraph = unicode( \
407                 " ".join(self.curdata.strip().encode("utf-8").split()), \
408                 "utf-8")
409             seperator = u'\n' + u' ' * self.indentlevel
410             self.text = self.text \
411                 + u' ' * self.indentlevel \
412                 + seperator.join( \
413                     textwrap.wrap( \
414                         paragraph, self.textwidth - self.indentlevel))
415         elif tag_thats_done == "pre":
416             self.text = self.text + unicode( \
417                 self.curdata.encode("utf-8"), "utf-8")
418         elif tag_thats_done == u'blockquote':
419             quote = unicode( \
420                 " ".join(self.curdata.encode("utf-8").strip().split()), \
421                 "utf-8")
422             seperator = u'\n' + u' ' * self.indentlevel + u'    '
423             if len(self.text) > 0 and self.text[-1] != u'\n':
424                 self.text = self.text + u'\n'
425             self.text = self.text \
426                 + u'    ' \
427                 + seperator.join( \
428                     textwrap.wrap( \
429                         quote, \
430                         self.textwidth - self.indentlevel - 2 \
431                     )
432                 )
433             self.curdata = u''
434         elif tag_thats_done == "li":
435             item = unicode(self.curdata.encode("utf-8").strip(), "utf-8")
436             if len(self.text) > 0 and self.text[-1] != u'\n':
437                 self.text = self.text + u'\n'
438             # work out if we're in an ol rather than a ul
439             latesttags = self.opentags[-4:]
440             latesttags.reverse()
441             isul = None
442             for thing in latesttags:
443                 if thing == 'ul':
444                     isul = True
445                     break
446                 elif thing == 'ol':
447                     isul = False
448                     break
449
450             listindent = 3
451             if not isul:
452                 listindent = 4
453
454             listmarker = u' * '
455             if isul == False:
456                 listmarker = u' %2d. ' %(self.listcount[-1])
457                 self.listcount[-1] = self.listcount[-1] + 1
458
459             seperator = u'\n' \
460                 + u' ' * self.indentlevel \
461                 + u' ' * listindent
462             self.text = self.text \
463                 + u' ' * self.indentlevel \
464                 + listmarker \
465                 + seperator.join( \
466                     textwrap.wrap( \
467                         item, \
468                         self.textwidth - self.indentlevel - listindent \
469                     ) \
470                 )
471             self.curdata = u''
472         elif tag_thats_done == u'dt':
473             definition = unicode(" ".join( \
474                     self.curdata.encode("utf-8").strip().split()), \
475                 "utf-8")
476             if len(self.text) > 0 and self.text[-1] != u'\n':
477                 self.text = self.text + u'\n\n'
478             elif len(self.text) > 1 and self.text[-2] != u'\n':
479                 self.text = self.text + u'\n'
480             definition = u' ' * (self.indentlevel - 4) + definition + "::"
481             indentstring = u'\n' + u' ' * (self.indentlevel - 3)
482             self.text = self.text \
483                 + indentstring.join(
484                     textwrap.wrap(definition, \
485                         self.textwidth - self.indentlevel - 4))
486             self.curdata = u''
487         elif tag_thats_done == u'dd':
488             definition = unicode(" ".join( \
489                     self.curdata.encode("utf-8").strip().split()),
490                 "utf-8")
491             if len(definition) > 0:
492                 if len(self.text) > 0 and self.text[-1] != u'\n':
493                     self.text = self.text + u'\n'
494                 indentstring = u'\n' + u' ' * self.indentlevel
495                 self.text = self.text \
496                     + indentstring \
497                     + indentstring.join( \
498                         textwrap.wrap( \
499                             definition, \
500                             self.textwidth - self.indentlevel \
501                             ) \
502                         )
503                 self.curdata = u''
504         elif tag_thats_done == u'a':
505             self.curdata = self.curdata + u'`__'
506             pass
507         elif tag_thats_done in self.liststarttags:
508             pass
509
510         if tag_thats_done in self.blockleveltags:
511             self.curdata = u''
512
513         self.ignorenodata = False
514
515     def handle_endtag(self, tag):
516         self.ignorenodata = False
517         if tag == "span":
518             return
519
520         try:
521             tagindex = self.opentags.index(tag)
522         except:
523             return
524         tag = tag.lower()
525
526         if tag in [u'br', u'img']:
527             return
528
529         if tag == u'dl':
530             self.indentlevel = self.indentlevel - 4
531
532         if tag in self.liststarttags:
533             if tag in [u'ol', u'dl', u'ul', u'dd']:
534                 self.handle_curdata()
535                 # find if there was a previous list level
536                 smalllist = self.opentags[:-1]
537                 smalllist.reverse()
538                 for prev_listtag in smalllist:
539                     if prev_listtag in [u'ol', u'dl']:
540                         self.indentlevel = self.indentlevel - 4
541                         break
542                     elif prev_listtag == u'ul':
543                         self.indentlevel = self.indentlevel - 3
544                         break
545
546         if tag == u'ol':
547             self.listcount = self.listcount[:-1]
548
549         while tagindex < len(self.opentags) \
550             and tag in self.opentags[tagindex+1:]:
551             try:
552                 tagindex = self.opentags.index(tag, tagindex+1)
553             except:
554                 # well, we don't want to do that then
555                 pass
556         if tagindex != len(self.opentags) - 1:
557             # Assuming the data was for the last opened tag first
558             self.handle_curdata()
559             # Now kill the list to be a slice before this tag was opened
560             self.opentags = self.opentags[:tagindex + 1]
561         else:
562             self.handle_curdata()
563             if self.opentags[-1] == tag:
564                 self.opentags.pop()
565
566     def handle_data(self, data):
567         if len(self.opentags) == 0:
568             self.opentags.append(u'p')
569         self.curdata = self.curdata + data.decode("utf-8")
570
571     def handle_charref(self, name):
572         try:
573             entity = unichr(int(name))
574         except:
575             if name[0] == 'x':
576                 try:
577                     entity = unichr(int('0%s' %(name,), 16))
578                 except:
579                     entity = u'#%s' %(name,)
580             else:
581                 entity = u'#%s' %(name,)
582         self.curdata = self.curdata + unicode(entity.encode('utf-8'), \
583             "utf-8")
584
585     def handle_entityref(self, name):
586         entity = name
587         if HTML2Text.entities.has_key(name):
588             entity = HTML2Text.entities[name]
589         else:
590             entity = "&" + name + ";"
591
592         self.curdata = self.curdata + unicode(entity.encode('utf-8'), \
593             "utf-8")
594
595     def gettext(self):
596         self.handle_curdata()
597         if len(self.text) == 0 or self.text[-1] != u'\n':
598             self.text = self.text + u'\n'
599         self.opentags = []
600         if len(self.text) > 0:
601             while len(self.text) > 1 and self.text[-1] == u'\n':
602                 self.text = self.text[:-1]
603             self.text = self.text + u'\n'
604         if len(self.urls) > 0:
605             self.text = self.text + u'\n__ ' + u'\n__ '.join(self.urls) + u'\n'
606             self.urls = []
607         if len(self.images.keys()) > 0:
608             self.text = self.text + u'\n.. ' \
609                 + u'\n.. '.join( \
610                     ["|%s| image:: %s" %(a, self.images[a]["url"]) \
611                 for a in self.images.keys()]) + u'\n'
612             self.images = {}
613         return self.text
614
615 def open_url(method, url):
616     redirectcount = 0
617     while redirectcount < 3:
618         (type, rest) = urllib.splittype(url)
619         (host, path) = urllib.splithost(rest)
620         (host, port) = urllib.splitport(host)
621         if type == "https":
622             if port == None:
623                 port = 443
624         elif port == None:
625             port = 80
626         try:
627             conn = None
628             if type == "http":
629                 conn = httplib.HTTPConnection("%s:%s" %(host, port))
630             else:
631                 conn = httplib.HTTPSConnection("%s:%s" %(host, port))
632             conn.request(method, path)
633             response = conn.getresponse()
634             if response.status in [301, 302, 303, 307]:
635                 headers = response.getheaders()
636                 for header in headers:
637                     if header[0] == "location":
638                         url = header[1]
639             elif response.status == 200:
640                 return response
641         except:
642             pass
643         redirectcount = redirectcount + 1
644     return None
645
646 def parse_and_deliver(maildir, url, statedir):
647     feedhandle = None
648     headers = None
649     # first check if we know about this feed already
650     feeddb = dbm.open(os.path.join(statedir, "feeds"), "c")
651     if feeddb.has_key(url):
652         data = feeddb[url]
653         data = cgi.parse_qs(data)
654         response = open_url("HEAD", url)
655         headers = None
656         if response:
657             headers = response.getheaders()
658         ischanged = False
659         try:
660             for header in headers:
661                 if header[0] == "content-length":
662                     if header[1] != data["content-length"][0]:
663                         ischanged = True
664                 elif header[0] == "etag":
665                     if header[1] != data["etag"][0]:
666                         ischanged = True
667                 elif header[0] == "last-modified":
668                     if header[1] != data["last-modified"][0]:
669                         ischanged = True
670                 elif header[0] == "content-md5":
671                     if header[1] != data["content-md5"][0]:
672                         ischanged = True
673         except:
674             ischanged = True
675         if ischanged:
676             response = open_url("GET", url)
677             if response != None:
678                 headers = response.getheaders()
679                 feedhandle = response
680             else:
681                 sys.stderr.write("Failed to fetch feed: %s\n" %(url))
682                 return
683         else:
684             return # don't need to do anything, nothings changed.
685     else:
686         response = open_url("GET", url)
687         if response != None:
688             headers = response.getheaders()
689             feedhandle = response
690         else:
691             sys.stderr.write("Failed to fetch feed: %s\n" %(url))
692             return
693
694     fp = feedparser.parse(feedhandle)
695     db = dbm.open(os.path.join(statedir, "seen"), "c")
696     for item in fp["items"]:
697         # have we seen it before?
698         # need to work out what the content is first...
699
700         if item.has_key("content"):
701             content = item["content"][0]["value"]
702         else:
703             if item.has_key("description"):
704                 content = item["description"]
705             else:
706                 content = u''
707
708         md5sum = md5.md5(content.encode("utf-8")).hexdigest()
709
710         prevmessageid = None
711
712         db_guid_key = None
713         db_link_key = (url + u'|' + item["link"]).encode("utf-8")
714
715         # check if there's a guid too - if that exists and we match the md5,
716         # return
717         if item.has_key("guid"):
718             db_guid_key = (url + u'|' + item["guid"]).encode("utf-8")
719             if db.has_key(db_guid_key):
720                 data = db[db_guid_key]
721                 data = cgi.parse_qs(data)
722                 if data["contentmd5"][0] == md5sum:
723                     continue
724
725         if db.has_key(db_link_key):
726             data = db[db_link_key]
727             data = cgi.parse_qs(data)
728             if data.has_key("message-id"):
729                 prevmessageid = data["message-id"][0]
730             if data["contentmd5"][0] == md5sum:
731                 continue
732
733         try:
734             author = item["author"]
735         except:
736             author = url
737
738         # create a basic email message
739         msg = MIMEMultipart("alternative")
740         messageid = "<" \
741             + datetime.datetime.now().strftime("%Y%m%d%H%M") \
742             + "." \
743             + "".join( \
744                 [random.choice( \
745                     string.ascii_letters + string.digits \
746                     ) for a in range(0,6) \
747                 ]) + "@" + socket.gethostname() + ">"
748         msg.add_header("Message-ID", messageid)
749         msg.set_unixfrom("\"%s\" <rss2maildir@localhost>" %(url))
750         msg.add_header("From", "\"%s\" <rss2maildir@localhost>" %(author.encode("utf-8")))
751         msg.add_header("To", "\"%s\" <rss2maildir@localhost>" %(url.encode("utf-8")))
752         if prevmessageid:
753             msg.add_header("References", prevmessageid)
754         createddate = datetime.datetime.now() \
755             .strftime("%a, %e %b %Y %T -0000")
756         try:
757             createddate = datetime.datetime(*item["updated_parsed"][0:6]) \
758                 .strftime("%a, %e %b %Y %T -0000")
759         except:
760             pass
761         msg.add_header("Date", createddate)
762         msg.add_header("X-rss2maildir-rundate", datetime.datetime.now() \
763             .strftime("%a, %e %b %Y %T -0000"))
764         subj_gen = HTML2Text()
765         title = item["title"]
766         title = re.sub(u'<', u'&lt;', title)
767         title = re.sub(u'>', u'&gt;', title)
768         subj_gen.feed(title.encode("utf-8"))
769         msg.add_header("Subject", subj_gen.gettext())
770         msg.set_default_type("text/plain")
771
772         htmlcontent = content.encode("utf-8")
773         htmlcontent = "%s\n\n<p>Item URL: <a href='%s'>%s</a></p>" %( \
774             content, \
775             item["link"], \
776             item["link"] )
777         htmlpart = MIMEText(htmlcontent.encode("utf-8"), "html", "utf-8")
778         textparser = HTML2Text()
779         textparser.feed(content.encode("utf-8"))
780         textcontent = textparser.gettext()
781         textcontent = "%s\n\nItem URL: %s" %( \
782             textcontent, \
783             item["link"] )
784         textpart = MIMEText(textcontent.encode("utf-8"), "plain", "utf-8")
785         msg.attach(textpart)
786         msg.attach(htmlpart)
787
788         # start by working out the filename we should be writting to, we do
789         # this following the normal maildir style rules
790         fname = str(os.getpid()) \
791             + "." + socket.gethostname() \
792             + "." + "".join( \
793                 [random.choice( \
794                     string.ascii_letters + string.digits \
795                     ) for a in range(0,10) \
796                 ]) + "." \
797             + datetime.datetime.now().strftime('%s')
798         fn = os.path.join(maildir, "tmp", fname)
799         fh = open(fn, "w")
800         fh.write(msg.as_string())
801         fh.close()
802         # now move it in to the new directory
803         newfn = os.path.join(maildir, "new", fname)
804         os.link(fn, newfn)
805         os.unlink(fn)
806
807         # now add to the database about the item
808         if prevmessageid:
809             messageid = prevmessageid + " " + messageid
810         if item.has_key("guid") and item["guid"] != item["link"]:
811             data = urllib.urlencode(( \
812                 ("message-id", messageid), \
813                 ("created", createddate), \
814                 ("contentmd5", md5sum) \
815                 ))
816             db[db_guid_key] = data
817             try:
818                 data = db[db_link_key]
819                 data = cgi.parse_qs(data)
820                 newdata = urllib.urlencode(( \
821                     ("message-id", messageid), \
822                     ("created", data["created"][0]), \
823                     ("contentmd5", data["contentmd5"][0]) \
824                     ))
825                 db[db_link_key] = newdata
826             except:
827                 db[db_link_key] = data
828         else:
829             data = urllib.urlencode(( \
830                 ("message-id", messageid), \
831                 ("created", createddate), \
832                 ("contentmd5", md5sum) \
833                 ))
834             db[db_link_key] = data
835
836     if headers:
837         data = []
838         for header in headers:
839             if header[0] in \
840                 ["content-md5", "etag", "last-modified", "content-length"]:
841                 data.append((header[0], header[1]))
842         if len(data) > 0:
843             data = urllib.urlencode(data)
844             feeddb[url] = data
845
846     db.close()
847     feeddb.close()
848
849 if __name__ == "__main__":
850     # This only gets executed if we really called the program
851     # first off, parse the command line arguments
852
853     oparser = OptionParser()
854     oparser.add_option(
855         "-c", "--conf", dest="conf",
856         help="location of config file"
857         )
858     oparser.add_option(
859         "-s", "--statedir", dest="statedir",
860         help="location of directory to store state in"
861         )
862
863     (options, args) = oparser.parse_args()
864
865     # check for the configfile
866
867     configfile = None
868
869     if options.conf != None:
870         # does the file exist?
871         try:
872             os.stat(options.conf)
873             configfile = options.conf
874         except:
875             # should exit here as the specified file doesn't exist
876             sys.stderr.write( \
877                 "Config file %s does not exist. Exiting.\n" %(options.conf,))
878             sys.exit(2)
879     else:
880         # check through the default locations
881         try:
882             os.stat("%s/.rss2maildir.conf" %(os.environ["HOME"],))
883             configfile = "%s/.rss2maildir.conf" %(os.environ["HOME"],)
884         except:
885             try:
886                 os.stat("/etc/rss2maildir.conf")
887                 configfile = "/etc/rss2maildir.conf"
888             except:
889                 sys.stderr.write("No config file found. Exiting.\n")
890                 sys.exit(2)
891
892     # Right - if we've got this far, we've got a config file, now for the hard
893     # bits...
894
895     scp = SafeConfigParser()
896     scp.read(configfile)
897
898     maildir_root = "RSSMaildir"
899     state_dir = "state"
900
901     if options.statedir != None:
902         state_dir = options.statedir
903         try:
904             mode = os.stat(state_dir)[stat.ST_MODE]
905             if not stat.S_ISDIR(mode):
906                 sys.stderr.write( \
907                     "State directory (%s) is not a directory\n" %(state_dir))
908                 sys.exit(1)
909         except:
910             # try to make the directory
911             try:
912                 os.mkdir(state_dir)
913             except:
914                 sys.stderr.write("Couldn't create statedir %s" %(state_dir))
915                 sys.exit(1)
916     elif scp.has_option("general", "state_dir"):
917         new_state_dir = scp.get("general", "state_dir")
918         try:
919             mode = os.stat(new_state_dir)[stat.ST_MODE]
920             if not stat.S_ISDIR(mode):
921                 sys.stderr.write( \
922                     "State directory (%s) is not a directory\n" %(state_dir))
923                 sys.exit(1)
924             else:
925                 state_dir = new_state_dir
926         except:
927             # try to create it
928             try:
929                 os.mkdir(new_state_dir)
930                 state_dir = new_state_dir
931             except:
932                 sys.stderr.write( \
933                     "Couldn't create state directory %s\n" %(new_state_dir))
934                 sys.exit(1)
935     else:
936         try:
937             mode = os.stat(state_dir)[stat.ST_MODE]
938             if not stat.S_ISDIR(mode):
939                 sys.stderr.write( \
940                     "State directory %s is not a directory\n" %(state_dir))
941                 sys.exit(1)
942         except:
943             try:
944                 os.mkdir(state_dir)
945             except:
946                 sys.stderr.write( \
947                     "State directory %s could not be created\n" %(state_dir))
948                 sys.exit(1)
949
950     if scp.has_option("general", "maildir_root"):
951         maildir_root = scp.get("general", "maildir_root")
952
953     try:
954         mode = os.stat(maildir_root)[stat.ST_MODE]
955         if not stat.S_ISDIR(mode):
956             sys.stderr.write( \
957                 "Maildir Root %s is not a directory\n" \
958                 %(maildir_root))
959             sys.exit(1)
960     except:
961         try:
962             os.mkdir(maildir_root)
963         except:
964             sys.stderr.write("Couldn't create Maildir Root %s\n" \
965                 %(maildir_root))
966             sys.exit(1)
967
968     feeds = scp.sections()
969     try:
970         feeds.remove("general")
971     except:
972         pass
973
974     for section in feeds:
975         # check if the directory exists
976         maildir = None
977         try:
978             maildir = scp.get(section, "maildir")
979         except:
980             maildir = section
981
982         maildir = urllib.urlencode(((section, maildir),)).split("=")[1]
983         maildir = os.path.join(maildir_root, maildir)
984
985         try:
986             exists = os.stat(maildir)
987             if stat.S_ISDIR(exists[stat.ST_MODE]):
988                 # check if there's a new, cur and tmp directory
989                 try:
990                     mode = os.stat(os.path.join(maildir, "cur"))[stat.ST_MODE]
991                 except:
992                     os.mkdir(os.path.join(maildir, "cur"))
993                     if not stat.S_ISDIR(mode):
994                         sys.stderr.write("Broken maildir: %s\n" %(maildir))
995                 try:
996                     mode = os.stat(os.path.join(maildir, "tmp"))[stat.ST_MODE]
997                 except:
998                     os.mkdir(os.path.join(maildir, "tmp"))
999                     if not stat.S_ISDIR(mode):
1000                         sys.stderr.write("Broken maildir: %s\n" %(maildir))
1001                 try:
1002                     mode = os.stat(os.path.join(maildir, "new"))[stat.ST_MODE]
1003                     if not stat.S_ISDIR(mode):
1004                         sys.stderr.write("Broken maildir: %s\n" %(maildir))
1005                 except:
1006                     os.mkdir(os.path.join(maildir, "new"))
1007             else:
1008                 sys.stderr.write("Broken maildir: %s\n" %(maildir))
1009         except:
1010             try:
1011                 os.mkdir(maildir)
1012             except:
1013                 sys.stderr.write("Couldn't create root maildir %s\n" \
1014                     %(maildir))
1015                 sys.exit(1)
1016             try:
1017                 os.mkdir(os.path.join(maildir, "new"))
1018                 os.mkdir(os.path.join(maildir, "cur"))
1019                 os.mkdir(os.path.join(maildir, "tmp"))
1020             except:
1021                 sys.stderr.write( \
1022                     "Couldn't create required maildir directories for %s\n" \
1023                     %(section,))
1024                 sys.exit(1)
1025
1026         # right - we've got the directories, we've got the section, we know the
1027         # url... lets play!
1028
1029         parse_and_deliver(maildir, section, state_dir)