Add (lots) more basic HTML entities.
[rss2maildir.git] / rss2maildir.py
1 #!/usr/bin/python
2 # coding=utf-8
3
4 # rss2maildir.py - RSS feeds to Maildir 1 email per item
5 # Copyright (C) 2007  Brett Parker <iDunno@sommitrealweird.co.uk>
6
7 # This program is free software: you can redistribute it and/or modify
8 # it under the terms of the GNU General Public License as published by
9 # the Free Software Foundation, either version 3 of the License, or
10 # (at your option) any later version.
11
12 # This program is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 # GNU General Public License for more details.
16
17 # You should have received a copy of the GNU General Public License
18 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
19
20 import sys
21 import os
22 import stat
23 import httplib
24 import urllib
25
26 import feedparser
27
28 from email.MIMEMultipart import MIMEMultipart
29 from email.MIMEText import MIMEText
30
31 import datetime
32 import random
33 import string
34 import textwrap
35
36 import socket
37
38 from optparse import OptionParser
39 from ConfigParser import SafeConfigParser
40
41 from base64 import b64encode
42 import md5
43
44 import cgi
45 import dbm
46
47 from HTMLParser import HTMLParser
48
49 class HTML2Text(HTMLParser):
50     entities = {
51         u'amp': u'&',
52         u'lt': u'<',
53         u'gt': u'>',
54         u'pound': u'£',
55         u'copy': u'©',
56         u'apos': u'\'',
57         u'quot': u'"',
58         u'nbsp': u' ',
59         u'ldquo': u'“',
60         u'rdquo': u'”',
61         u'lsquo': u'‘',
62         u'rsquo': u'’',
63         u'laquo': u'«',
64         u'raquo': u'»',
65         u'lsaquo': u'‹',
66         u'rsaquo': u'›',
67         u'bull': u'•',
68         u'middot': u'·',
69         u'deg': u'°',
70         u'helip': u'…',
71         u'trade': u'™',
72         u'reg': u'®',
73         u'agrave': u'à',
74         u'Agrave': u'À',
75         u'egrave': u'è',
76         u'Egrave': u'È',
77         u'igrave': u'ì',
78         u'Igrave': u'Ì',
79         u'ograve': u'ò',
80         u'Ograve': u'Ò',
81         u'ugrave': u'ù',
82         u'Ugrave': u'Ù',
83         u'aacute': u'á',
84         u'Aacute': u'Á',
85         u'eacute': u'é',
86         u'Eacute': u'É',
87         u'iacute': u'í',
88         u'Iacute': u'Í',
89         u'oacute': u'ó',
90         u'Oacute': u'Ó',
91         u'uacute': u'ú',
92         u'Uacute': u'Ú',
93         u'yactue': u'ý',
94         u'Yacute': u'Ý',
95         u'acirc': u'â',
96         u'Acirc': u'Â',
97         u'ecirc': u'ê',
98         u'Ecirc': u'Ê',
99         u'icirc': u'î',
100         u'Icirc': u'Î',
101         u'ocirc': u'ô',
102         u'Ocirc': u'Ô',
103         u'ucirc': u'û',
104         u'Ucirc': u'Û',
105         u'atilde': u'ã',
106         u'Atilde': u'Ã',
107         u'ntilde': u'ñ',
108         u'Ntilde': u'Ñ',
109         u'otilde': u'õ',
110         u'Otilde': u'Õ',
111         u'auml': u'ä',
112         u'Auml': u'Ä',
113         u'euml': u'ë',
114         u'Euml': u'Ë',
115         u'iuml': u'ï',
116         u'Iuml': u'Ï',
117         u'ouml': u'ö',
118         u'Ouml': u'Ö',
119         u'uuml': u'ü',
120         u'Uuml': u'Ü',
121         u'yuml': u'ÿ',
122         u'Yuml': u'Ÿ',
123         u'iexcl': u'¡',
124         u'iquest': u'¿',
125         u'ccedil': u'ç',
126         u'Ccedil': u'Ç',
127         u'oelig': u'œ',
128         u'OElig': u'Œ',
129         u'szlig': u'ß',
130         u'oslash': u'ø',
131         u'Oslash': u'Ø',
132         u'aring': u'å',
133         u'Aring': u'Å',
134         u'aelig': u'æ',
135         u'AElig': u'Æ',
136         u'thorn': u'þ',
137         u'THORN': u'Þ',
138         u'eth': u'ð',
139         u'ETH': u'Ð',
140         }
141
142     blockleveltags = [
143         u'h1',
144         u'h2',
145         u'h3',
146         u'h4',
147         u'h5',
148         u'h6',
149         u'pre',
150         u'p',
151         u'ul',
152         u'ol',
153         u'dl',
154         u'li',
155         u'dt',
156         u'dd',
157         u'div',
158         #u'blockquote',
159         ]
160
161     liststarttags = [
162         u'ul',
163         u'ol',
164         u'dl',
165         ]
166
167     cancontainflow = [
168         u'div',
169         u'li',
170         u'dd',
171         u'blockquote',
172     ]
173
174     def __init__(self,textwidth=70):
175         self.text = u''
176         self.curdata = u''
177         self.textwidth = textwidth
178         self.opentags = []
179         self.indentlevel = 0
180         self.ignorenodata = False
181         self.listcount = []
182         self.urls = []
183         self.images = {}
184         HTMLParser.__init__(self)
185
186     def handle_starttag(self, tag, attrs):
187         tag_name = tag.lower()
188         if tag_name in self.blockleveltags:
189             # handle starting a new block - unless we're in a block element
190             # that can contain other blocks, we'll assume that we want to close
191             # the container
192             if len(self.opentags) > 1 and self.opentags[-1] == u'li':
193                 self.handle_curdata()
194
195             if tag_name == u'ol':
196                 self.handle_curdata()
197                 self.listcount.append(1)
198                 self.listlevel = len(self.listcount) - 1
199
200             if tag_name in self.liststarttags:
201                 smallist = self.opentags[-3:-1]
202                 smallist.reverse()
203                 for prev_listtag in smallist:
204                     if prev_listtag in [u'dl', u'ol']:
205                         self.indentlevel = self.indentlevel + 4
206                         break
207                     elif prev_listtag == u'ul':
208                         self.indentlevel = self.indentlevel + 3
209                         break
210
211             if len(self.opentags) > 0:
212                 self.handle_curdata()
213                 if tag_name not in self.cancontainflow:
214                     self.opentags.pop()
215             self.opentags.append(tag_name)
216         else:
217             if tag_name == "span":
218                 return
219             listcount = 0
220             try:
221                 listcount = self.listcount[-1]
222             except:
223                 pass
224
225             if tag_name == u'dd' and len(self.opentags) > 1 \
226                 and self.opentags[-1] == u'dt':
227                 self.handle_curdata()
228                 self.opentags.pop()
229             elif tag_name == u'dt' and len(self.opentags) > 1 \
230                 and self.opentags[-1] == u'dd':
231                 self.handle_curdata()
232                 self.opentags.pop()
233             elif tag_name == u'a':
234                 for attr in attrs:
235                     if attr[0].lower() == u'href':
236                         self.urls.append(attr[1].decode('utf-8'))
237                 self.curdata = self.curdata + u'`'
238                 self.opentags.append(tag_name)
239                 return
240             elif tag_name == u'img':
241                 self.handle_image(attrs)
242                 return
243             elif tag_name == u'br':
244                 self.handle_br()
245                 return
246             else:
247                 # we don't know the tag, so lets avoid handling it!
248                 return 
249
250     def handle_startendtag(self, tag, attrs):
251         if tag.lower() == u'br':
252             self.handle_br()
253         elif tag.lower() == u'img':
254             self.handle_image(attrs)
255             return
256
257     def handle_br(self):
258             self.handle_curdata()
259             self.opentags.append(u'br')
260             self.handle_curdata()
261             self.opentags.pop()
262
263     def handle_image(self, attrs):
264         alt = u''
265         url = u''
266         for attr in attrs:
267             if attr[0] == 'alt':
268                 alt = attr[1].decode('utf-8')
269             elif attr[0] == 'src':
270                 url = attr[1].decode('utf-8')
271         if url:
272             if alt:
273                 if self.images.has_key(alt):
274                     if self.images[alt]["url"] == url:
275                         self.curdata = self.curdata \
276                             + u'|%s|' %(alt,)
277                     else:
278                         while self.images.has_key(alt):
279                             alt = alt + "_"
280                         self.images[alt]["url"] = url
281                         self.curdata = self.curdata \
282                             + u'|%s|' %(alt,)
283                 else:
284                     self.images[alt] = {}
285                     self.images[alt]["url"] = url
286                     self.curdata = self.curdata \
287                         + u'|%s|' %(alt,)
288             else:
289                 if self.images.has_key(url):
290                     self.curdata = self.curdata \
291                         + u'|%s|' %(url,)
292                 else:
293                     self.images[url] = {}
294                     self.images[url]["url"] =url
295                     self.curdata = self.curdata \
296                         + u'|%s|' %(url,)
297
    def handle_curdata(self):
        """Flush the buffered character data (self.curdata) into self.text.

        The data is formatted according to the innermost open tag
        (self.opentags[-1]): headings are underlined, paragraphs and list
        items are whitespace-collapsed and wrapped to self.textwidth, <pre>
        is copied verbatim, and so on.  Nothing happens when no tag is open
        or when there is no (non-whitespace) data to flush.
        """

        if len(self.opentags) == 0:
            return

        tag_thats_done = self.opentags[-1]

        if len(self.curdata) == 0:
            return

        # A line break: make sure the output ends in a newline.  curdata is
        # left untouched here; it stays buffered for the enclosing tag.
        if tag_thats_done == u'br':
            if len(self.text) == 0 or self.text[-1] != '\n':
                self.text = self.text + '\n'
                self.ignorenodata = True
            return

        # whitespace-only data is never emitted
        if len(self.curdata.strip()) == 0:
            return

        # Separate a new block from the text before it: a single newline for
        # list-like items, a blank line for everything else.  No separator is
        # added at the very start of the output or right after a <br>
        # (ignorenodata).
        if tag_thats_done in self.blockleveltags:
            newlinerequired = self.text != u''
            if self.ignorenodata:
                newlinerequired = False
            self.ignorenodata = False
            if newlinerequired:
                if tag_thats_done in [u'dt', u'dd', u'li'] \
                    and len(self.text) > 1 \
                    and self.text[-1] != u'\n':
                        self.text = self.text + u'\n'
                elif len(self.text) > 2 \
                    and self.text[-1] != u'\n' \
                    and self.text[-2] != u'\n':
                    self.text = self.text + u'\n\n'

        if tag_thats_done in ["h1", "h2", "h3", "h4", "h5", "h6"]:
            # Headings: collapse whitespace, wrap, then underline with
            # '=' (h1), '-' (h2) or '~' (h3-h6).
            underline = u''
            underlinechar = u'='
            headingtext = " ".join(self.curdata.split())
            seperator = u'\n' + u' '*self.indentlevel
            headingtext = seperator.join( \
                textwrap.wrap( \
                    headingtext, \
                    self.textwidth - self.indentlevel \
                    ) \
                )

            if tag_thats_done == u'h2':
                underlinechar = u'-'
            elif tag_thats_done != u'h1':
                underlinechar = u'~'

            # a heading that wrapped onto several lines gets a full-width
            # underline, a single-line heading one of its own length
            if u'\n' in headingtext:
                underline = u' ' * self.indentlevel \
                    + underlinechar * (self.textwidth - self.indentlevel)
            else:
                underline = u' ' * self.indentlevel \
                    + underlinechar * len(headingtext)
            self.text = self.text \
                + headingtext + u'\n' \
                + underline
        elif tag_thats_done in [u'p', u'div']:
            # Paragraphs: collapse whitespace and wrap at the current indent.
            paragraph = unicode( \
                " ".join(self.curdata.strip().encode("utf-8").split()), \
                "utf-8")
            seperator = u'\n' + u' ' * self.indentlevel
            self.text = self.text \
                + u' ' * self.indentlevel \
                + seperator.join( \
                    textwrap.wrap( \
                        paragraph, self.textwidth - self.indentlevel))
        elif tag_thats_done == "pre":
            # Preformatted text is appended unchanged.
            self.text = self.text + unicode( \
                self.curdata.encode("utf-8"), "utf-8")
        elif tag_thats_done == u'blockquote':
            # Quotes: every wrapped line is prefixed with '> '.
            # NOTE(review): handle_starttag only pushes tags listed in
            # blockleveltags (plus u'a'), and u'blockquote' is commented out
            # there, so this branch looks unreachable at present - confirm.
            quote = unicode( \
                " ".join(self.curdata.encode("utf-8").strip().split()), \
                "utf-8")
            seperator = u'\n' + u' ' * self.indentlevel + u'> '
            if len(self.text) > 0 and self.text[-1] != u'\n':
                self.text = self.text + u'\n'
            self.text = self.text \
                + u'> ' \
                + seperator.join( \
                    textwrap.wrap( \
                        quote, \
                        self.textwidth - self.indentlevel - 2 \
                    )
                )
            self.curdata = u''
        elif tag_thats_done == "li":
            item = unicode(self.curdata.encode("utf-8").strip(), "utf-8")
            if len(self.text) > 0 and self.text[-1] != u'\n':
                self.text = self.text + u'\n'
            # work out if we're in an ol rather than a ul
            # (only the four innermost open tags are examined; isul stays
            # None when neither list type is found among them)
            latesttags = self.opentags[-4:]
            latesttags.reverse()
            isul = None
            for thing in latesttags:
                if thing == 'ul':
                    isul = True
                    break
                elif thing == 'ol':
                    isul = False
                    break

            # continuation lines are indented by the width of the marker:
            # 3 columns for ' * ', 4 otherwise (including the "no list
            # found" case, which still uses the ' * ' marker)
            listindent = 3
            if not isul:
                listindent = 4

            listmarker = u' * '
            if isul == False:
                # numbered item: use and then advance the innermost counter
                listmarker = u' %2d. ' %(self.listcount[-1])
                self.listcount[-1] = self.listcount[-1] + 1

            seperator = u'\n' \
                + u' ' * self.indentlevel \
                + u' ' * listindent
            self.text = self.text \
                + u' ' * self.indentlevel \
                + listmarker \
                + seperator.join( \
                    textwrap.wrap( \
                        item, \
                        self.textwidth - self.indentlevel - listindent \
                    ) \
                )
            self.curdata = u''
        elif tag_thats_done == u'dt':
            # Definition term: preceded by a blank line, suffixed with '::'.
            definition = unicode(" ".join( \
                    self.curdata.encode("utf-8").strip().split()), \
                "utf-8")
            if len(self.text) > 0 and self.text[-1] != u'\n':
                self.text = self.text + u'\n\n'
            elif len(self.text) > 1 and self.text[-2] != u'\n':
                self.text = self.text + u'\n'
            definition = u' ' * self.indentlevel + definition + "::"
            indentstring = u'\n' + u' ' * (self.indentlevel + 1)
            self.text = self.text \
                + indentstring.join(
                    textwrap.wrap(definition, \
                        self.textwidth - self.indentlevel - 1))
            self.curdata = u''
        elif tag_thats_done == u'dd':
            # Definition body: wrapped and indented four columns further
            # than the current indent level.
            definition = unicode(" ".join( \
                    self.curdata.encode("utf-8").strip().split()),
                "utf-8")
            if len(definition) > 0:
                if len(self.text) > 0 and self.text[-1] != u'\n':
                    self.text = self.text + u'\n'
                indentstring = u'\n' + u' ' * (self.indentlevel + 4)
                self.text = self.text \
                    + u' ' * (self.indentlevel + 4) \
                    + indentstring.join( \
                        textwrap.wrap( \
                            definition, \
                            self.textwidth - self.indentlevel - 4 \
                            ) \
                        )
                self.curdata = u''
        elif tag_thats_done == u'a':
            # Close the link markup opened by handle_starttag (u'`').  The
            # link text stays in curdata so it is flushed with the enclosing
            # block; the target URL is listed by gettext().
            self.curdata = self.curdata + u'`__'
            pass
        elif tag_thats_done in self.liststarttags:
            # list containers produce no output themselves
            pass

        # data belonging to a block-level tag has now been consumed
        if tag_thats_done in self.blockleveltags:
            self.curdata = u''

        self.ignorenodata = False
465
466         self.ignorenodata = False
467
    def handle_endtag(self, tag):
        """Handle a closing tag: flush pending data and unwind the tag stack.

        Closing tags for <span>, <br>, <img> and for tags that are not
        currently on the opentags stack are ignored.  Closing a list undoes
        the indentation added when it was opened, and closing an <ol> also
        drops its item counter.
        """
        self.ignorenodata = False
        if tag == "span":
            return

        # NOTE(review): the u'span' test above and this stack lookup use the
        # tag as passed in, before it is lower-cased below; this relies on
        # the caller already supplying lower-case names - confirm.
        try:
            tagindex = self.opentags.index(tag)
        except:
            # tag was never opened (or already closed): nothing to do
            return
        tag = tag.lower()

        if tag in [u'br', u'img']:
            return

        if tag in self.liststarttags:
            if tag in [u'ol', u'dl', u'ul']:
                self.handle_curdata()
                # find if there was a previous list level
                # (searching outwards from the tag below the innermost one;
                # the amounts mirror those added in handle_starttag)
                smalllist = self.opentags[:-1]
                smalllist.reverse()
                for prev_listtag in smalllist:
                    if prev_listtag in [u'ol', u'dl']:
                        self.indentlevel = self.indentlevel - 4
                        break
                    elif prev_listtag == u'ul':
                        self.indentlevel = self.indentlevel - 3
                        break

        if tag == u'ol':
            # forget the item counter of the list being closed
            self.listcount = self.listcount[:-1]

        # advance tagindex to the last (innermost) occurrence of the tag
        while tagindex < len(self.opentags) \
            and tag in self.opentags[tagindex+1:]:
            try:
                tagindex = self.opentags.index(tag, tagindex+1)
            except:
                # well, we don't want to do that then
                pass
        if tagindex != len(self.opentags) - 1:
            # Assuming the data was for the last opened tag first
            self.handle_curdata()
            # Now kill the list to be a slice before this tag was opened
            # NOTE(review): the slice ends at tagindex + 1, so the tag being
            # closed itself stays on the stack; only the tags opened after
            # it are dropped - confirm this is intended.
            self.opentags = self.opentags[:tagindex + 1]
        else:
            # the tag being closed is the innermost one: flush and pop it
            self.handle_curdata()
            if self.opentags[-1] == tag:
                self.opentags.pop()
515
516     def handle_data(self, data):
517         if len(self.opentags) == 0:
518             self.opentags.append(u'p')
519         self.curdata = self.curdata + data.decode("utf-8")
520
521     def handle_entityref(self, name):
522         entity = name
523         if HTML2Text.entities.has_key(name.lower()):
524             entity = HTML2Text.entities[name.lower()]
525         elif name[0] == "#":
526             entity = unichr(int(name[1:]))
527         else:
528             entity = "&" + name + ";"
529
530         self.curdata = self.curdata + unicode(entity, "utf-8")
531
532     def gettext(self):
533         self.handle_curdata()
534         if len(self.text) == 0 or self.text[-1] != u'\n':
535             self.text = self.text + u'\n'
536         self.opentags = []
537         if len(self.text) > 0:
538             while len(self.text) > 1 and self.text[-1] == u'\n':
539                 self.text = self.text[:-1]
540             self.text = self.text + u'\n'
541         if len(self.urls) > 0:
542             self.text = self.text + u'\n__ ' + u'\n__ '.join(self.urls) + u'\n'
543             self.urls = []
544         if len(self.images.keys()) > 0:
545             self.text = self.text + u'\n.. ' \
546                 + u'.. '.join( \
547                     ["|%s| image:: %s" %(a, self.images[a]["url"]) \
548                 for a in self.images.keys()]) + u'\n'
549             self.images = {}
550         return self.text
551
def open_url(method, url):
    """Issue an HTTP(S) request and return the response on status 200.

    method is the HTTP verb ("GET", "HEAD", ...) and url an absolute
    http:// or https:// URL.  Redirects (301, 302, 303, 307) are followed;
    at most three requests are made in total.  Connection and protocol
    errors count as a failed attempt.

    Returns the httplib response object, or None when no attempt produced a
    200 response.
    """
    # urlparse is only needed here, for resolving relative redirects
    import urlparse

    attempts = 0
    while attempts < 3:
        attempts = attempts + 1

        (scheme, rest) = urllib.splittype(url)
        (host, path) = urllib.splithost(rest)
        (host, port) = urllib.splitport(host)

        # honour the scheme instead of always speaking plain HTTP on port 80
        if scheme == "https":
            connection_class = httplib.HTTPSConnection
            default_port = 443
        else:
            connection_class = httplib.HTTPConnection
            default_port = 80
        if port is None:
            port = default_port
        if not path:
            # "http://host" has an empty path; a request needs at least "/"
            path = "/"

        conn = None
        try:
            conn = connection_class("%s:%s" %(host, port))
            conn.request(method, path)
            response = conn.getresponse()
            if response.status == 200:
                # the connection must stay open for the caller to read from
                return response
            if response.status in [301, 302, 303, 307]:
                location = response.getheader("location")
                if location:
                    # Location may be relative to the current URL
                    url = urlparse.urljoin(url, location)
            conn.close()
        except (httplib.HTTPException, socket.error):
            # treat as a failed attempt and try again
            if conn is not None:
                conn.close()
    return None
575
def _feed_has_changed(url, cached):
    """Tell whether the feed at url differs from the cached HTTP validators.

    cached is the parse_qs() form of the header values stored for the feed.
    A HEAD request is made and its content-length, etag, last-modified and
    content-md5 headers are compared with the cached values.  A failed
    request, or a header we have no cached value for, counts as "changed".
    """
    response = open_url("HEAD", url)
    if not response:
        return True
    try:
        for (name, value) in response.getheaders():
            if name in \
                ["content-length", "etag", "last-modified", "content-md5"]:
                if value != cached[name][0]:
                    return True
    except KeyError:
        return True
    return False


def _random_token(length):
    """Return a string of length random ASCII letters and digits."""
    alphabet = string.ascii_letters + string.digits
    return "".join([random.choice(alphabet) for _ in range(0, length)])


def _deliver_item(maildir, url, db, item):
    """Write one feed item to maildir unless it was already delivered.

    db is the open "seen" database.  It maps "<feed url>|<guid or link>" to
    a urlencoded record holding the message-id(s), the creation date and the
    md5 of the item content.  An item whose content md5 matches the stored
    one is skipped; a changed item is delivered again with a References
    header pointing at the earlier message(s).
    """
    # need to work out what the content is first...
    if item.has_key("content"):
        content = item["content"][0]["value"]
    else:
        content = item["summary"]

    md5sum = md5.md5(content.encode("utf-8")).hexdigest()

    prevmessageid = None

    # check if there's a guid too - if that exists and we match the md5,
    # there is nothing to do
    if item.has_key("guid"):
        guidkey = url + "|" + item["guid"]
        if db.has_key(guidkey):
            data = cgi.parse_qs(db[guidkey])
            if data["contentmd5"][0] == md5sum:
                return

    linkkey = url + "|" + item["link"]
    if db.has_key(linkkey):
        data = cgi.parse_qs(db[linkkey])
        if data.has_key("message-id"):
            prevmessageid = data["message-id"][0]
        if data["contentmd5"][0] == md5sum:
            return

    try:
        author = item["author"]
    except KeyError:
        author = url

    # create a basic email message
    msg = MIMEMultipart("alternative")
    messageid = "<" \
        + datetime.datetime.now().strftime("%Y%m%d%H%M") \
        + "." + _random_token(6) \
        + "@" + socket.gethostname() + ">"
    msg.add_header("Message-ID", messageid)
    msg.set_unixfrom("\"%s\" <rss2maildir@localhost>" %(url))
    msg.add_header("From", "\"%s\" <rss2maildir@localhost>" %(author))
    msg.add_header("To", "\"%s\" <rss2maildir@localhost>" %(url))
    if prevmessageid:
        msg.add_header("References", prevmessageid)

    # use the item's own timestamp when the feed provides a usable one,
    # otherwise fall back to the current time
    createddate = datetime.datetime.now() \
        .strftime("%a, %e %b %Y %T -0000")
    try:
        createddate = datetime.datetime(*item["updated_parsed"][0:6]) \
            .strftime("%a, %e %b %Y %T -0000")
    except (KeyError, TypeError, ValueError):
        pass
    msg.add_header("Date", createddate)
    msg.add_header("Subject", item["title"])
    msg.set_default_type("text/plain")

    htmlcontent = "%s\n\n<p>Item URL: <a href='%s'>%s</a></p>" %( \
        content, \
        item["link"], \
        item["link"] )
    htmlpart = MIMEText(htmlcontent.encode("utf-8"), "html", "utf-8")
    textparser = HTML2Text()
    textparser.feed(content.encode("utf-8"))
    textcontent = textparser.gettext()
    textcontent = "%s\n\nItem URL: %s" %( \
        textcontent, \
        item["link"] )
    textpart = MIMEText(textcontent.encode("utf-8"), "plain", "utf-8")
    msg.attach(textpart)
    msg.attach(htmlpart)

    # start by working out the filename we should be writing to, we do
    # this following the normal maildir style rules
    fname = str(os.getpid()) \
        + "." + socket.gethostname() \
        + "." + _random_token(10) \
        + "." + datetime.datetime.now().strftime('%s')
    fn = os.path.join(maildir, "tmp", fname)
    fh = open(fn, "w")
    try:
        fh.write(msg.as_string())
    finally:
        fh.close()
    # now move it in to the new directory
    newfn = os.path.join(maildir, "new", fname)
    os.link(fn, newfn)
    os.unlink(fn)

    # now add to the database about the item
    if prevmessageid:
        messageid = prevmessageid + " " + messageid
    record = urllib.urlencode(( \
        ("message-id", messageid), \
        ("created", createddate), \
        ("contentmd5", md5sum) \
        ))
    if item.has_key("guid") and item["guid"] != item["link"]:
        db[guidkey] = record
        # the record stored under the link keeps its own creation date and
        # md5 and only gets the new message-id; if there is no usable
        # record yet, the new one is stored under the link as well
        try:
            previous = cgi.parse_qs(db[linkkey])
            linkrecord = urllib.urlencode(( \
                ("message-id", messageid), \
                ("created", previous["created"][0]), \
                ("contentmd5", previous["contentmd5"][0]) \
                ))
        except KeyError:
            linkrecord = record
        db[linkkey] = linkrecord
    else:
        db[linkkey] = record


def parse_and_deliver(maildir, url, statedir):
    """Fetch the feed at url and deliver its new items into maildir.

    maildir is the path of an existing Maildir (with tmp/ and new/), url
    the feed URL and statedir the directory holding the "feeds" and "seen"
    dbm databases.  If the feed was fetched before and a HEAD request shows
    the same validators as last time, nothing is downloaded.  Otherwise
    every item not yet seen (or whose content changed) is written as one
    message, and the response's validators are remembered for next time.

    A feed that cannot be fetched is reported on stderr and skipped.
    """
    feeddb = dbm.open(os.path.join(statedir, "feeds"), "c")
    try:
        # first check if we know about this feed already
        if feeddb.has_key(url):
            cached = cgi.parse_qs(feeddb[url])
            if not _feed_has_changed(url, cached):
                return # don't need to do anything, nothing has changed.

        response = open_url("GET", url)
        if response is None:
            sys.stderr.write("Failed to fetch feed: %s\n" %(url))
            return
        headers = response.getheaders()

        fp = feedparser.parse(response)
        db = dbm.open(os.path.join(statedir, "seen"), "c")
        try:
            for item in fp["items"]:
                _deliver_item(maildir, url, db, item)
        finally:
            db.close()

        # remember the validators of this response for the next run
        if headers:
            data = []
            for header in headers:
                if header[0] in \
                    ["content-md5", "etag", "last-modified", "content-length"]:
                    data.append((header[0], header[1]))
            if len(data) > 0:
                feeddb[url] = urllib.urlencode(data)
    finally:
        # close the state database on every exit path
        feeddb.close()
764
def _require_directory(path, notdir_message, mkdir_message):
    """Make sure path is a directory, creating it when it does not exist.

    On failure the matching message (a format string taking the path) is
    written to stderr and the program exits with status 1.
    """
    if os.path.isdir(path):
        return
    if os.path.exists(path):
        sys.stderr.write(notdir_message %(path,))
        sys.exit(1)
    try:
        os.mkdir(path)
    except OSError:
        sys.stderr.write(mkdir_message %(path,))
        sys.exit(1)


def _prepare_maildir(maildir, section):
    """Create the Maildir at maildir (with new/, cur/, tmp/) if needed.

    A missing Maildir, or a missing subdirectory of an existing one, is
    created; failure to do so is fatal (exit status 1).  Something that
    exists but is not a directory is reported as a broken maildir on stderr
    without exiting.
    """
    if not os.path.exists(maildir):
        try:
            os.mkdir(maildir)
        except OSError:
            sys.stderr.write("Couldn't create root maildir %s\n" \
                %(maildir))
            sys.exit(1)
    elif not os.path.isdir(maildir):
        sys.stderr.write("Broken maildir: %s\n" %(maildir))
        return

    # check if there's a new, cur and tmp directory
    for subdir in ("new", "cur", "tmp"):
        path = os.path.join(maildir, subdir)
        if os.path.isdir(path):
            continue
        if os.path.exists(path):
            sys.stderr.write("Broken maildir: %s\n" %(maildir))
            continue
        try:
            os.mkdir(path)
        except OSError:
            sys.stderr.write( \
                "Couldn't create required maildir directories for %s\n" \
                %(section,))
            sys.exit(1)


if __name__ == "__main__":
    # This only gets executed if we really called the program
    # first off, parse the command line arguments

    oparser = OptionParser()
    oparser.add_option(
        "-c", "--conf", dest="conf",
        help="location of config file"
        )
    oparser.add_option(
        "-s", "--statedir", dest="statedir",
        help="location of directory to store state in"
        )

    (options, args) = oparser.parse_args()

    # check for the configfile

    configfile = None

    if options.conf is not None:
        # does the file exist?
        if not os.path.exists(options.conf):
            # exit here as the specified file doesn't exist
            sys.stderr.write( \
                "Config file %s does not exist. Exiting.\n" %(options.conf,))
            sys.exit(2)
        configfile = options.conf
    else:
        # check through the default locations
        candidates = []
        if "HOME" in os.environ:
            candidates.append( \
                "%s/.rss2maildir.conf" %(os.environ["HOME"],))
        candidates.append("/etc/rss2maildir.conf")
        for candidate in candidates:
            if os.path.exists(candidate):
                configfile = candidate
                break
        if configfile is None:
            sys.stderr.write("No config file found. Exiting.\n")
            sys.exit(2)

    # Right - if we've got this far, we've got a config file, now for the hard
    # bits...

    scp = SafeConfigParser()
    scp.read(configfile)

    maildir_root = "RSSMaildir"
    state_dir = "state"

    # the state directory comes from the command line, then the config
    # file, then the default
    if options.statedir is not None:
        state_dir = options.statedir
        _require_directory(state_dir, \
            "State directory (%s) is not a directory\n", \
            "Couldn't create statedir %s\n")
    elif scp.has_option("general", "state_dir"):
        state_dir = scp.get("general", "state_dir")
        _require_directory(state_dir, \
            "State directory (%s) is not a directory\n", \
            "Couldn't create state directory %s\n")
    else:
        _require_directory(state_dir, \
            "State directory %s is not a directory\n", \
            "State directory %s could not be created\n")

    if scp.has_option("general", "maildir_root"):
        maildir_root = scp.get("general", "maildir_root")

    _require_directory(maildir_root, \
        "Maildir Root %s is not a directory\n", \
        "Couldn't create Maildir Root %s\n")

    # every section other than [general] describes one feed; the section
    # name is the feed URL
    feeds = scp.sections()
    if "general" in feeds:
        feeds.remove("general")

    for section in feeds:
        # the maildir name defaults to the section name
        try:
            maildir = scp.get(section, "maildir")
        except Exception:
            maildir = section

        # URL-quote the name so it is safe to use as a directory name
        maildir = urllib.quote_plus(str(maildir))
        maildir = os.path.join(maildir_root, maildir)

        _prepare_maildir(maildir, section)

        # right - we've got the directories, we've got the section, we know the
        # url... lets play!

        parse_and_deliver(maildir, section, state_dir)
944
945         parse_and_deliver(maildir, section, state_dir)