#!/usr/bin/python
# coding=utf-8

# rss2maildir.py - RSS feeds to Maildir 1 email per item
# Copyright (C) 2007  Brett Parker <iDunno@sommitrealweird.co.uk>

# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

import sys
import os
import stat
import httplib
import urllib

import feedparser

from email.MIMEMultipart import MIMEMultipart
from email.MIMEText import MIMEText

import datetime
import random
import string
import textwrap

import socket

from optparse import OptionParser
from ConfigParser import SafeConfigParser

from base64 import b64encode
import md5

import cgi
import dbm

from HTMLParser import HTMLParser

class HTML2Text(HTMLParser):
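    """Convert HTML into plain text with reStructuredText-style markup.

    Feed HTML to this parser via feed() and collect the wrapped plain-text
    rendering with gettext(). Hyperlinks are emitted as anonymous reST
    links (`text`__ with trailing "__ url" targets) and images become
    substitution references (|alt|) with matching image directives
    appended after the body text.
    """
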
    entities = {
        u'amp': u'&',
        u'lt': u'<',
        u'gt': u'>',
        u'pound': u'£',
        u'copy': u'©',
        u'apos': u'\'',
        u'quot': u'"',
        u'nbsp': u' ',
        u'ldquo': u'“',
        u'rdquo': u'”',
        u'lsquo': u'‘',
        u'rsquo': u'’',
        u'laquo': u'«',
        u'raquo': u'»',
        u'lsaquo': u'‹',
        u'rsaquo': u'›',
        u'bull': u'•',
        u'middot': u'·',
        u'deg': u'°',
        u'hellip': u'…',
        u'trade': u'™',
        u'reg': u'®',
        u'agrave': u'à',
        u'Agrave': u'À',
        u'egrave': u'è',
        u'Egrave': u'È',
        u'igrave': u'ì',
        u'Igrave': u'Ì',
        u'ograve': u'ò',
        u'Ograve': u'Ò',
        u'ugrave': u'ù',
        u'Ugrave': u'Ù',
        u'aacute': u'á',
        u'Aacute': u'Á',
        u'eacute': u'é',
        u'Eacute': u'É',
        u'iacute': u'í',
        u'Iacute': u'Í',
        u'oacute': u'ó',
        u'Oacute': u'Ó',
        u'uacute': u'ú',
        u'Uacute': u'Ú',
        u'yacute': u'ý',
        u'Yacute': u'Ý',
        u'acirc': u'â',
        u'Acirc': u'Â',
        u'ecirc': u'ê',
        u'Ecirc': u'Ê',
        u'icirc': u'î',
        u'Icirc': u'Î',
        u'ocirc': u'ô',
        u'Ocirc': u'Ô',
        u'ucirc': u'û',
        u'Ucirc': u'Û',
        u'atilde': u'ã',
        u'Atilde': u'Ã',
        u'ntilde': u'ñ',
        u'Ntilde': u'Ñ',
        u'otilde': u'õ',
        u'Otilde': u'Õ',
        u'auml': u'ä',
        u'Auml': u'Ä',
        u'euml': u'ë',
        u'Euml': u'Ë',
        u'iuml': u'ï',
        u'Iuml': u'Ï',
        u'ouml': u'ö',
        u'Ouml': u'Ö',
        u'uuml': u'ü',
        u'Uuml': u'Ü',
        u'yuml': u'ÿ',
        u'Yuml': u'Ÿ',
        u'iexcl': u'¡',
        u'iquest': u'¿',
        u'ccedil': u'ç',
        u'Ccedil': u'Ç',
        u'oelig': u'œ',
        u'OElig': u'Œ',
        u'szlig': u'ß',
        u'oslash': u'ø',
        u'Oslash': u'Ø',
        u'aring': u'å',
        u'Aring': u'Å',
        u'aelig': u'æ',
        u'AElig': u'Æ',
        u'thorn': u'þ',
        u'THORN': u'Þ',
        u'eth': u'ð',
        u'ETH': u'Ð',
        u'mdash': u'—',
        u'ndash': u'–',
        u'sect': u'§',
        u'para': u'¶',
        u'uarr': u'↑',
        u'darr': u'↓',
        u'larr': u'←',
        u'rarr': u'→',
        u'dagger': u'†',
        u'Dagger': u'‡',
        u'permil': u'‰',
        u'prod': u'∏',
        u'infin': u'∞',
        u'radic': u'√',
        u'there4': u'∴',
        u'int': u'∫',
        u'asymp': u'≈',
        u'ne': u'≠',
        u'equiv': u'≡',
        u'le': u'≤',
        u'ge': u'≥',
        u'loz': u'⋄',
        u'sum': u'∑',
        u'part': u'∂',
        u'prime': u'′',
        u'Prime': u'″',
        u'harr': u'↔',
        u'micro': u'µ',
        u'not': u'¬',
        u'plusmn': u'±',
        u'divide': u'÷',
        u'cent': u'¢',
        u'euro': u'€',
        }

    blockleveltags = [
        u'h1',
        u'h2',
        u'h3',
        u'h4',
        u'h5',
        u'h6',
        u'pre',
        u'p',
        u'ul',
        u'ol',
        u'dl',
        u'li',
        u'dt',
        u'dd',
        u'div',
        #u'blockquote',
        ]

    liststarttags = [
        u'ul',
        u'ol',
        u'dl',
        ]

    cancontainflow = [
        u'div',
        u'li',
        u'dd',
        u'blockquote',
    ]

    def __init__(self, textwidth=70):
        self.text = u''
        self.curdata = u''
        self.textwidth = textwidth
        self.opentags = []
        self.indentlevel = 0
        self.ignorenodata = False
        self.listcount = []
        self.urls = []
        self.images = {}
        HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
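        """Open a tag: flush pending data and track block/list nesting.

        Block-level tags close the previous block (via handle_curdata)
        and are pushed onto self.opentags; list containers adjust the
        indentation level and numbering state. Inline <a>, <img> and
        <br> are handled specially; <span> and unknown tags are ignored.
        """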
        tag_name = tag.lower()
        if tag_name in self.blockleveltags:
            # handle starting a new block - unless we're in a block element
            # that can contain other blocks, we'll assume that we want to close
            # the container
            if len(self.opentags) > 1 and self.opentags[-1] == u'li':
                self.handle_curdata()

            if tag_name == u'ol':
                self.handle_curdata()
                self.listcount.append(1)
                self.listlevel = len(self.listcount) - 1

            if tag_name == u'dl':
                self.indentlevel = self.indentlevel + 4

            if tag_name in self.liststarttags:
                smallist = self.opentags[-3:-1]
                smallist.reverse()
                for prev_listtag in smallist:
                    if prev_listtag in [u'dl', u'ol']:
                        self.indentlevel = self.indentlevel + 4
                        break
                    elif prev_listtag == u'ul':
                        self.indentlevel = self.indentlevel + 3
                        break

            if len(self.opentags) > 0:
                self.handle_curdata()
                if tag_name not in self.cancontainflow:
                    self.opentags.pop()
            self.opentags.append(tag_name)
        else:
            if tag_name == "span":
                return
            listcount = 0
            try:
                listcount = self.listcount[-1]
            except:
                pass

            if tag_name == u'dd' and len(self.opentags) > 1 \
                and self.opentags[-1] == u'dt':
                self.handle_curdata()
                self.opentags.pop()
            elif tag_name == u'dt' and len(self.opentags) > 1 \
                and self.opentags[-1] == u'dd':
                self.handle_curdata()
                self.opentags.pop()
            elif tag_name == u'a':
                for attr in attrs:
                    if attr[0].lower() == u'href':
                        self.urls.append(attr[1].decode('utf-8'))
                self.curdata = self.curdata + u'`'
                self.opentags.append(tag_name)
                return
            elif tag_name == u'img':
                self.handle_image(attrs)
                return
            elif tag_name == u'br':
                self.handle_br()
                return
            else:
                # we don't know the tag, so let's avoid handling it!
                return

    def handle_startendtag(self, tag, attrs):
        if tag.lower() == u'br':
            self.handle_br()
        elif tag.lower() == u'img':
            self.handle_image(attrs)
            return

    def handle_br(self):
        self.handle_curdata()
        self.opentags.append(u'br')
        self.handle_curdata()
        self.opentags.pop()

    def handle_image(self, attrs):
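        """Record an <img> tag as a |substitution| reference.

        The alt text (or the URL when no alt is given) becomes the
        substitution name; the matching image directive is written out
        later by gettext().
        """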
        alt = u''
        url = u''
        for attr in attrs:
            if attr[0] == 'alt':
                alt = attr[1].decode('utf-8')
            elif attr[0] == 'src':
                url = attr[1].decode('utf-8')
        if url:
            if alt:
                if self.images.has_key(alt):
                    if self.images[alt]["url"] == url:
                        self.curdata = self.curdata \
                            + u'|%s|' %(alt,)
                    else:
                        while self.images.has_key(alt):
                            alt = alt + "_"
316                         self.images[alt]["url"] = url
                        self.curdata = self.curdata \
                            + u'|%s|' %(alt,)
                else:
                    self.images[alt] = {}
                    self.images[alt]["url"] = url
                    self.curdata = self.curdata \
                        + u'|%s|' %(alt,)
            else:
                if self.images.has_key(url):
                    self.curdata = self.curdata \
                        + u'|%s|' %(url,)
                else:
                    self.images[url] = {}
                    self.images[url]["url"] = url
                    self.curdata = self.curdata \
                        + u'|%s|' %(url,)

    def handle_curdata(self):
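        """Render the buffered character data for the innermost open tag.

        Depending on the tag, the text is wrapped and indented as a
        heading (with underline), paragraph, preformatted block,
        blockquote, list item or definition list entry, and appended to
        self.text.
        """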

        if len(self.opentags) == 0:
            return

        tag_thats_done = self.opentags[-1]

        if len(self.curdata) == 0:
            return

        if tag_thats_done == u'br':
            if len(self.text) == 0 or self.text[-1] != '\n':
                self.text = self.text + '\n'
                self.ignorenodata = True
            return

        if len(self.curdata.strip()) == 0:
            return

        if tag_thats_done in self.blockleveltags:
            newlinerequired = self.text != u''
            if self.ignorenodata:
                newlinerequired = False
            self.ignorenodata = False
            if newlinerequired:
                if tag_thats_done in [u'dt', u'dd', u'li'] \
                    and len(self.text) > 1 \
                    and self.text[-1] != u'\n':
                        self.text = self.text + u'\n'
                elif len(self.text) > 2 \
                    and self.text[-1] != u'\n' \
                    and self.text[-2] != u'\n':
                    self.text = self.text + u'\n\n'

        if tag_thats_done in ["h1", "h2", "h3", "h4", "h5", "h6"]:
            underline = u''
            underlinechar = u'='
            headingtext = " ".join(self.curdata.split())
            separator = u'\n' + u' '*self.indentlevel
            headingtext = separator.join( \
                textwrap.wrap( \
                    headingtext, \
                    self.textwidth - self.indentlevel \
                    ) \
                )

            if tag_thats_done == u'h2':
                underlinechar = u'-'
            elif tag_thats_done != u'h1':
                underlinechar = u'~'

            if u'\n' in headingtext:
                underline = u' ' * self.indentlevel \
                    + underlinechar * (self.textwidth - self.indentlevel)
            else:
                underline = u' ' * self.indentlevel \
                    + underlinechar * len(headingtext)
            self.text = self.text \
                + headingtext + u'\n' \
                + underline
        elif tag_thats_done in [u'p', u'div']:
            paragraph = unicode( \
                " ".join(self.curdata.strip().encode("utf-8").split()), \
                "utf-8")
            separator = u'\n' + u' ' * self.indentlevel
            self.text = self.text \
                + u' ' * self.indentlevel \
                + separator.join( \
                    textwrap.wrap( \
                        paragraph, self.textwidth - self.indentlevel))
        elif tag_thats_done == "pre":
            self.text = self.text + unicode( \
                self.curdata.encode("utf-8"), "utf-8")
        elif tag_thats_done == u'blockquote':
            quote = unicode( \
                " ".join(self.curdata.encode("utf-8").strip().split()), \
                "utf-8")
            separator = u'\n' + u' ' * self.indentlevel + u'> '
            if len(self.text) > 0 and self.text[-1] != u'\n':
                self.text = self.text + u'\n'
            self.text = self.text \
                + u'> ' \
                + separator.join( \
                    textwrap.wrap( \
                        quote, \
                        self.textwidth - self.indentlevel - 2 \
                    )
                )
            self.curdata = u''
423         elif tag_thats_done == "li":
424             item = unicode(self.curdata.encode("utf-8").strip(), "utf-8")
425             if len(self.text) > 0 and self.text[-1] != u'\n':
426                 self.text = self.text + u'\n'
427             # work out if we're in an ol rather than a ul
428             latesttags = self.opentags[-4:]
429             latesttags.reverse()
430             isul = None
431             for thing in latesttags:
432                 if thing == 'ul':
433                     isul = True
434                     break
435                 elif thing == 'ol':
436                     isul = False
437                     break
438
439             listindent = 3
440             if not isul:
441                 listindent = 4
442
443             listmarker = u' * '
444             if isul == False:
445                 listmarker = u' %2d. ' %(self.listcount[-1])
446                 self.listcount[-1] = self.listcount[-1] + 1
447
448             seperator = u'\n' \
449                 + u' ' * self.indentlevel \
450                 + u' ' * listindent
451             self.text = self.text \
452                 + u' ' * self.indentlevel \
453                 + listmarker \
454                 + seperator.join( \
455                     textwrap.wrap( \
456                         item, \
457                         self.textwidth - self.indentlevel - listindent \
458                     ) \
459                 )
460             self.curdata = u''
        elif tag_thats_done == u'dt':
            definition = unicode(" ".join( \
                    self.curdata.encode("utf-8").strip().split()), \
                "utf-8")
            if len(self.text) > 0 and self.text[-1] != u'\n':
                self.text = self.text + u'\n\n'
            elif len(self.text) > 1 and self.text[-2] != u'\n':
                self.text = self.text + u'\n'
            definition = u' ' * (self.indentlevel - 4) + definition + "::"
            indentstring = u'\n' + u' ' * (self.indentlevel - 3)
            self.text = self.text \
                + indentstring.join(
                    textwrap.wrap(definition, \
                        self.textwidth - self.indentlevel - 4))
            self.curdata = u''
        elif tag_thats_done == u'dd':
            definition = unicode(" ".join( \
                    self.curdata.encode("utf-8").strip().split()),
                "utf-8")
            if len(definition) > 0:
                if len(self.text) > 0 and self.text[-1] != u'\n':
                    self.text = self.text + u'\n'
                indentstring = u'\n' + u' ' * self.indentlevel
                self.text = self.text \
                    + indentstring \
                    + indentstring.join( \
                        textwrap.wrap( \
                            definition, \
                            self.textwidth - self.indentlevel \
                            ) \
                        )
                self.curdata = u''
        elif tag_thats_done == u'a':
            self.curdata = self.curdata + u'`__'
            pass
        elif tag_thats_done in self.liststarttags:
            pass

        if tag_thats_done in self.blockleveltags:
            self.curdata = u''

        self.ignorenodata = False

    def handle_endtag(self, tag):
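        """Close a tag: flush pending data and unwind self.opentags.

        Closing a list container also restores the indentation level
        and, for <ol>, drops the current item counter.
        """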
        self.ignorenodata = False
        tag = tag.lower()
        if tag == "span":
            return

        try:
            tagindex = self.opentags.index(tag)
        except:
            return

        if tag in [u'br', u'img']:
            return

        if tag == u'dl':
            self.indentlevel = self.indentlevel - 4

        if tag in self.liststarttags:
            if tag in [u'ol', u'dl', u'ul', u'dd']:
                self.handle_curdata()
                # find if there was a previous list level
                smalllist = self.opentags[:-1]
                smalllist.reverse()
                for prev_listtag in smalllist:
                    if prev_listtag in [u'ol', u'dl']:
                        self.indentlevel = self.indentlevel - 4
                        break
                    elif prev_listtag == u'ul':
                        self.indentlevel = self.indentlevel - 3
                        break

        if tag == u'ol':
            self.listcount = self.listcount[:-1]

        while tagindex < len(self.opentags) \
            and tag in self.opentags[tagindex+1:]:
            try:
                tagindex = self.opentags.index(tag, tagindex+1)
            except:
                # well, we don't want to do that then
                pass
        if tagindex != len(self.opentags) - 1:
            # assume the pending data belongs to the most recently opened tag
            self.handle_curdata()
            # now truncate the open-tag list to just before this tag
            self.opentags = self.opentags[:tagindex + 1]
        else:
            self.handle_curdata()
            if self.opentags[-1] == tag:
                self.opentags.pop()

    def handle_data(self, data):
        if len(self.opentags) == 0:
            self.opentags.append(u'p')
        self.curdata = self.curdata + data.decode("utf-8")

    def handle_entityref(self, name):
        entity = name
        if HTML2Text.entities.has_key(name):
            entity = HTML2Text.entities[name]
        elif name[0] == "#":
            entity = unichr(int(name[1:]))
        else:
            entity = "&" + name + ";"

        self.curdata = self.curdata + unicode(entity.encode('utf-8'), \
            "utf-8")

    def gettext(self):
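        """Flush any pending data and return the accumulated plain text.

        Collected link targets and image directives are appended after
        the body, and the link and image state is reset.
        """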
        self.handle_curdata()
        if len(self.text) == 0 or self.text[-1] != u'\n':
            self.text = self.text + u'\n'
        self.opentags = []
        if len(self.text) > 0:
            while len(self.text) > 1 and self.text[-1] == u'\n':
                self.text = self.text[:-1]
            self.text = self.text + u'\n'
        if len(self.urls) > 0:
            self.text = self.text + u'\n__ ' + u'\n__ '.join(self.urls) + u'\n'
            self.urls = []
        if len(self.images.keys()) > 0:
            self.text = self.text + u'\n.. ' \
                + u'\n.. '.join( \
                    ["|%s| image:: %s" %(a, self.images[a]["url"]) \
                for a in self.images.keys()]) + u'\n'
            self.images = {}
        return self.text

def open_url(method, url):
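    """Request a URL with httplib, making at most three attempts.

    HTTP redirects (301/302/303/307) are followed between attempts; the
    response object is returned on a 200 status, otherwise None.
    """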
    redirectcount = 0
    while redirectcount < 3:
        (type, rest) = urllib.splittype(url)
        (host, path) = urllib.splithost(rest)
        (host, port) = urllib.splitport(host)
        if port is None:
            port = 80
        try:
            conn = httplib.HTTPConnection("%s:%s" %(host, port))
            conn.request(method, path)
            response = conn.getresponse()
            if response.status in [301, 302, 303, 307]:
                headers = response.getheaders()
                for header in headers:
                    if header[0] == "location":
                        url = header[1]
            elif response.status == 200:
                return response
        except:
            pass
        redirectcount = redirectcount + 1
    return None

def parse_and_deliver(maildir, url, statedir):
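    """Fetch one feed and deliver any new or changed items to a maildir.

    Uses dbm databases under statedir to remember per-feed HTTP headers
    (to skip unchanged feeds) and per-item content hashes (to skip items
    that have already been delivered).
    """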
    feedhandle = None
    headers = None
    # first check if we know about this feed already
    feeddb = dbm.open(os.path.join(statedir, "feeds"), "c")
    if feeddb.has_key(url):
        data = feeddb[url]
        data = cgi.parse_qs(data)
        response = open_url("HEAD", url)
        headers = None
        if response:
            headers = response.getheaders()
        ischanged = False
        try:
            for header in headers:
                if header[0] == "content-length":
                    if header[1] != data["content-length"][0]:
                        ischanged = True
                elif header[0] == "etag":
                    if header[1] != data["etag"][0]:
                        ischanged = True
                elif header[0] == "last-modified":
                    if header[1] != data["last-modified"][0]:
                        ischanged = True
                elif header[0] == "content-md5":
                    if header[1] != data["content-md5"][0]:
                        ischanged = True
        except:
            ischanged = True
        if ischanged:
            response = open_url("GET", url)
            if response != None:
                headers = response.getheaders()
                feedhandle = response
            else:
                sys.stderr.write("Failed to fetch feed: %s\n" %(url))
                return
        else:
            return # don't need to do anything, nothing's changed.
    else:
        response = open_url("GET", url)
        if response != None:
            headers = response.getheaders()
            feedhandle = response
        else:
            sys.stderr.write("Failed to fetch feed: %s\n" %(url))
            return

    fp = feedparser.parse(feedhandle)
    db = dbm.open(os.path.join(statedir, "seen"), "c")
    for item in fp["items"]:
        # have we seen it before?
        # need to work out what the content is first...

        if item.has_key("content"):
            content = item["content"][0]["value"]
        else:
            content = item["summary"]

        md5sum = md5.md5(content.encode("utf-8")).hexdigest()

        prevmessageid = None

        # check if there's a guid too - if that exists and we match the md5,
        # skip this item
681         if item.has_key("guid"):
682             if db.has_key(url + "|" + item["guid"]):
683                 data = db[url + "|" + item["guid"]]
684                 data = cgi.parse_qs(data)
685                 if data["contentmd5"][0] == md5sum:
686                     continue
687
688         if db.has_key(url + "|" + item["link"]):
689             data = db[url + "|" + item["link"]]
690             data = cgi.parse_qs(data)
691             if data.has_key("message-id"):
692                 prevmessageid = data["message-id"][0]
693             if data["contentmd5"][0] == md5sum:
694                 continue
695
        try:
            author = item["author"]
        except:
            author = url

        # create a basic email message
        msg = MIMEMultipart("alternative")
        messageid = "<" \
            + datetime.datetime.now().strftime("%Y%m%d%H%M") \
            + "." \
            + "".join( \
                [random.choice( \
                    string.ascii_letters + string.digits \
                    ) for a in range(0,6) \
                ]) + "@" + socket.gethostname() + ">"
        msg.add_header("Message-ID", messageid)
        msg.set_unixfrom("\"%s\" <rss2maildir@localhost>" %(url))
        msg.add_header("From", "\"%s\" <rss2maildir@localhost>" %(author))
        msg.add_header("To", "\"%s\" <rss2maildir@localhost>" %(url))
        if prevmessageid:
            msg.add_header("References", prevmessageid)
        createddate = datetime.datetime.now() \
            .strftime("%a, %e %b %Y %T -0000")
        try:
            createddate = datetime.datetime(*item["updated_parsed"][0:6]) \
                .strftime("%a, %e %b %Y %T -0000")
        except:
            pass
        msg.add_header("Date", createddate)
        subj_gen = HTML2Text()
        subj_gen.feed(item["title"].encode("utf-8"))
        msg.add_header("Subject", subj_gen.gettext())
        msg.set_default_type("text/plain")

        htmlcontent = content.encode("utf-8")
        htmlcontent = "%s\n\n<p>Item URL: <a href='%s'>%s</a></p>" %( \
            content, \
            item["link"], \
            item["link"] )
        htmlpart = MIMEText(htmlcontent.encode("utf-8"), "html", "utf-8")
        textparser = HTML2Text()
        textparser.feed(content.encode("utf-8"))
        textcontent = textparser.gettext()
        textcontent = "%s\n\nItem URL: %s" %( \
            textcontent, \
            item["link"] )
        textpart = MIMEText(textcontent.encode("utf-8"), "plain", "utf-8")
        msg.attach(textpart)
        msg.attach(htmlpart)

        # start by working out the filename we should be writing to; we do
        # this following the normal maildir style rules
        fname = str(os.getpid()) \
            + "." + socket.gethostname() \
            + "." + "".join( \
                [random.choice( \
                    string.ascii_letters + string.digits \
                    ) for a in range(0,10) \
                ]) + "." \
            + datetime.datetime.now().strftime('%s')
        fn = os.path.join(maildir, "tmp", fname)
        fh = open(fn, "w")
        fh.write(msg.as_string())
        fh.close()
        # now move it into the new directory
761         newfn = os.path.join(maildir, "new", fname)
762         os.link(fn, newfn)
763         os.unlink(fn)
764
        # now record the item in the seen database
        if prevmessageid:
            messageid = prevmessageid + " " + messageid
        if item.has_key("guid") and item["guid"] != item["link"]:
            data = urllib.urlencode(( \
                ("message-id", messageid), \
                ("created", createddate), \
                ("contentmd5", md5sum) \
                ))
            db[url + "|" + item["guid"]] = data
            try:
                data = db[url + "|" + item["link"]]
                data = cgi.parse_qs(data)
                newdata = urllib.urlencode(( \
                    ("message-id", messageid), \
                    ("created", data["created"][0]), \
                    ("contentmd5", data["contentmd5"][0]) \
                    ))
                db[url + "|" + item["link"]] = newdata
            except:
                db[url + "|" + item["link"]] = data
        else:
            data = urllib.urlencode(( \
                ("message-id", messageid), \
                ("created", createddate), \
                ("contentmd5", md5sum) \
                ))
            db[url + "|" + item["link"]] = data

    if headers:
        data = []
        for header in headers:
            if header[0] in \
                ["content-md5", "etag", "last-modified", "content-length"]:
                data.append((header[0], header[1]))
        if len(data) > 0:
            data = urllib.urlencode(data)
            feeddb[url] = data

    db.close()
    feeddb.close()

if __name__ == "__main__":
    # This only gets executed if we really called the program
    # first off, parse the command line arguments

    oparser = OptionParser()
    oparser.add_option(
        "-c", "--conf", dest="conf",
        help="location of config file"
        )
    oparser.add_option(
        "-s", "--statedir", dest="statedir",
        help="location of directory to store state in"
        )

    (options, args) = oparser.parse_args()

    # check for the configfile

    configfile = None

    if options.conf is not None:
        # does the file exist?
        try:
            os.stat(options.conf)
            configfile = options.conf
        except:
            # should exit here as the specified file doesn't exist
            sys.stderr.write( \
                "Config file %s does not exist. Exiting.\n" %(options.conf,))
            sys.exit(2)
    else:
        # check through the default locations
        try:
            os.stat("%s/.rss2maildir.conf" %(os.environ["HOME"],))
            configfile = "%s/.rss2maildir.conf" %(os.environ["HOME"],)
        except:
            try:
                os.stat("/etc/rss2maildir.conf")
                configfile = "/etc/rss2maildir.conf"
            except:
                sys.stderr.write("No config file found. Exiting.\n")
                sys.exit(2)

    # Right - if we've got this far, we've got a config file, now for the hard
    # bits...

    scp = SafeConfigParser()
    scp.read(configfile)
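
    # The config file is INI-style: an optional [general] section with
    # state_dir and maildir_root settings, plus one section per feed whose
    # section name is the feed URL; a per-feed "maildir" option overrides
    # the folder name. For example (paths and URL below are illustrative
    # only):
    #
    #   [general]
    #   state_dir = /home/user/.rss2maildir/state
    #   maildir_root = /home/user/RSSMaildir
    #
    #   [http://www.example.com/feed.rss]
    #   maildir = example-feed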

    maildir_root = "RSSMaildir"
    state_dir = "state"

    if options.statedir is not None:
        state_dir = options.statedir
        try:
            mode = os.stat(state_dir)[stat.ST_MODE]
            if not stat.S_ISDIR(mode):
                sys.stderr.write( \
                    "State directory (%s) is not a directory\n" %(state_dir))
                sys.exit(1)
        except:
            # try to make the directory
            try:
                os.mkdir(state_dir)
            except:
872                 sys.stderr.write("Couldn't create statedir %s" %(state_dir))
                sys.exit(1)
    elif scp.has_option("general", "state_dir"):
        new_state_dir = scp.get("general", "state_dir")
        try:
            mode = os.stat(new_state_dir)[stat.ST_MODE]
            if not stat.S_ISDIR(mode):
                sys.stderr.write( \
880                     "State directory (%s) is not a directory\n" %(state_dir))
                sys.exit(1)
            else:
                state_dir = new_state_dir
        except:
            # try to create it
            try:
                os.mkdir(new_state_dir)
                state_dir = new_state_dir
            except:
                sys.stderr.write( \
                    "Couldn't create state directory %s\n" %(new_state_dir))
                sys.exit(1)
    else:
        try:
            mode = os.stat(state_dir)[stat.ST_MODE]
            if not stat.S_ISDIR(mode):
                sys.stderr.write( \
                    "State directory %s is not a directory\n" %(state_dir))
                sys.exit(1)
        except:
            try:
                os.mkdir(state_dir)
            except:
                sys.stderr.write( \
                    "State directory %s could not be created\n" %(state_dir))
                sys.exit(1)

    if scp.has_option("general", "maildir_root"):
        maildir_root = scp.get("general", "maildir_root")

    try:
        mode = os.stat(maildir_root)[stat.ST_MODE]
        if not stat.S_ISDIR(mode):
            sys.stderr.write( \
                "Maildir Root %s is not a directory\n" \
                %(maildir_root))
            sys.exit(1)
    except:
        try:
            os.mkdir(maildir_root)
        except:
            sys.stderr.write("Couldn't create Maildir Root %s\n" \
                %(maildir_root))
            sys.exit(1)

    feeds = scp.sections()
    try:
        feeds.remove("general")
    except:
        pass

    for section in feeds:
        # check if the directory exists
        maildir = None
        try:
            maildir = scp.get(section, "maildir")
        except:
            maildir = section

        maildir = urllib.urlencode(((section, maildir),)).split("=")[1]
        maildir = os.path.join(maildir_root, maildir)

        try:
            exists = os.stat(maildir)
            if stat.S_ISDIR(exists[stat.ST_MODE]):
                # check if there's a new, cur and tmp directory
                try:
                    mode = os.stat(os.path.join(maildir, "cur"))[stat.ST_MODE]
                    if not stat.S_ISDIR(mode):
                        sys.stderr.write("Broken maildir: %s\n" %(maildir))
                except:
                    os.mkdir(os.path.join(maildir, "cur"))
                try:
                    mode = os.stat(os.path.join(maildir, "tmp"))[stat.ST_MODE]
                    if not stat.S_ISDIR(mode):
                        sys.stderr.write("Broken maildir: %s\n" %(maildir))
                except:
                    os.mkdir(os.path.join(maildir, "tmp"))
                try:
                    mode = os.stat(os.path.join(maildir, "new"))[stat.ST_MODE]
                    if not stat.S_ISDIR(mode):
                        sys.stderr.write("Broken maildir: %s\n" %(maildir))
                except:
                    os.mkdir(os.path.join(maildir, "new"))
            else:
                sys.stderr.write("Broken maildir: %s\n" %(maildir))
        except:
            try:
                os.mkdir(maildir)
            except:
                sys.stderr.write("Couldn't create root maildir %s\n" \
                    %(maildir))
                sys.exit(1)
            try:
                os.mkdir(os.path.join(maildir, "new"))
                os.mkdir(os.path.join(maildir, "cur"))
                os.mkdir(os.path.join(maildir, "tmp"))
            except:
                sys.stderr.write( \
                    "Couldn't create required maildir directories for %s\n" \
                    %(section,))
                sys.exit(1)

        # right - we've got the directories, we've got the section, we know
        # the URL... let's play!

        parse_and_deliver(maildir, section, state_dir)