* fix README to have a more complete config example
[rss2maildir.git] / rss2maildir.py
1 #!/usr/bin/python
2 # coding=utf-8
3
4 # rss2maildir.py - RSS feeds to Maildir 1 email per item
5 # Copyright (C) 2007  Brett Parker <iDunno@sommitrealweird.co.uk>
6
7 # This program is free software: you can redistribute it and/or modify
8 # it under the terms of the GNU General Public License as published by
9 # the Free Software Foundation, either version 3 of the License, or
10 # (at your option) any later version.
11
12 # This program is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 # GNU General Public License for more details.
16
17 # You should have received a copy of the GNU General Public License
18 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
19
20 import sys
21 import os
22 import stat
23 import httplib
24 import urllib
25
26 import feedparser
27
28 from email.MIMEMultipart import MIMEMultipart
29 from email.MIMEText import MIMEText
30
31 import datetime
32 import random
33 import string
34 import textwrap
35
36 import socket
37
38 from optparse import OptionParser
39 from ConfigParser import SafeConfigParser
40
41 from base64 import b64encode
42 import md5
43
44 import cgi
45 import dbm
46
47 from HTMLParser import HTMLParser
48
class HTML2Text(HTMLParser):
    """Render a fragment of HTML as wrapped plain text.

    Headings (h1-h6) are emitted with an underline, paragraphs are wrapped
    to textwidth columns, blockquotes are prefixed with "> ", list items
    become " * " bullets and preformatted text is copied through untouched.
    Any other tag is ignored and only its text content is kept.

    Usage: feed() the markup, then call gettext().  Text is held as unicode
    throughout; byte strings handed to the parser are decoded as UTF-8.
    """

    # replacement text for the named entities we understand
    entities = {
        "amp": u"&",
        "lt": u"<",
        "gt": u">",
        "pound": u"\xa3",
        "copy": u"\xa9",
        "apos": u"'",
        "quot": u"\"",
        "nbsp": u" ",
        }

    def __init__(self,textwidth=70):
        """Create a converter wrapping its output at textwidth columns."""
        self.inheadingone = False
        self.inheadingtwo = False
        self.inotherheading = False
        # text seen before any tag is treated as an implicit paragraph
        self.inparagraph = True
        self.inblockquote = False
        self.inlink = False
        self.text = u''
        self.currentparagraph = u''
        self.headingtext = u''
        self.blockquote = u''
        self.inpre = False
        self.inul = False
        self.initem = False
        self.item = u''
        self.textwidth = textwidth
        HTMLParser.__init__(self)

    @staticmethod
    def _tounicode(data):
        """Return data as unicode, decoding byte strings as UTF-8."""
        if isinstance(data, type(u'')):
            return data
        return data.decode("utf-8")

    def _flush_paragraph(self):
        """Append the wrapped pending paragraph text to the output."""
        self.text = self.text \
            + u'\n'.join(textwrap.wrap(self.currentparagraph, self.textwidth))
        self.currentparagraph = u''

    def _flush_blockquote(self):
        """Append the pending blockquote text, quoted with "> "."""
        self.text = self.text \
            + u'\n> ' \
            + u'\n> '.join( \
                [a.strip() \
                    for a in textwrap.wrap( \
                        self.blockquote, self.textwidth - 2)] \
                ) \
            + u'\n'
        self.blockquote = u''

    def _flush_item(self):
        """Append the pending list item as a bullet, if it has any text."""
        if self.item.strip() != u'':
            self.text = self.text \
                + u' * ' \
                + u'\n   '.join( \
                    [a.strip() for a in \
                        textwrap.wrap(self.item, self.textwidth - 3)]) \
                + u'\n'
        self.item = u''

    def _end_heading(self, underline):
        """Emit the collected heading underlined with the given character."""
        heading = self.headingtext.strip()
        # measure the unicode text so the underline matches what is shown
        self.text = self.text \
            + u'\n\n' \
            + heading \
            + u'\n' \
            + underline * len(heading)
        self.headingtext = u''

    def _append_inline(self, chars):
        """Add already-decoded characters to whichever buffer is active.

        The buffers are tried in the same order handle_data() uses so that
        an entity always lands next to the text surrounding it.
        """
        if self.inheadingone or self.inheadingtwo or self.inotherheading:
            self.headingtext = self.headingtext + chars
        elif self.inblockquote:
            self.blockquote = self.blockquote + chars
        elif self.initem:
            self.item = self.item + chars
        elif self.inparagraph:
            self.currentparagraph = self.currentparagraph + chars
        else:
            self.text = self.text + chars

    def handle_starttag(self, tag, attrs):
        """Update the parser state for an opening tag."""
        tag = tag.lower()
        if tag in ["h1", "h2", "h3", "h4", "h5", "h6"]:
            # don't lose text that was waiting in the implicit paragraph
            self._flush_paragraph()
            if tag == "h1":
                self.inheadingone = True
            elif tag == "h2":
                self.inheadingtwo = True
            else:
                self.inotherheading = True
            self.inparagraph = False
        elif tag == "a":
            self.inlink = True
        elif tag == "br":
            self.handle_br()
        elif tag == "blockquote":
            # emit pending paragraph text first so output stays in order
            self._flush_paragraph()
            self.inblockquote = True
            self.text = self.text + u'\n'
        elif tag == "p":
            # finish an unclosed/implicit paragraph *before* adding the
            # separator, otherwise it runs straight into the new one
            if self.inparagraph:
                self._flush_paragraph()
            if self.text != u'':
                self.text = self.text + u'\n\n'
            self.currentparagraph = u''
            self.inparagraph = True
        elif tag == "pre":
            self._flush_paragraph()
            self.text = self.text + u'\n'
            self.inpre = True
            self.inparagraph = False
            self.inblockquote = False
        elif tag == "ul":
            self._flush_paragraph()
            self.item = u''
            self.inul = True
            self.text = self.text + u'\n'
        elif tag == "li":
            if self.initem:
                # previous <li> was never closed - emit it now
                self._flush_item()
            self.item = u''
            self.initem = True

    def handle_startendtag(self, tag, attrs):
        """Handle self-closing tags; only <br/> has any effect."""
        if tag.lower() == "br":
            self.handle_br()

    def handle_br(self):
        """Force a line break in the paragraph or blockquote being built."""
        if self.inparagraph:
            self._flush_paragraph()
            self.text = self.text + u'\n'
        elif self.inblockquote:
            self._flush_blockquote()
        else:
            self.text = self.text + u'\n'

    def handle_endtag(self, tag):
        """Emit whatever the closing tag completes."""
        tag = tag.lower()
        if tag == "h1":
            self.inheadingone = False
            self._end_heading(u'=')
        elif tag == "h2":
            self.inheadingtwo = False
            self._end_heading(u'-')
        elif tag in ["h3", "h4", "h5", "h6"]:
            self.inotherheading = False
            self._end_heading(u'~')
        elif tag == "a":
            self.inlink = False
        elif tag == "p":
            self._flush_paragraph()
            self.inparagraph = False
        elif tag == "blockquote":
            self._flush_blockquote()
            self.inblockquote = False
        elif tag == "pre":
            self.inpre = False
        elif tag == "li":
            self.initem = False
            self._flush_item()
        elif tag == "ul":
            self.inul = False

    def handle_data(self, data):
        """Collect text into the buffer of the innermost open element."""
        data = self._tounicode(data)
        if self.inheadingone or self.inheadingtwo or self.inotherheading:
            self.headingtext = self.headingtext + data.strip() + u' '
        elif self.inblockquote:
            self.blockquote = self.blockquote + data.strip() + u' '
        elif self.initem:
            self.item = self.item + data
        elif self.inparagraph:
            self.currentparagraph = self.currentparagraph \
                + data.strip() \
                + u' '
        elif self.inpre:
            # preformatted text keeps its whitespace exactly
            self.text = self.text + data
        elif data.strip() != u'':
            self.text = self.text + data.strip() + u' '

    def handle_entityref(self, name):
        """Translate a named entity; unknown names are kept as "&name;"."""
        key = name.lower()
        if key in HTML2Text.entities:
            entity = HTML2Text.entities[key]
        else:
            entity = u"&" + self._tounicode(name) + u";"
        self._append_inline(entity)

    def handle_charref(self, name):
        """Translate a numeric reference such as "#233" or "#xE9".

        name arrives without the leading "&#".  References that cannot be
        converted are kept in the output as written.
        """
        try:
            if name[:1] in ("x", "X"):
                code = int(name[1:], 16)
            else:
                code = int(name)
            try:
                char = unichr(code)
            except NameError:
                # no unichr on this interpreter; chr already yields unicode
                char = chr(code)
        except (ValueError, OverflowError):
            char = u"&#" + self._tounicode(name) + u";"
        self._append_inline(char)

    def gettext(self):
        """Return the text converted so far, always ending in a newline.

        A paragraph that is still open is included; the parser state is
        left untouched.
        """
        data = self.text
        if self.inparagraph:
            data = data \
                + u'\n'.join( \
                    textwrap.wrap(self.currentparagraph, self.textwidth))
        # endswith() is safe on the empty string, unlike data[-1]
        if not data.endswith(u'\n'):
            data = data + u'\n'
        return data
259
def open_url(method, url):
    """Issue an HTTP(S) request for url and return the response.

    method is the HTTP verb to use ("GET", "HEAD", ...).  Redirects (301,
    302, 303, 307) are followed and failed attempts are retried, up to
    three requests in total.

    Returns the httplib response object for a 200 reply, or None when the
    URL has no host or no attempt produced a 200.  The caller owns the
    returned response and is responsible for reading it.
    """
    redirectcount = 0
    while redirectcount < 3:
        (scheme, rest) = urllib.splittype(url)
        (host, path) = urllib.splithost(rest)
        if not host:
            # not an absolute URL - there is nothing to connect to
            return None
        (host, port) = urllib.splitport(host)
        if not path:
            # "http://example.com" has no path; a request line needs one
            path = "/"
        if scheme == "https":
            connclass = httplib.HTTPSConnection
            defaultport = 443
        else:
            connclass = httplib.HTTPConnection
            defaultport = 80
        if port is None:
            port = defaultport
        conn = None
        try:
            conn = connclass("%s:%s" %(host, port))
            conn.request(method, path)
            response = conn.getresponse()
            if response.status == 200:
                return response
            if response.status in [301, 302, 303, 307]:
                location = response.getheader("location")
                if location:
                    url = location
            # this response is not handed back, so release the socket
            conn.close()
        except (httplib.HTTPException, socket.error):
            # best effort: count it as a failed attempt and try again
            if conn is not None:
                conn.close()
        redirectcount = redirectcount + 1
    return None
283
# response headers remembered between runs to detect an unchanged feed
_VALIDATOR_HEADERS = ["content-md5", "etag", "last-modified", "content-length"]


def _item_get(item, key, default=None):
    """Return item[key], or default when the feed item lacks that key."""
    if item.has_key(key):
        return item[key]
    return default


def _random_token(length):
    """Return a string of length random ASCII letters and digits."""
    return "".join( \
        [random.choice(string.ascii_letters + string.digits) \
            for unused in range(0, length)])


def _feed_unchanged(url, stored):
    """Return True if a HEAD request proves the feed has not changed.

    stored maps validator header names to one-element lists, as produced
    by cgi.parse_qs() on the record saved after the previous fetch.
    """
    response = open_url("HEAD", url)
    if response is None:
        return False
    compared = False
    for (name, value) in response.getheaders():
        if name not in _VALIDATOR_HEADERS:
            continue
        try:
            if value != stored[name][0]:
                return False
        except (KeyError, IndexError):
            # a validator we have no record of - assume it changed
            return False
        compared = True
    # with no validator at all we cannot know, so fetch to be safe
    return compared


def _write_to_maildir(maildir, msg):
    """Write msg into maildir/tmp and then move it into maildir/new."""
    # work out the filename we should be writing to, following the normal
    # maildir style rules
    fname = str(os.getpid()) \
        + "." + socket.gethostname() \
        + "." + _random_token(10) \
        + "." + datetime.datetime.now().strftime('%s')
    fn = os.path.join(maildir, "tmp", fname)
    fh = open(fn, "w")
    try:
        fh.write(msg.as_string())
    finally:
        fh.close()
    # now move it in to the new directory
    newfn = os.path.join(maildir, "new", fname)
    os.link(fn, newfn)
    os.unlink(fn)


def _deliver_item(maildir, url, item, db):
    """Deliver one feed item unless an identical copy was delivered before.

    db is the open "seen" database.  Records are keyed by
    "<feed url>|<guid>" and "<feed url>|<link>" and hold the message-id,
    the creation date and the md5 of the content.
    """
    # need to work out what the content is first...
    if item.has_key("content"):
        content = item["content"][0]["value"]
    else:
        content = _item_get(item, "summary", u"")

    md5sum = md5.md5(content.encode("utf-8")).hexdigest()

    guid = _item_get(item, "guid")
    link = _item_get(item, "link")
    # items without a link are tracked by guid, or failing that by content
    linkkey = url + "|" + (link or guid or md5sum)

    prevmessageid = None

    # check if there's a guid too - if that exists and we match the md5,
    # there is nothing to do
    if guid is not None and db.has_key(url + "|" + guid):
        seen = cgi.parse_qs(db[url + "|" + guid])
        if seen.get("contentmd5", [None])[0] == md5sum:
            return

    if db.has_key(linkkey):
        seen = cgi.parse_qs(db[linkkey])
        if seen.has_key("message-id"):
            prevmessageid = seen["message-id"][0]
        if seen.get("contentmd5", [None])[0] == md5sum:
            return

    author = _item_get(item, "author", url)

    # create a basic email message
    msg = MIMEMultipart("alternative")
    messageid = "<" \
        + datetime.datetime.now().strftime("%Y%m%d%H%M") \
        + "." \
        + _random_token(6) \
        + "@" + socket.gethostname() + ">"
    msg.add_header("Message-ID", messageid)
    msg.set_unixfrom("\"%s\" <rss2maildir@localhost>" %(url))
    msg.add_header("From", "\"%s\" <rss2maildir@localhost>" %(author))
    msg.add_header("To", "\"%s\" <rss2maildir@localhost>" %(url))
    if prevmessageid:
        # thread an updated item under the message(s) sent for it earlier
        msg.add_header("References", prevmessageid)
    # the header claims a -0000 offset, so the fallback must be UTC too
    createddate = datetime.datetime.utcnow() \
        .strftime("%a, %e %b %Y %T -0000")
    try:
        createddate = datetime.datetime(*item["updated_parsed"][0:6]) \
            .strftime("%a, %e %b %Y %T -0000")
    except (KeyError, TypeError, ValueError):
        # no usable date in the feed; keep the current time
        pass
    msg.add_header("Date", createddate)
    msg.add_header("Subject", _item_get(item, "title", ""))
    msg.set_default_type("text/plain")

    textparser = HTML2Text()
    textparser.feed(content.encode("utf-8"))
    textcontent = textparser.gettext()
    htmlcontent = content
    if link:
        # the link comes from the feed, so escape it before putting it
        # into markup
        escapedlink = cgi.escape(link, True)
        htmlcontent = "%s\n\n<p>Item URL: <a href=\"%s\">%s</a></p>" %( \
            content, \
            escapedlink, \
            escapedlink )
        textcontent = "%s\n\nItem URL: %s" %( \
            textcontent, \
            link )
    textpart = MIMEText(textcontent.encode("utf-8"), "plain", "utf-8")
    htmlpart = MIMEText(htmlcontent.encode("utf-8"), "html", "utf-8")
    msg.attach(textpart)
    msg.attach(htmlpart)

    _write_to_maildir(maildir, msg)

    # now add to the database about the item
    if prevmessageid:
        messageid = prevmessageid + " " + messageid
    record = urllib.urlencode(( \
        ("message-id", messageid), \
        ("created", createddate), \
        ("contentmd5", md5sum) \
        ))
    if guid is not None and guid != link:
        db[url + "|" + guid] = record
        try:
            # an existing link record keeps its own date and checksum and
            # only learns about the new message-id
            old = cgi.parse_qs(db[linkkey])
            record = urllib.urlencode(( \
                ("message-id", messageid), \
                ("created", old["created"][0]), \
                ("contentmd5", old["contentmd5"][0]) \
                ))
        except (KeyError, IndexError):
            # no (usable) link record yet - store the fresh one
            pass
    db[linkkey] = record


def parse_and_deliver(maildir, url, statedir):
    """Fetch the feed at url and deliver new or changed items to maildir.

    maildir must already contain the tmp and new subdirectories.  statedir
    holds two dbm databases: "feeds" remembers the cache validators of
    each feed so an unchanged feed is not downloaded again, and "seen"
    remembers which items have been delivered.

    Returns None.  A feed that cannot be fetched is reported on stderr
    and skipped.
    """
    feeddb = dbm.open(os.path.join(statedir, "feeds"), "c")
    try:
        # first check if we know about this feed already
        if feeddb.has_key(url):
            stored = cgi.parse_qs(feeddb[url])
            if _feed_unchanged(url, stored):
                return # don't need to do anything, nothing has changed.

        response = open_url("GET", url)
        if response is None:
            sys.stderr.write("Failed to fetch feed: %s\n" %(url))
            return
        headers = response.getheaders()

        fp = feedparser.parse(response)
        db = dbm.open(os.path.join(statedir, "seen"), "c")
        try:
            for item in fp["items"]:
                _deliver_item(maildir, url, item, db)
        finally:
            db.close()

        # remember the validators so the next run can start with a HEAD
        validators = [(header[0], header[1]) for header in headers \
            if header[0] in _VALIDATOR_HEADERS]
        if len(validators) > 0:
            feeddb[url] = urllib.urlencode(validators)
    finally:
        feeddb.close()
471
def find_config_file(requested=None):
    """Return the path of the config file to use, or None if there is none.

    requested is the path given on the command line, if any; when it is
    None the default locations ($HOME/.rss2maildir.conf, then
    /etc/rss2maildir.conf) are tried in that order.
    """
    if requested is not None:
        if os.path.exists(requested):
            return requested
        return None
    candidates = []
    home = os.environ.get("HOME")
    if home is not None:
        candidates.append("%s/.rss2maildir.conf" %(home,))
    candidates.append("/etc/rss2maildir.conf")
    for candidate in candidates:
        if os.path.exists(candidate):
            return candidate
    return None


def ensure_directory(path, notdir_message, create_message):
    """Make sure path is a directory, creating it when it is missing.

    Both messages are format strings taking the path.  notdir_message is
    written to stderr when path exists but is not a directory,
    create_message when it cannot be created; either way the program
    exits with status 1.
    """
    try:
        mode = os.stat(path)[stat.ST_MODE]
    except OSError:
        # nothing there yet - try to make the directory
        try:
            os.mkdir(path)
        except OSError:
            sys.stderr.write(create_message %(path,))
            sys.exit(1)
        return
    if not stat.S_ISDIR(mode):
        sys.stderr.write(notdir_message %(path,))
        sys.exit(1)


def ensure_maildir(maildir, section):
    """Make sure maildir exists with its new, cur and tmp subdirectories.

    section is the feed name used in error messages.  Returns True when
    the maildir is usable and False (after reporting it on stderr) when
    something that should be a directory is not.  Exits with status 1 if
    a brand new maildir cannot be created.
    """
    try:
        mode = os.stat(maildir)[stat.ST_MODE]
    except OSError:
        # no maildir yet - build the whole thing
        try:
            os.mkdir(maildir)
        except OSError:
            sys.stderr.write("Couldn't create root maildir %s\n" \
                %(maildir,))
            sys.exit(1)
        try:
            for subdir in ("new", "cur", "tmp"):
                os.mkdir(os.path.join(maildir, subdir))
        except OSError:
            sys.stderr.write( \
                "Couldn't create required maildir directories for %s\n" \
                %(section,))
            sys.exit(1)
        return True

    if not stat.S_ISDIR(mode):
        sys.stderr.write("Broken maildir: %s\n" %(maildir,))
        return False

    # check if there's a new, cur and tmp directory
    usable = True
    for subdir in ("cur", "tmp", "new"):
        path = os.path.join(maildir, subdir)
        try:
            mode = os.stat(path)[stat.ST_MODE]
        except OSError:
            os.mkdir(path)
            continue
        if not stat.S_ISDIR(mode):
            sys.stderr.write("Broken maildir: %s\n" %(maildir,))
            usable = False
    return usable


if __name__ == "__main__":
    # This only gets executed if we really called the program
    # first off, parse the command line arguments

    oparser = OptionParser()
    oparser.add_option(
        "-c", "--conf", dest="conf",
        help="location of config file"
        )
    oparser.add_option(
        "-s", "--statedir", dest="statedir",
        help="location of directory to store state in"
        )

    (options, args) = oparser.parse_args()

    # check for the configfile

    configfile = find_config_file(options.conf)
    if configfile is None:
        if options.conf is not None:
            # should exit here as the specified file doesn't exist
            sys.stderr.write( \
                "Config file %s does not exist. Exiting.\n" %(options.conf,))
        else:
            sys.stderr.write("No config file found. Exiting.\n")
        sys.exit(2)

    # Right - if we've got this far, we've got a config file, now for the hard
    # bits...

    scp = SafeConfigParser()
    scp.read(configfile)

    maildir_root = "RSSMaildir"
    state_dir = "state"

    # the command line wins over the config file, which wins over the default
    if options.statedir is not None:
        state_dir = options.statedir
        ensure_directory(state_dir, \
            "State directory (%s) is not a directory\n", \
            "Couldn't create statedir %s\n")
    elif scp.has_option("general", "state_dir"):
        state_dir = scp.get("general", "state_dir")
        ensure_directory(state_dir, \
            "State directory (%s) is not a directory\n", \
            "Couldn't create state directory %s\n")
    else:
        ensure_directory(state_dir, \
            "State directory %s is not a directory\n", \
            "State directory %s could not be created\n")

    if scp.has_option("general", "maildir_root"):
        maildir_root = scp.get("general", "maildir_root")

    ensure_directory(maildir_root, \
        "Maildir Root %s is not a directory\n", \
        "Couldn't create Maildir Root %s\n")

    # every section other than "general" describes one feed, and the
    # section name is the feed's URL
    feeds = scp.sections()
    try:
        feeds.remove("general")
    except ValueError:
        pass

    for section in feeds:
        if scp.has_option(section, "maildir"):
            maildir = scp.get(section, "maildir")
        else:
            maildir = section

        # quote the name so that a URL becomes a single path component
        maildir = urllib.quote_plus(maildir)
        maildir = os.path.join(maildir_root, maildir)

        # right - if we've got the directories, we've got the section, we
        # know the url... lets play!
        if ensure_maildir(maildir, section):
            parse_and_deliver(maildir, section, state_dir)