c9a2cee8418f92202f833ec270748afb5dafdbad
[rss2maildir.git] / rss2maildir.py
1 #!/usr/bin/python
2 # coding=utf-8
3
4 # rss2maildir.py - RSS feeds to Maildir 1 email per item
5 # Copyright (C) 2007  Brett Parker <iDunno@sommitrealweird.co.uk>
6
7 # This program is free software: you can redistribute it and/or modify
8 # it under the terms of the GNU General Public License as published by
9 # the Free Software Foundation, either version 3 of the License, or
10 # (at your option) any later version.
11
12 # This program is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 # GNU General Public License for more details.
16
17 # You should have received a copy of the GNU General Public License
18 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
19
20 import sys
21 import os
22 import stat
23 import httplib
24 import urllib
25
26 import feedparser
27
28 from email.MIMEMultipart import MIMEMultipart
29 from email.MIMEText import MIMEText
30
31 import datetime
32 import random
33 import string
34 import textwrap
35
36 import socket
37
38 from optparse import OptionParser
39 from ConfigParser import SafeConfigParser
40
41 from base64 import b64encode
42 import md5
43
44 import cgi
45 import dbm
46
47 from HTMLParser import HTMLParser
48
# Replacement text for the named character references that the HTML to
# text converter knows about, keyed by reference name.
entities = dict(
    amp="&",
    lt="<",
    gt=">",
    pound="£",
    copy="©",
    apos="'",
    quot='"',
    nbsp=" ",
    )
59
class HTML2Text(HTMLParser):
    """Render an HTML fragment as wrapped plain text.

    Feed markup with feed() and read the result with gettext().
    Paragraphs are wrapped at 70 columns, h1/h2/h3-h6 headings are
    underlined with "=", "-" and "~", blockquotes are prefixed with
    "> " and unordered list items with " * ".  Byte strings handed to
    the handlers are decoded as UTF-8; unicode strings are used as is.
    All accumulated text is kept as unicode.
    """

    def __init__(self):
        # which element the parser is currently inside
        self.inheadingone = False
        self.inheadingtwo = False
        self.inotherheading = False
        # text outside any markup is treated as an implicit paragraph
        self.inparagraph = True
        self.inblockquote = False
        self.inlink = False
        self.inpre = False
        self.inul = False
        self.initem = False
        # finished output, and the buffers for the element being collected
        self.text = u''
        self.currentparagraph = u''
        self.headingtext = u''
        self.blockquote = u''
        self.item = u''
        HTMLParser.__init__(self)

    def _decode(self, value):
        """Return value as unicode, decoding byte strings as UTF-8."""
        if isinstance(value, type(u'')):
            return value
        return value.decode("utf-8")

    def _character(self, codepoint):
        """Return the unicode character for codepoint.

        Raises ValueError for values that are not usable code points.
        """
        if codepoint <= 0 or codepoint > 0x10FFFF \
            or 0xD800 <= codepoint <= 0xDFFF:
            raise ValueError("not a character: %r" %(codepoint,))
        # going through an escape sequence rather than unichr() also
        # works for code points above 0xFFFF on narrow Python builds
        return (u"\\U%08x" % codepoint).encode("ascii") \
            .decode("unicode-escape")

    def _append(self, padded, verbatim):
        """Add text to whichever buffer the current state selects.

        padded is used where whitespace is normalised (headings,
        blockquotes, paragraphs and loose text); verbatim is used for
        list items and preformatted text.
        """
        if self.inheadingone or self.inheadingtwo or self.inotherheading:
            self.headingtext = self.headingtext + padded
        elif self.inblockquote:
            self.blockquote = self.blockquote + padded
        elif self.inparagraph:
            self.currentparagraph = self.currentparagraph + padded
        elif self.inul and self.initem:
            self.item = self.item + verbatim
        elif self.inpre:
            self.text = self.text + verbatim
        else:
            self.text = self.text + padded

    def _flush_paragraph(self):
        """Move buffered paragraph text into the output."""
        if self.inparagraph and self.currentparagraph:
            self.text = self.text \
                + u'\n'.join(textwrap.wrap(self.currentparagraph, 70))
        self.currentparagraph = u''

    def _flush_item(self):
        """Write the buffered list item as a " * " bullet."""
        self.text = self.text \
            + u' * ' \
            + u'\n   '.join( \
                [a.strip() for a in textwrap.wrap(self.item, 67)]) \
            + u'\n'
        self.item = u''

    def _end_heading(self, underline):
        """Write the buffered heading followed by a line of underline."""
        self.text = self.text \
            + u'\n\n' \
            + self.headingtext \
            + u'\n' \
            + underline * len(self.headingtext.strip())
        self.headingtext = u''

    def handle_starttag(self, tag, attrs):
        tag = tag.lower()
        if tag == "h1":
            # text collected so far must be written out before the
            # paragraph state is dropped, otherwise it is lost
            self._flush_paragraph()
            self.inheadingone = True
            self.inparagraph = False
        elif tag == "h2":
            self._flush_paragraph()
            self.inheadingtwo = True
            self.inparagraph = False
        elif tag in ["h3", "h4", "h5", "h6"]:
            self._flush_paragraph()
            self.inotherheading = True
            self.inparagraph = False
        elif tag == "a":
            self.inlink = True
        elif tag == "br":
            self.handle_br()
        elif tag == "blockquote":
            self.inblockquote = True
            self.text = self.text + u'\n'
        elif tag == "p":
            # finish an unclosed paragraph first so that the blank line
            # ends up between the two paragraphs
            self._flush_paragraph()
            if self.text != u'':
                self.text = self.text + u'\n\n'
            self.inparagraph = True
        elif tag == "pre":
            self._flush_paragraph()
            self.text = self.text + u'\n'
            self.inpre = True
            self.inparagraph = False
            self.inblockquote = False
        elif tag == "ul":
            self.item = u''
            self.inul = True
            self.text = self.text + u'\n'
        elif tag == "li" and self.inul:
            if not self.initem:
                self.initem = True
                self.item = u''
            else:
                # previous <li> was never closed
                self._flush_item()

    def handle_startendtag(self, tag, attrs):
        if tag.lower() == "br":
            self.handle_br()

    def handle_br(self):
        """Break the line, writing out whatever the current buffer holds."""
        if self.inparagraph:
            self.text = self.text \
                + u'\n'.join(textwrap.wrap(self.currentparagraph, 70)) \
                + u'\n'
            self.currentparagraph = u''
        elif self.inblockquote:
            self.text = self.text \
                + u'\n> ' \
                + u'\n> '.join(textwrap.wrap(self.blockquote, 68)) \
                + u'\n'
            self.blockquote = u''
        else:
            self.text = self.text + u'\n'

    def handle_endtag(self, tag):
        tag = tag.lower()
        if tag == "h1":
            self.inheadingone = False
            self._end_heading(u'=')
        elif tag == "h2":
            self.inheadingtwo = False
            self._end_heading(u'-')
        elif tag in ["h3", "h4", "h5", "h6"]:
            self.inotherheading = False
            self._end_heading(u'~')
        elif tag == "p":
            self.text = self.text \
                + u'\n'.join(textwrap.wrap(self.currentparagraph, 70))
            self.inparagraph = False
            self.currentparagraph = u''
        elif tag == "blockquote":
            self.text = self.text \
                + u'\n> ' \
                + u'\n> '.join( \
                    [a.strip() \
                        for a in textwrap.wrap(self.blockquote, 68)]) \
                + u'\n'
            self.inblockquote = False
            self.blockquote = u''
        elif tag == "pre":
            self.inpre = False
        elif tag == "li":
            self.initem = False
            if self.item != u'':
                self._flush_item()
            self.item = u''
        elif tag == "ul":
            self.inul = False

    def handle_data(self, data):
        text = self._decode(data)
        self._append(text.strip() + u' ', text)

    def handle_entityref(self, name):
        """Add the replacement for a named reference such as &amp;.

        Names missing from the entities table are written back out
        unchanged as "&name;".
        """
        if name.startswith("#"):
            # numeric references normally arrive through handle_charref
            self.handle_charref(name[1:])
            return
        key = name.lower()
        if key in entities:
            entity = self._decode(entities[key])
        else:
            entity = u"&" + self._decode(name) + u";"
        self._append(entity, entity)

    def handle_charref(self, name):
        """Add the character for a numeric reference (&#163; or &#xa3;).

        References that do not name a usable character are written
        back out unchanged.
        """
        try:
            if name[:1] in ("x", "X"):
                codepoint = int(name[1:], 16)
            else:
                codepoint = int(name)
            char = self._character(codepoint)
        except ValueError:
            char = u"&#" + self._decode(name) + u";"
        self._append(char, char)

    def gettext(self):
        """Return the text so far, including an unfinished paragraph."""
        data = self.text
        if self.inparagraph:
            data = data \
                + u'\n'.join(textwrap.wrap(self.currentparagraph, 70))
        return data
254
def open_url(method, url):
    """Send an HTTP request for url and return the response on success.

    method is the HTTP verb to use (this file calls it with "HEAD" and
    "GET").  Redirects (301, 302, 303, 307) are followed and failed
    attempts are retried, for at most three requests in total.  https
    URLs are requested over an HTTPS connection when httplib provides
    one.

    Returns the httplib response object as soon as a request is answered
    with status 200.  Returns None if no attempt succeeds, or if url has
    no host part to connect to.
    """
    # imported here because nothing else in this file needs it
    import urlparse

    attempts = 0
    while attempts < 3:
        attempts = attempts + 1
        (scheme, rest) = urllib.splittype(url)
        (host, path) = urllib.splithost(rest or "")
        if not host:
            # not an absolute URL, there is nothing to connect to
            return None
        (host, port) = urllib.splitport(host)
        if scheme == "https":
            # the class is missing when Python was built without SSL
            connect = getattr(httplib, "HTTPSConnection", None)
            defaultport = 443
        else:
            connect = httplib.HTTPConnection
            defaultport = 80
        if connect is None:
            return None
        if port is None:
            port = defaultport

        conn = None
        try:
            conn = connect("%s:%s" %(host, port))
            # an URL without a path still needs a request target
            conn.request(method, path or "/")
            response = conn.getresponse()
        except (httplib.HTTPException, socket.error):
            # count it as a failed attempt and try again
            if conn is not None:
                conn.close()
            continue

        if response.status == 200:
            # the caller reads from the response, so the connection
            # has to stay open
            return response

        location = None
        if response.status in [301, 302, 303, 307]:
            for header in response.getheaders():
                if header[0] == "location":
                    location = header[1]
        conn.close()
        if location:
            # Location may be relative to the URL just requested
            url = urlparse.urljoin(url, location)
    return None
278
# Response headers that are stored for each feed and compared on later
# runs to decide whether the feed has to be fetched again.
_FEED_VALIDATORS = ("content-md5", "etag", "last-modified", "content-length")


def _feed_changed(stored, headers):
    """Return False only when headers show the feed matches stored.

    stored is the parsed record saved for the feed; headers is the list
    of (name, value) pairs from a HEAD request, or None if that request
    failed.  The feed counts as changed when there are no headers, when
    none of the tracked headers is present, or when any tracked header
    differs from (or is missing in) the stored record.
    """
    if not headers:
        return True
    compared = False
    for header in headers:
        if header[0] in _FEED_VALIDATORS:
            compared = True
            if stored.get(header[0], [None])[0] != header[1]:
                return True
    return not compared


def _db_key(url, identifier):
    """Return the byte string key under which an item is recorded."""
    key = url + "|" + identifier
    if isinstance(key, unicode):
        key = key.encode("utf-8")
    return key


def _random_token(length):
    """Return length random ASCII letters and digits."""
    return "".join( \
        [random.choice(string.ascii_letters + string.digits) \
            for a in range(0, length)])


def _deliver_item(maildir, url, db, item):
    """Write one feed item to maildir unless it was delivered unchanged.

    db is the open "seen" database.  An item whose stored content
    checksum matches is skipped; otherwise a multipart (plain text and
    HTML) message is written to maildir/tmp, linked into maildir/new,
    and the item is recorded in db.
    """
    # work out what the content is first, the checksum depends on it
    contents = item.get("content")
    if contents:
        content = contents[0]["value"]
    else:
        content = item.get("summary", u"")

    md5sum = md5.md5(content.encode("utf-8")).hexdigest()

    guid = item.get("guid")
    link = item.get("link")
    # items are recorded under their link; fall back to something else
    # that identifies the item when the feed does not supply one
    linkkey = _db_key(url, link or guid or md5sum)
    guidkey = None
    if guid:
        guidkey = _db_key(url, guid)

    # if the guid is known and the content is the same, nothing to do
    if guidkey is not None and db.has_key(guidkey):
        seen = cgi.parse_qs(db[guidkey])
        if seen.get("contentmd5", [None])[0] == md5sum:
            return

    prevmessageid = None
    if db.has_key(linkkey):
        seen = cgi.parse_qs(db[linkkey])
        if "message-id" in seen:
            prevmessageid = seen["message-id"][0]
        if seen.get("contentmd5", [None])[0] == md5sum:
            return

    author = item.get("author", url)

    # create a basic email message
    msg = MIMEMultipart("alternative")
    messageid = "<" \
        + datetime.datetime.now().strftime("%Y%m%d%H%M") \
        + "." + _random_token(6) \
        + "@" + socket.gethostname() + ">"
    msg.add_header("Message-ID", messageid)
    msg.set_unixfrom("\"%s\" <rss2maildir@localhost>" %(url))
    msg.add_header("From", "\"%s\" <rss2maildir@localhost>" %(author))
    msg.add_header("To", "\"%s\" <rss2maildir@localhost>" %(url))
    if prevmessageid:
        # thread an updated item under the earlier delivery
        msg.add_header("References", prevmessageid)
    # the date is labelled -0000, so the fallback has to be UTC as well
    createddate = datetime.datetime.utcnow() \
        .strftime("%a, %e %b %Y %T -0000")
    try:
        createddate = datetime.datetime(*item["updated_parsed"][0:6]) \
            .strftime("%a, %e %b %Y %T -0000")
    except (KeyError, TypeError, ValueError):
        # no usable date on the item, keep the current time
        pass
    msg.add_header("Date", createddate)
    msg.add_header("Subject", item.get("title", u""))
    msg.set_default_type("text/plain")

    htmlpart = MIMEText(content.encode("utf-8"), "html", "utf-8")
    textparser = HTML2Text()
    textparser.feed(content.encode("utf-8"))
    textcontent = textparser.gettext()
    textcontent = "%s\n\nItem URL: %s" %( \
        textcontent, \
        link or u"" )
    textpart = MIMEText(textcontent.encode("utf-8"), "plain", "utf-8")
    msg.attach(textpart)
    msg.attach(htmlpart)

    # write the message to tmp first and then link it into new, as
    # maildir delivery requires
    fname = str(os.getpid()) \
        + "." + socket.gethostname() \
        + "." + _random_token(10) \
        + "." + datetime.datetime.now().strftime('%s')
    fn = os.path.join(maildir, "tmp", fname)
    fh = open(fn, "w")
    try:
        fh.write(msg.as_string())
    finally:
        fh.close()
    newfn = os.path.join(maildir, "new", fname)
    try:
        os.link(fn, newfn)
    finally:
        # never leave the temporary file behind
        os.unlink(fn)

    # now record the item in the database
    if prevmessageid:
        messageid = prevmessageid + " " + messageid
    record = urllib.urlencode(( \
        ("message-id", messageid), \
        ("created", createddate), \
        ("contentmd5", md5sum) \
        ))
    if guid and guid != link:
        db[guidkey] = record
        linkrecord = record
        if db.has_key(linkkey):
            # keep the created date and checksum already stored for the
            # link and only update its message ids
            previous = cgi.parse_qs(db[linkkey])
            try:
                linkrecord = urllib.urlencode(( \
                    ("message-id", messageid), \
                    ("created", previous["created"][0]), \
                    ("contentmd5", previous["contentmd5"][0]) \
                    ))
            except (KeyError, IndexError):
                linkrecord = record
        db[linkkey] = linkrecord
    else:
        db[linkkey] = record


def parse_and_deliver(maildir, url, statedir):
    """Fetch the feed at url and deliver its new or changed items.

    maildir is the maildir directory (holding tmp and new) to deliver
    into, and statedir holds the "feeds" and "seen" databases.  A feed
    that was fetched before is first checked with a HEAD request and
    skipped when its stored headers still match.  After delivery the
    tracked response headers are stored for the next run.  Both
    databases are closed however the function is left.

    Returns None; a failed fetch is reported on stderr.
    """
    feeddb = dbm.open(os.path.join(statedir, "feeds"), "c")
    try:
        # first check if we know about this feed already
        if feeddb.has_key(url):
            stored = cgi.parse_qs(feeddb[url])
            response = open_url("HEAD", url)
            headheaders = None
            if response is not None:
                headheaders = response.getheaders()
                response.close()
            if not _feed_changed(stored, headheaders):
                return # nothing has changed, nothing to do

        response = open_url("GET", url)
        if response is None:
            sys.stderr.write("Failed to fetch feed: %s\n" %(url))
            return
        headers = response.getheaders()

        fp = feedparser.parse(response)
        db = dbm.open(os.path.join(statedir, "seen"), "c")
        try:
            for item in fp["items"]:
                _deliver_item(maildir, url, db, item)
        finally:
            db.close()

        if headers:
            data = []
            for header in headers:
                if header[0] in _FEED_VALIDATORS:
                    data.append((header[0], header[1]))
            if len(data) > 0:
                feeddb[url] = urllib.urlencode(data)
    finally:
        feeddb.close()
461
def _ensure_directory(path, notdir_message, create_message):
    """Make sure path is a directory, creating it when it is missing.

    Writes notdir_message to stderr and exits with status 1 when path
    exists but is not a directory; writes create_message and exits with
    status 1 when the directory cannot be created.
    """
    if os.path.isdir(path):
        return
    if os.path.exists(path):
        sys.stderr.write(notdir_message)
        sys.exit(1)
    try:
        os.mkdir(path)
    except OSError:
        sys.stderr.write(create_message)
        sys.exit(1)

# first off, parse the command line arguments

oparser = OptionParser()
oparser.add_option(
    "-c", "--conf", dest="conf",
    help="location of config file"
    )
oparser.add_option(
    "-s", "--statedir", dest="statedir",
    help="location of directory to store state in"
    )

(options, args) = oparser.parse_args()

# check for the configfile

configfile = None

if options.conf is not None:
    # a config file named on the command line has to exist
    if not os.path.exists(options.conf):
        sys.stderr.write( \
            "Config file %s does not exist. Exiting.\n" %(options.conf,))
        sys.exit(2)
    configfile = options.conf
else:
    # check through the default locations
    candidates = []
    if os.environ.get("HOME") is not None:
        candidates.append("%s/.rss2maildir.conf" %(os.environ["HOME"],))
    candidates.append("/etc/rss2maildir.conf")
    for candidate in candidates:
        if os.path.exists(candidate):
            configfile = candidate
            break
    if configfile is None:
        sys.stderr.write("No config file found. Exiting.\n")
        sys.exit(2)

# Right - if we've got this far, we've got a config file, now for the hard
# bits...

scp = SafeConfigParser()
scp.read(configfile)

maildir_root = "RSSMaildir"
state_dir = "state"

# the command line wins over the config file, which wins over the default
if options.statedir is not None:
    state_dir = options.statedir
    _ensure_directory( \
        state_dir, \
        "State directory (%s) is not a directory\n" %(state_dir), \
        "Couldn't create statedir %s\n" %(state_dir))
elif scp.has_option("general", "state_dir"):
    state_dir = scp.get("general", "state_dir")
    _ensure_directory( \
        state_dir, \
        "State directory (%s) is not a directory\n" %(state_dir), \
        "Couldn't create state directory %s\n" %(state_dir))
else:
    _ensure_directory( \
        state_dir, \
        "State directory %s is not a directory\n" %(state_dir), \
        "State directory %s could not be created\n" %(state_dir))

if scp.has_option("general", "maildir_root"):
    maildir_root = scp.get("general", "maildir_root")

_ensure_directory( \
    maildir_root, \
    "Maildir Root %s is not a directory\n" %(maildir_root), \
    "Couldn't create Maildir Root %s\n" %(maildir_root))

# every section other than "general" describes one feed, and the section
# name is the feed's URL
feeds = [name for name in scp.sections() if name != "general"]

for section in feeds:
    # the maildir is named after the section unless the section says
    # otherwise
    try:
        maildir = scp.get(section, "maildir")
    except Exception:
        maildir = section

    # quote the name so that it is usable as a single directory name
    maildir = urllib.quote_plus(maildir)
    maildir = os.path.join(maildir_root, maildir)

    broken = False
    if os.path.isdir(maildir):
        # check that there's a cur, tmp and new directory
        for subdir in ("cur", "tmp", "new"):
            subpath = os.path.join(maildir, subdir)
            if os.path.isdir(subpath):
                continue
            if os.path.exists(subpath):
                sys.stderr.write("Broken maildir: %s\n" %(maildir))
                broken = True
                continue
            try:
                os.mkdir(subpath)
            except OSError:
                sys.stderr.write( \
                    "Couldn't create required maildir directories for %s\n" \
                    %(section,))
                sys.exit(1)
    elif os.path.exists(maildir):
        sys.stderr.write("Broken maildir: %s\n" %(maildir))
        broken = True
    else:
        try:
            os.mkdir(maildir)
        except OSError:
            sys.stderr.write("Couldn't create root maildir %s\n" %(maildir))
            sys.exit(1)
        try:
            os.mkdir(os.path.join(maildir, "new"))
            os.mkdir(os.path.join(maildir, "cur"))
            os.mkdir(os.path.join(maildir, "tmp"))
        except OSError:
            sys.stderr.write( \
                "Couldn't create required maildir directories for %s\n" \
                %(section,))
            sys.exit(1)

    if broken:
        # delivering into a broken maildir would only fail later on, so
        # skip this feed and carry on with the others
        continue

    # right - we've got the directories, we've got the section, we know the
    # url... lets play!

    parse_and_deliver(maildir, section, state_dir)