Only download feeds that have changed (or that don't give us enough data to
[rss2maildir.git] / rss2maildir.py
1 #!/usr/bin/python
2 # coding=utf-8
3
4 # rss2maildir.py - RSS feeds to Maildir 1 email per item
5 # Copyright (C) 2007  Brett Parker <iDunno@sommitrealweird.co.uk>
6
7 # This program is free software: you can redistribute it and/or modify
8 # it under the terms of the GNU General Public License as published by
9 # the Free Software Foundation, either version 3 of the License, or
10 # (at your option) any later version.
11
12 # This program is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 # GNU General Public License for more details.
16
17 # You should have received a copy of the GNU General Public License
18 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
19
20 import sys
21 import os
22 import stat
23 import httplib
24 import urllib
25
26 import feedparser
27
28 from email.MIMEMultipart import MIMEMultipart
29 from email.MIMEText import MIMEText
30
31 import datetime
32 import random
33 import string
34 import textwrap
35
36 import socket
37
38 from optparse import OptionParser
39 from ConfigParser import SafeConfigParser
40
41 from base64 import b64encode
42 import md5
43
44 import cgi
45 import dbm
46
47 from HTMLParser import HTMLParser
48
# Named HTML entities that HTML2Text expands itself, keyed by lower-cased
# entity name (without the leading "&" and trailing ";").
entities = {
    "amp": "&",
    "lt": "<",
    "gt": ">",
    "pound": "£",
    "copy": "©",
    "apos": "'",
    # The entity defined by HTML is &quot;
    "quot": "\"",
    # "quote" is not a real HTML entity; kept so existing behaviour for
    # input that (incorrectly) uses &quote; does not change.
    "quote": "\"",
    "nbsp": " ",
    }
59
class HTML2Text(HTMLParser):
    """Convert an HTML fragment into wrapped plain text.

    Feed markup with feed() and collect the result with gettext().
    Headings are underlined (h1 with "=", h2 with "-", h3-h6 with "~"),
    paragraphs are wrapped at 70 columns, blockquotes are prefixed with
    "> " and wrapped at 68 columns, and <ul> items become " * " bullets
    wrapped at 67 columns.  All internal buffers are unicode strings;
    byte strings handed to the handlers are decoded as UTF-8.
    """

    def __init__(self):
        self.inheadingone = False
        self.inheadingtwo = False
        self.inotherheading = False
        # text seen before any block level tag is treated as a paragraph
        self.inparagraph = True
        self.inblockquote = False
        self.inlink = False
        self.text = u''
        self.currentparagraph = u''
        self.headingtext = u''
        self.blockquote = u''
        self.inpre = False
        self.inul = False
        self.initem = False
        self.item = u''
        HTMLParser.__init__(self)

    @staticmethod
    def _to_unicode(data):
        """Return data as unicode, decoding byte strings as UTF-8."""
        if isinstance(data, type(u'')):
            return data
        return data.decode("utf-8")

    @staticmethod
    def _unichr(codepoint):
        """Return the unicode character for an integer code point."""
        try:
            return unichr(codepoint)
        except NameError:
            # interpreters without unichr() provide the same thing as chr()
            return chr(codepoint)

    def _active_buffer(self):
        """Return (attribute name, flowed) for the buffer receiving text.

        "flowed" buffers get each chunk stripped and followed by a single
        space; the others (list items and <pre>) take the text verbatim.
        The priority order is the one handle_data has always used.
        """
        if self.inheadingone or self.inheadingtwo or self.inotherheading:
            return ("headingtext", True)
        if self.inblockquote:
            return ("blockquote", True)
        if self.inparagraph:
            return ("currentparagraph", True)
        if self.inul and self.initem:
            return ("item", False)
        if self.inpre:
            return ("text", False)
        return ("text", True)

    def _append(self, chunk):
        """Append an already prepared unicode chunk to the active buffer."""
        name = self._active_buffer()[0]
        setattr(self, name, getattr(self, name) + chunk)

    def _flush_paragraph(self):
        """Move the wrapped pending paragraph text into the output."""
        self.text = self.text \
            + u'\n'.join(textwrap.wrap(self.currentparagraph, 70))
        self.currentparagraph = u''

    def _flush_blockquote(self):
        """Move the pending blockquote text, "> " prefixed, into the output."""
        self.text = self.text \
            + u'\n> ' \
            + u'\n> '.join( \
                [a.strip() for a in textwrap.wrap(self.blockquote, 68)]) \
            + u'\n'
        self.blockquote = u''

    def _flush_item(self):
        """Move the pending list item, as a " * " bullet, into the output."""
        self.text = self.text \
            + u' * ' \
            + u'\n   '.join( \
                [a.strip() for a in textwrap.wrap(self.item, 67)]) \
            + u'\n'
        self.item = u''

    def _end_heading(self, underline):
        """Emit the collected heading followed by a line of underline."""
        # headingtext is unicode already; encoding it here (as the code
        # used to) broke on non-ASCII headings and counted bytes instead
        # of characters for the underline.
        heading = self.headingtext.strip()
        self.text = self.text \
            + u'\n\n' \
            + heading \
            + u'\n' \
            + underline * len(heading)
        self.headingtext = u''

    def handle_starttag(self, tag, attrs):
        tag = tag.lower()
        if tag == "h1":
            self.inheadingone = True
            self.inparagraph = False
        elif tag == "h2":
            self.inheadingtwo = True
            self.inparagraph = False
        elif tag in ["h3", "h4", "h5", "h6"]:
            self.inotherheading = True
            self.inparagraph = False
        elif tag == "a":
            self.inlink = True
        elif tag == "br":
            self.handle_br()
        elif tag == "blockquote":
            self.inblockquote = True
            self.text = self.text + u'\n'
        elif tag == "p":
            # flush whatever an implicit or unclosed paragraph collected
            # *before* writing the separator, otherwise the old text ends
            # up glued to the new paragraph
            if self.inparagraph:
                self._flush_paragraph()
            if self.text != u'':
                self.text = self.text + u'\n\n'
            self.currentparagraph = u''
            self.inparagraph = True
        elif tag == "pre":
            self.text = self.text + u'\n'
            self.inpre = True
            self.inparagraph = False
            self.inblockquote = False
        elif tag == "ul":
            self.item = u''
            self.inul = True
            self.text = self.text + u'\n'
        elif tag == "li" and self.inul:
            if not self.initem:
                self.initem = True
                self.item = u''
            else:
                # previous <li> was never closed, emit it now
                self._flush_item()

    def handle_startendtag(self, tag, attrs):
        if tag.lower() == "br":
            self.handle_br()

    def handle_br(self):
        """Break the line in whichever buffer is currently collecting text."""
        # blockquote is checked first because handle_data sends text to
        # the blockquote buffer whenever inblockquote is set
        if self.inblockquote:
            self._flush_blockquote()
        elif self.inparagraph:
            self._flush_paragraph()
            self.text = self.text + u'\n'
        else:
            self.text = self.text + u'\n'

    def handle_endtag(self, tag):
        tag = tag.lower()
        if tag == "h1":
            self.inheadingone = False
            self._end_heading(u'=')
        elif tag == "h2":
            self.inheadingtwo = False
            self._end_heading(u'-')
        elif tag in ["h3", "h4", "h5", "h6"]:
            self.inotherheading = False
            self._end_heading(u'~')
        elif tag == "p":
            self._flush_paragraph()
            self.inparagraph = False
        elif tag == "blockquote":
            self._flush_blockquote()
            self.inblockquote = False
        elif tag == "pre":
            self.inpre = False
        elif tag == "li":
            self.initem = False
            if self.item != u'':
                self._flush_item()
            self.item = u''
        elif tag == "ul":
            self.inul = False

    def handle_data(self, data):
        data = self._to_unicode(data)
        if self._active_buffer()[1]:
            data = data.strip() + u' '
        self._append(data)

    def handle_charref(self, name):
        """Expand a numeric reference such as &#163; or &#xA3;."""
        try:
            if name[:1] in (u'x', u'X'):
                codepoint = int(name[1:], 16)
            else:
                codepoint = int(name)
            char = self._unichr(codepoint)
        except (ValueError, OverflowError):
            # not a usable code point, keep the reference as written
            char = u'&#' + self._to_unicode(name) + u';'
        self._append(char)

    def handle_entityref(self, name):
        """Expand a named reference such as &amp;.

        The module level entities table is consulted first (lower-cased
        name), then the standard library table; anything unknown is kept
        as written.  Numeric references arrive via handle_charref.
        """
        key = name.lower()
        if key in entities:
            entity = self._to_unicode(entities[key])
        else:
            try:
                from htmlentitydefs import name2codepoint
            except ImportError:
                from html.entities import name2codepoint
            if name in name2codepoint:
                entity = self._unichr(name2codepoint[name])
            else:
                entity = u'&' + self._to_unicode(name) + u';'
        self._append(entity)

    def gettext(self):
        """Return the text produced so far, including any open paragraph."""
        data = self.text
        if self.inparagraph:
            data = data \
                + u'\n'.join(textwrap.wrap(self.currentparagraph, 70))
        return data
254
# response headers that are remembered per feed and compared on the next
# run to decide whether the feed has to be downloaded again
_FEED_VALIDATORS = ("content-length", "etag", "last-modified", "content-md5")


def _random_token(length):
    """Return length random ASCII letters and digits."""
    return "".join( \
        [random.choice(string.ascii_letters + string.digits) \
            for a in range(0, length)])


def _http_request(method, url):
    """Send method for url and return the (connection, response) pair.

    https URLs use an HTTPS connection (default port 443), everything else
    plain HTTP (default port 80).  The caller closes the connection.
    """
    (scheme, rest) = urllib.splittype(url)
    (host, path) = urllib.splithost(rest)
    (host, port) = urllib.splitport(host)
    if scheme == "https":
        conn = httplib.HTTPSConnection(host, int(port or 443))
    else:
        conn = httplib.HTTPConnection(host, int(port or 80))
    # a url without a path ("http://host") still needs a request target
    conn.request(method, path or "/")
    return (conn, conn.getresponse())


def _feed_changed(url, known):
    """Return True if a HEAD request suggests the feed at url changed.

    known is the parsed (cgi.parse_qs) record of validator headers stored
    after the previous download.  The feed counts as changed when any
    validator header differs from, or is missing in, that record -- and
    also when the server sends no validator header at all, because then
    there is not enough data to tell.
    """
    (conn, response) = _http_request("HEAD", url)
    try:
        headers = response.getheaders()
    finally:
        conn.close()
    compared = 0
    for (name, value) in headers:
        if name not in _FEED_VALIDATORS:
            continue
        compared = compared + 1
        if known.get(name, [None])[0] != value:
            return True
    return compared == 0


def _deliver_item(maildir, url, item, db):
    """Write one feed item to maildir unless it was already delivered.

    db is the open "seen" database; it maps "<url>|<item link>" to a url
    encoded record holding the message ids used so far, the creation date
    and the md5 of the content.  An item is skipped when its content md5
    matches the stored one; a changed item is delivered again with a
    References header pointing at the earlier message(s).
    """
    # need to work out what the content is first...
    if "content" in item:
        content = item["content"][0]["value"]
    else:
        content = item.get("summary", u"")

    md5sum = md5.md5(content.encode("utf-8")).hexdigest()

    # items without a link fall back to their id, then to the content hash,
    # so that they do not all share one database key
    itemkey = url + "|" + (item.get("link") or item.get("id") or md5sum)

    prevmessageid = None

    # have we seen it before?
    if db.has_key(itemkey):
        data = cgi.parse_qs(db[itemkey])
        if "message-id" in data:
            prevmessageid = data["message-id"][0]
        if data.get("contentmd5", [None])[0] == md5sum:
            return

    author = item.get("author", url)

    # create a basic email message
    msg = MIMEMultipart("alternative")
    messageid = "<" \
        + datetime.datetime.now().strftime("%Y%m%d%H%M") \
        + "." + _random_token(6) \
        + "@" + socket.gethostname() + ">"
    msg.add_header("Message-ID", messageid)
    msg.set_unixfrom("\"%s\" <rss2maildir@localhost>" %(url))
    msg.add_header("From", "\"%s\" <rss2maildir@localhost>" %(author))
    msg.add_header("To", "\"%s\" <rss2maildir@localhost>" %(url))
    if prevmessageid:
        msg.add_header("References", prevmessageid)
    # not every feed dates its items; use the delivery time in that case
    updated = item.get("updated_parsed")
    if updated:
        created = datetime.datetime(*updated[0:6])
    else:
        created = datetime.datetime.utcnow()
    createddate = created.strftime("%a, %e %b %Y %T -0000")
    msg.add_header("Date", createddate)
    msg.add_header("Subject", item.get("title", ""))
    msg.set_default_type("text/plain")

    htmlpart = MIMEText(content.encode("utf-8"), "html", "utf-8")
    textparser = HTML2Text()
    textparser.feed(content.encode("utf-8"))
    textcontent = textparser.gettext()
    textpart = MIMEText(textcontent.encode("utf-8"), "plain", "utf-8")
    msg.attach(textpart)
    msg.attach(htmlpart)

    # start by working out the filename we should be writing to, we do
    # this following the normal maildir style rules
    fname = str(os.getpid()) \
        + "." + socket.gethostname() \
        + "." + _random_token(10) \
        + "." + datetime.datetime.now().strftime('%s')
    fn = os.path.join(maildir, "tmp", fname)
    fh = open(fn, "w")
    try:
        fh.write(msg.as_string())
    finally:
        fh.close()
    # now move it in to the new directory
    newfn = os.path.join(maildir, "new", fname)
    os.link(fn, newfn)
    os.unlink(fn)

    # now add to the database about the item
    if prevmessageid:
        messageid = prevmessageid + " " + messageid
    db[itemkey] = urllib.urlencode((
        ("message-id", messageid), \
        ("created", createddate), \
        ("contentmd5", md5sum) \
        ))


def parse_and_deliver(maildir, url, statedir):
    """Fetch the feed at url and deliver its new items into maildir.

    maildir  -- maildir directory (with tmp/ and new/) to deliver into
    url      -- address of the feed
    statedir -- directory holding the "feeds" and "seen" dbm databases

    A feed that was downloaded before is first probed with a HEAD request
    and skipped when its validator headers are unchanged.  After a
    successful run the validator headers of the download are stored for
    the next comparison.  Network and file errors are not handled here
    and propagate to the caller; both databases are closed either way.
    """
    feeddb = dbm.open(os.path.join(statedir, "feeds"), "c")
    try:
        # first check if we know about this feed already
        if feeddb.has_key(url):
            if not _feed_changed(url, cgi.parse_qs(feeddb[url])):
                return # don't need to do anything, nothings changed.

        (conn, response) = _http_request("GET", url)
        try:
            headers = response.getheaders()
            fp = feedparser.parse(response)
        finally:
            conn.close()

        db = dbm.open(os.path.join(statedir, "seen"), "c")
        try:
            for item in fp["items"]:
                _deliver_item(maildir, url, item, db)
        finally:
            db.close()

        data = [(name, value) for (name, value) in headers \
            if name in _FEED_VALIDATORS]
        if len(data) > 0:
            feeddb[url] = urllib.urlencode(data)
    finally:
        feeddb.close()
404
def _require_directory(path, notdir_message, create_failed_message):
    """Make sure path is a directory, creating it when it does not exist.

    Writes notdir_message to stderr and exits with status 1 when path
    exists but is not a directory; writes create_failed_message and exits
    with status 1 when it is missing and cannot be created.
    """
    try:
        mode = os.stat(path)[stat.ST_MODE]
    except OSError:
        # does not exist (or cannot be examined) - try to make it
        try:
            os.mkdir(path)
        except OSError:
            sys.stderr.write(create_failed_message)
            sys.exit(1)
        return
    if not stat.S_ISDIR(mode):
        sys.stderr.write(notdir_message)
        sys.exit(1)


def _prepare_maildir(maildir, section):
    """Make sure maildir exists and has its new, cur and tmp directories.

    Returns True when the maildir can be used.  Returns False, after
    writing "Broken maildir" to stderr, when maildir or one of its
    sub-directories exists but is not a directory.  Exits with status 1
    when a missing directory cannot be created.
    """
    try:
        mode = os.stat(maildir)[stat.ST_MODE]
    except OSError:
        try:
            os.mkdir(maildir)
        except OSError:
            sys.stderr.write("Couldn't create root maildir %s\n" %(maildir))
            sys.exit(1)
    else:
        if not stat.S_ISDIR(mode):
            sys.stderr.write("Broken maildir: %s\n" %(maildir))
            return False

    # check if there's a new, cur and tmp directory
    for name in ("new", "cur", "tmp"):
        subdir = os.path.join(maildir, name)
        try:
            mode = os.stat(subdir)[stat.ST_MODE]
        except OSError:
            try:
                os.mkdir(subdir)
            except OSError:
                sys.stderr.write( \
                    "Couldn't create required maildir directories for %s\n" \
                    %(section,))
                sys.exit(1)
        else:
            if not stat.S_ISDIR(mode):
                sys.stderr.write("Broken maildir: %s\n" %(maildir))
                return False
    return True


# first off, parse the command line arguments

oparser = OptionParser()
oparser.add_option(
    "-c", "--conf", dest="conf",
    help="location of config file"
    )
oparser.add_option(
    "-s", "--statedir", dest="statedir",
    help="location of directory to store state in"
    )

(options, args) = oparser.parse_args()

# check for the configfile

configfile = None

if options.conf is not None:
    # does the file exist?
    if os.path.exists(options.conf):
        configfile = options.conf
    else:
        # exit here as the specified file doesn't exist
        sys.stderr.write( \
            "Config file %s does not exist. Exiting.\n" %(options.conf,))
        sys.exit(2)
else:
    # check through the default locations: the user's home directory
    # first (when HOME is set), then the system wide file
    candidates = []
    if "HOME" in os.environ:
        candidates.append("%s/.rss2maildir.conf" %(os.environ["HOME"],))
    candidates.append("/etc/rss2maildir.conf")
    for candidate in candidates:
        if os.path.exists(candidate):
            configfile = candidate
            break
    if configfile is None:
        sys.stderr.write("No config file found. Exiting.\n")
        sys.exit(2)

# Right - if we've got this far, we've got a config file, now for the hard
# bits...

scp = SafeConfigParser()
scp.read(configfile)

maildir_root = "RSSMaildir"
state_dir = "state"

# the state directory comes from the command line, else from the config
# file, else the default above is used
if options.statedir is not None:
    state_dir = options.statedir
    _require_directory(state_dir, \
        "State directory (%s) is not a directory\n" %(state_dir), \
        "Couldn't create statedir %s\n" %(state_dir))
elif scp.has_option("general", "state_dir"):
    new_state_dir = scp.get("general", "state_dir")
    # check (and use) the configured directory, not the default one
    _require_directory(new_state_dir, \
        "State directory (%s) is not a directory\n" %(new_state_dir), \
        "Couldn't create state directory %s\n" %(new_state_dir))
    state_dir = new_state_dir
else:
    _require_directory(state_dir, \
        "State directory %s is not a directory\n" %(state_dir), \
        "State directory %s could not be created\n" %(state_dir))

if scp.has_option("general", "maildir_root"):
    maildir_root = scp.get("general", "maildir_root")

_require_directory(maildir_root, \
    "Maildir Root %s is not a directory\n" %(maildir_root), \
    "Couldn't create Maildir Root %s\n" %(maildir_root))

# every section apart from "general" describes one feed; the section name
# is the url of the feed
feeds = scp.sections()
try:
    feeds.remove("general")
except ValueError:
    pass

for section in feeds:
    # the maildir defaults to the section name when the section does not
    # give a usable "maildir" option
    maildir = None
    try:
        maildir = scp.get(section, "maildir")
    except Exception:
        maildir = section

    # url-encode the name so that it is a single safe path component
    maildir = urllib.urlencode(((section, maildir),)).split("=")[1]
    maildir = os.path.join(maildir_root, maildir)

    # check if the directory exists; a broken maildir has been reported
    # already and can not be delivered to, so move on to the next feed
    if not _prepare_maildir(maildir, section):
        continue

    # right - we've got the directories, we've got the section, we know the
    # url... lets play!

    parse_and_deliver(maildir, section, state_dir)