* Small improvements to the HTML2Text code
[rss2maildir.git] / rss2maildir.py
1 #!/usr/bin/python
2 # coding=utf-8
3
4 # rss2maildir.py - RSS feeds to Maildir 1 email per item
5 # Copyright (C) 2007  Brett Parker <iDunno@sommitrealweird.co.uk>
6
7 # This program is free software: you can redistribute it and/or modify
8 # it under the terms of the GNU General Public License as published by
9 # the Free Software Foundation, either version 3 of the License, or
10 # (at your option) any later version.
11
12 # This program is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 # GNU General Public License for more details.
16
17 # You should have received a copy of the GNU General Public License
18 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
19
20 import sys
21 import os
22 import stat
23 import httplib
24 import urllib
25
26 import feedparser
27
28 from email.MIMEMultipart import MIMEMultipart
29 from email.MIMEText import MIMEText
30
31 import datetime
32 import random
33 import string
34 import textwrap
35
36 import socket
37
38 from optparse import OptionParser
39 from ConfigParser import SafeConfigParser
40
41 from base64 import b64encode
42 import md5
43
44 import cgi
45 import dbm
46
47 from HTMLParser import HTMLParser
48
class HTML2Text(HTMLParser):
    """Render an HTML fragment as wrapped plain text.

    Character data is accumulated in ``curdata`` and only rendered
    ("flushed") into ``text`` when something happens that ends the current
    run of text: a new tag opens, a line break is seen, an enclosing tag is
    closed, or gettext() is called.  How the data is rendered depends on the
    innermost tag on the ``opentags`` stack at the time of the flush.

    All text is kept as unicode internally; byte strings handed to the
    parser are decoded as UTF-8.
    """

    # named entities we know how to replace; anything else is passed through
    # literally as "&name;"
    entities = {
        "amp": "&",
        "lt": "<",
        "gt": ">",
        "pound": "£",
        "copy": "©",
        "apos": "'",
        "quot": "\"",
        "nbsp": " ",
        }

    # tags that start a new block of text (separated by a blank line)
    blockleveltags = [
        "h1",
        "h2",
        "h3",
        "h4",
        "h5",
        "h6",
        "pre",
        "p",
        "ul",
        "ol",
        "dl",
        "br",
        ]

    # tags that open a list
    liststarttags = [
        "ul",
        "ol",
        "dl",
        ]

    # tags that may contain other block level tags
    cancontainflow = [
        "div",
        "li",
        "dd",
        "blockquote",
    ]

    def __init__(self,textwidth=70):
        """Create a converter that wraps its output at textwidth columns."""
        self.text = u''
        self.curdata = u''
        self.textwidth = textwidth
        self.opentags = []
        self.indentlevel = 0
        HTMLParser.__init__(self)

    def _to_unicode(self, value):
        """Return value as unicode, decoding byte strings as UTF-8."""
        if isinstance(value, unicode):
            return value
        return value.decode("utf-8")

    def _handle_linebreak(self):
        """Flush the text gathered so far because of a <br>.

        <br> is a void element, so it is never pushed onto the tag stack;
        the text before it is simply rendered using the enclosing tag.
        """
        if len(self.opentags) == 0 and self.curdata.strip() != u'':
            # text outside of any tag - treat it as an implicit paragraph,
            # otherwise handle_curdata() would have nothing to render it as
            self.opentags.append(u'p')
        self.handle_curdata()

    def handle_starttag(self, tag, attrs):
        """Flush pending text and push the newly opened tag on the stack."""
        tag_name = tag.lower()
        if tag_name == u'br':
            self._handle_linebreak()
            return

        if tag_name in self.blockleveltags:
            # handle starting a new block - unless we're in a block element
            # that can contain other blocks, we'll assume that we want to close
            # the container
            if len(self.opentags) > 0:
                self.handle_curdata()
                self.opentags.pop()
            self.opentags.append(tag_name)
        else:
            self.handle_curdata()
            self.opentags.append(tag_name)

    def handle_startendtag(self, tag, attrs):
        """Handle self closing tags; only <br/> has any effect."""
        if tag.lower() == u'br':
            # just handle the data, don't do anything else
            self._handle_linebreak()

    def handle_curdata(self):
        """Render the pending character data according to the innermost tag.

        Does nothing when there is no open tag or no pending data.  The
        pending data is always consumed by a successful call.
        """
        if len(self.opentags) == 0:
            return

        if len(self.curdata) == 0:
            return

        tag_thats_done = self.opentags[-1]

        if tag_thats_done in self.blockleveltags:
            # blocks are separated from whatever came before by a blank line
            if self.text != u'':
                self.text = self.text + u'\n\n'

        if tag_thats_done in (u'h1', u'h2', u'h3', u'h4', u'h5', u'h6'):
            headingtext = u'\n'.join( \
                textwrap.wrap(self.curdata.strip(), self.textwidth))

            if tag_thats_done == u'h1':
                underlinechar = u'='
            elif tag_thats_done == u'h2':
                underlinechar = u'-'
            else:
                underlinechar = u'~'

            # a wrapped heading gets a full width underline
            if u'\n' in headingtext:
                underline = underlinechar * self.textwidth
            else:
                underline = underlinechar * len(headingtext)
            self.text = self.text \
                + headingtext + u'\n' \
                + underline
        elif tag_thats_done == u'p':
            paragraph = self.curdata.strip()
            self.text = self.text \
                + u'\n'.join(textwrap.wrap(paragraph, self.textwidth))
        elif tag_thats_done == u'pre':
            # preformatted text is copied verbatim
            self.text = self.text + self.curdata
        elif tag_thats_done == u'blockquote':
            quote = self.curdata.strip()
            self.text = self.text \
                + u'> ' \
                + u'\n> '.join(textwrap.wrap(quote, self.textwidth - 2))
        elif tag_thats_done == u'li':
            item = self.curdata.strip()
            if len(self.text) > 0 and self.text[-1] != u'\n':
                self.text = self.text + u'\n'
            self.text = self.text \
                + u' * ' \
                + u'\n   '.join( \
                    textwrap.wrap(item, self.textwidth - 3))
        elif tag_thats_done == u'dt':
            definition = self.curdata.strip()
            # make sure there is a blank line before the term
            if len(self.text) > 0 and self.text[-1] != u'\n':
                self.text = self.text + u'\n\n'
            elif len(self.text) > 0 and not self.text.endswith(u'\n\n'):
                self.text = self.text + u'\n'
            definition = definition + u'::'
            self.text = self.text \
                + u'\n '.join(
                    textwrap.wrap(definition, self.textwidth - 1))
        elif tag_thats_done == u'dd':
            definition = self.curdata.strip()
            if len(self.text) > 0 and self.text[-1] != u'\n':
                self.text = self.text + u'\n'
            self.text = self.text \
                + u'    ' \
                + u'\n    '.join( \
                    textwrap.wrap(definition, self.textwidth - 4))
        elif tag_thats_done in self.liststarttags:
            # text directly inside a list container is dropped
            pass
        else:
            # we've got no idea what this tag does, so we'll
            # make an assumption that we're not going to know later
            self.text = self.text \
                + u' ... ' \
                + u'\n ... '.join( \
                    textwrap.wrap(self.curdata, self.textwidth - 5))

        # the data has been dealt with, whichever branch was taken
        self.curdata = u''

    def handle_endtag(self, tag):
        """Close a tag.

        Closing the innermost open tag does nothing here: its text is
        rendered lazily by the next flush.  Closing any other tag flushes the
        pending text and drops that tag and everything opened after it.
        """
        tag_name = tag.lower()
        if tag_name == u'br':
            # <br> never goes on the stack, so there is nothing to close
            return

        # find the most recent opening of this tag; a closing tag we know
        # nothing about is treated as if it closed the outermost tag
        tagindex = 0
        for (position, opentag) in enumerate(self.opentags):
            if opentag == tag_name:
                tagindex = position

        if tagindex != len(self.opentags) - 1:
            # Assuming the data was for the last opened tag first
            self.handle_curdata()
            # Now kill the list to be a slice before this tag was opened
            self.opentags = self.opentags[:tagindex]

    def handle_data(self, data):
        """Collect character data until the next flush."""
        self.curdata = self.curdata + self._to_unicode(data)

    def handle_entityref(self, name):
        """Append the replacement for a named entity such as &amp;."""
        if name.startswith("#"):
            # the parser normally reports these through handle_charref()
            self.handle_charref(name[1:])
            return

        key = name.lower()
        if key in HTML2Text.entities:
            entity = HTML2Text.entities[key]
        else:
            # unknown entity, keep it as it was written
            entity = "&" + name + ";"

        self.curdata = self.curdata + self._to_unicode(entity)

    def handle_charref(self, name):
        """Append the character for a numeric reference (&#163; / &#xa3;)."""
        try:
            if name[:1] in ("x", "X"):
                character = unichr(int(name[1:], 16))
            else:
                character = unichr(int(name))
        except (ValueError, OverflowError):
            # not a usable code point, keep the reference as it was written
            character = u'&#' + self._to_unicode(name) + u';'
        self.curdata = self.curdata + character

    def gettext(self):
        """Flush any pending data and return the text collected so far.

        The result always ends with exactly one newline (an empty document
        yields two, as it always has).  The tag stack is reset.
        """
        if len(self.opentags) == 0 and self.curdata.strip() != u'':
            # plain text that was never inside a tag would otherwise be lost
            self.opentags.append(u'p')
        self.handle_curdata()
        self.opentags = []
        self.text = (self.text.rstrip(u'\n') or u'\n') + u'\n'
        return self.text
255
def open_url(method, url):
    """Issue an HTTP request for url, following redirects.

    method is the HTTP method to use ("GET", "HEAD", ...).  At most three
    attempts are made in total; a redirect (301, 302, 303, 307) uses up an
    attempt and switches to the address given in its Location header, any
    other failure retries the same address.

    Returns the httplib response object of the first attempt answered with
    status 200, or None if there was none.  Connections are always made with
    plain HTTP, defaulting to port 80.
    """
    # only needed here, so keep the import local to the function
    from urlparse import urljoin

    redirectcount = 0
    while redirectcount < 3:
        rest = urllib.splittype(url)[1]
        (host, path) = urllib.splithost(rest)
        if not host:
            # nothing we could connect to, retrying won't change that
            return None
        (host, port) = urllib.splitport(host)
        if port is None:
            port = 80
        if not path:
            # "http://example.org" - an empty request path is not valid
            path = "/"

        conn = None
        try:
            conn = httplib.HTTPConnection("%s:%s" %(host, port))
            conn.request(method, path)
            response = conn.getresponse()
            if response.status == 200:
                # the caller reads from the response, so the connection has
                # to stay open
                return response
            if response.status in [301, 302, 303, 307]:
                location = response.getheader("location")
                if location:
                    # Location may be relative to the URL we asked for
                    url = urljoin(url, location)
        except (httplib.HTTPException, socket.error):
            # network level failure: best effort, just try again
            pass

        if conn is not None:
            conn.close()
        redirectcount = redirectcount + 1
    return None
279
def _random_token(length):
    """Return length random ASCII letters and digits."""
    alphabet = string.ascii_letters + string.digits
    return "".join([random.choice(alphabet) for unused in range(0, length)])

def _feed_has_changed(headers, known):
    """Compare HEAD response headers with the values stored for a feed.

    headers is the list of (name, value) pairs of the HEAD response, or None
    if the request failed.  known is the parsed query string stored in the
    feeds database.  Returns True if the feed should be fetched again.
    """
    if headers is None:
        return True
    for (name, value) in headers:
        if name in ("content-length", "etag", "last-modified", "content-md5"):
            stored = known.get(name)
            # a header we have no stored value for counts as a change
            if not stored or stored[0] != value:
                return True
    return False

def _deliver_item(maildir, url, item, db):
    """Write one feed item to the maildir unless it was delivered before.

    db is the open "seen" database; it is updated with the message id,
    creation date and content checksum of the delivered message.
    """
    # need to work out what the content is first...
    if item.has_key("content"):
        content = item["content"][0]["value"]
    else:
        content = item["summary"]

    md5sum = md5.md5(content.encode("utf-8")).hexdigest()

    prevmessageid = None

    # check if there's a guid too - if that exists and we match the md5,
    # there is nothing to do
    if item.has_key("guid"):
        guidkey = url + "|" + item["guid"]
        if db.has_key(guidkey):
            data = cgi.parse_qs(db[guidkey])
            if data["contentmd5"][0] == md5sum:
                return

    linkkey = url + "|" + item["link"]
    if db.has_key(linkkey):
        data = cgi.parse_qs(db[linkkey])
        if data.has_key("message-id"):
            prevmessageid = data["message-id"][0]
        if data["contentmd5"][0] == md5sum:
            return

    try:
        author = item["author"]
    except KeyError:
        author = url

    # create a basic email message
    msg = MIMEMultipart("alternative")
    messageid = "<" \
        + datetime.datetime.now().strftime("%Y%m%d%H%M") \
        + "." + _random_token(6) \
        + "@" + socket.gethostname() + ">"
    msg.add_header("Message-ID", messageid)
    msg.set_unixfrom("\"%s\" <rss2maildir@localhost>" %(url))
    msg.add_header("From", "\"%s\" <rss2maildir@localhost>" %(author))
    msg.add_header("To", "\"%s\" <rss2maildir@localhost>" %(url))
    if prevmessageid:
        msg.add_header("References", prevmessageid)
    # the date is labelled -0000, so it has to be UTC rather than local time
    createddate = datetime.datetime.utcnow() \
        .strftime("%a, %e %b %Y %T -0000")
    try:
        createddate = datetime.datetime(*item["updated_parsed"][0:6]) \
            .strftime("%a, %e %b %Y %T -0000")
    except (KeyError, TypeError, ValueError):
        # no usable date in the item, stick with the current time
        pass
    msg.add_header("Date", createddate)
    msg.add_header("Subject", item["title"])
    msg.set_default_type("text/plain")

    htmlcontent = "%s\n\n<p>Item URL: <a href='%s'>%s</a></p>" %( \
        content, \
        item["link"], \
        item["link"] )
    htmlpart = MIMEText(htmlcontent.encode("utf-8"), "html", "utf-8")
    textparser = HTML2Text()
    textparser.feed(content.encode("utf-8"))
    textcontent = textparser.gettext()
    textcontent = "%s\n\nItem URL: %s" %( \
        textcontent, \
        item["link"] )
    textpart = MIMEText(textcontent.encode("utf-8"), "plain", "utf-8")
    msg.attach(textpart)
    msg.attach(htmlpart)

    # start by working out the filename we should be writing to, we do
    # this following the normal maildir style rules
    fname = str(os.getpid()) \
        + "." + socket.gethostname() \
        + "." + _random_token(10) \
        + "." + datetime.datetime.now().strftime('%s')
    fn = os.path.join(maildir, "tmp", fname)
    fh = open(fn, "w")
    try:
        fh.write(msg.as_string())
    finally:
        fh.close()
    # now move it in to the new directory
    newfn = os.path.join(maildir, "new", fname)
    os.link(fn, newfn)
    os.unlink(fn)

    # now add to the database about the item
    if prevmessageid:
        messageid = prevmessageid + " " + messageid
    record = urllib.urlencode(( \
        ("message-id", messageid), \
        ("created", createddate), \
        ("contentmd5", md5sum) \
        ))
    if item.has_key("guid") and item["guid"] != item["link"]:
        db[url + "|" + item["guid"]] = record
        # the record stored under the link keeps its own creation date and
        # checksum and only picks up the new message id
        try:
            previous = cgi.parse_qs(db[linkkey])
            linkrecord = urllib.urlencode(( \
                ("message-id", messageid), \
                ("created", previous["created"][0]), \
                ("contentmd5", previous["contentmd5"][0]) \
                ))
        except KeyError:
            # nothing (usable) stored for the link yet
            linkrecord = record
        db[linkkey] = linkrecord
    else:
        db[linkkey] = record

def parse_and_deliver(maildir, url, statedir):
    """Fetch the feed at url and deliver its unseen items to maildir.

    maildir is the maildir to deliver to; messages are written to its "tmp"
    directory and then linked into "new".  statedir is the directory holding
    the "feeds" database (the HTTP headers last seen for each feed) and the
    "seen" database (the items already delivered).

    A feed we already know about is first checked with a HEAD request and
    only fetched when the headers differ from the stored ones.  Failure to
    fetch the feed is reported on stderr.
    """
    feeddb = dbm.open(os.path.join(statedir, "feeds"), "c")
    try:
        # first check if we know about this feed already
        if feeddb.has_key(url):
            known = cgi.parse_qs(feeddb[url])
            response = open_url("HEAD", url)
            headheaders = None
            if response is not None:
                headheaders = response.getheaders()
            if not _feed_has_changed(headheaders, known):
                return # don't need to do anything, nothings changed.

        feedhandle = open_url("GET", url)
        if feedhandle is None:
            sys.stderr.write("Failed to fetch feed: %s\n" %(url))
            return
        headers = feedhandle.getheaders()

        fp = feedparser.parse(feedhandle)
        db = dbm.open(os.path.join(statedir, "seen"), "c")
        try:
            for item in fp["items"]:
                _deliver_item(maildir, url, item, db)
        finally:
            db.close()

        # remember the headers so the next run can tell if the feed changed
        if headers:
            data = []
            for header in headers:
                if header[0] in ["content-md5", "etag", "last-modified", "content-length"]:
                    data.append((header[0], header[1]))
            if len(data) > 0:
                feeddb[url] = urllib.urlencode(data)
    finally:
        feeddb.close()
467
def _ensure_directory(path, notdirmessage, createmessage):
    """Make sure path is a directory, creating it if it does not exist.

    notdirmessage and createmessage are the error messages (each with a
    single %s for the path) written to stderr before exiting with status 1
    when path exists but is not a directory, or cannot be created.
    """
    try:
        mode = os.stat(path)[stat.ST_MODE]
    except OSError:
        # doesn't exist (or can't be looked at) - try to make the directory
        try:
            os.mkdir(path)
        except OSError:
            sys.stderr.write(createmessage %(path))
            sys.exit(1)
        return

    if not stat.S_ISDIR(mode):
        sys.stderr.write(notdirmessage %(path))
        sys.exit(1)

def _ensure_maildir(maildir, section):
    """Make sure maildir exists and has its new, cur and tmp directories.

    Things that exist but are not directories are reported on stderr as a
    broken maildir; failure to create a directory exits with status 1.
    section is the config section the maildir belongs to, used in messages.
    """
    try:
        mode = os.stat(maildir)[stat.ST_MODE]
    except OSError:
        # nothing there yet, create the whole maildir
        try:
            os.mkdir(maildir)
        except OSError:
            sys.stderr.write("Couldn't create root maildir %s\n" \
                %(maildir))
            sys.exit(1)
        try:
            os.mkdir(os.path.join(maildir, "new"))
            os.mkdir(os.path.join(maildir, "cur"))
            os.mkdir(os.path.join(maildir, "tmp"))
        except OSError:
            sys.stderr.write( \
                "Couldn't create required maildir directories for %s\n" \
                %(section,))
            sys.exit(1)
        return

    if not stat.S_ISDIR(mode):
        sys.stderr.write("Broken maildir: %s\n" %(maildir))
        return

    # check if there's a new, cur and tmp directory
    for subdir in ("cur", "tmp", "new"):
        subdirpath = os.path.join(maildir, subdir)
        try:
            submode = os.stat(subdirpath)[stat.ST_MODE]
        except OSError:
            try:
                os.mkdir(subdirpath)
            except OSError:
                sys.stderr.write( \
                    "Couldn't create required maildir directories for %s\n" \
                    %(section,))
                sys.exit(1)
        else:
            if not stat.S_ISDIR(submode):
                sys.stderr.write("Broken maildir: %s\n" %(maildir))

if __name__ == "__main__":
    # This only gets executed if we really called the program
    # first off, parse the command line arguments

    oparser = OptionParser()
    oparser.add_option(
        "-c", "--conf", dest="conf",
        help="location of config file"
        )
    oparser.add_option(
        "-s", "--statedir", dest="statedir",
        help="location of directory to store state in"
        )

    (options, args) = oparser.parse_args()

    # check for the configfile

    configfile = None

    if options.conf is not None:
        # does the file exist?
        if not os.path.exists(options.conf):
            # exit here as the specified file doesn't exist
            sys.stderr.write( \
                "Config file %s does not exist. Exiting.\n" %(options.conf,))
            sys.exit(2)
        configfile = options.conf
    else:
        # check through the default locations
        candidates = []
        if "HOME" in os.environ:
            candidates.append( \
                "%s/.rss2maildir.conf" %(os.environ["HOME"],))
        candidates.append("/etc/rss2maildir.conf")

        for candidate in candidates:
            if os.path.exists(candidate):
                configfile = candidate
                break

        if configfile is None:
            sys.stderr.write("No config file found. Exiting.\n")
            sys.exit(2)

    # Right - if we've got this far, we've got a config file, now for the hard
    # bits...

    scp = SafeConfigParser()
    scp.read(configfile)

    maildir_root = "RSSMaildir"
    state_dir = "state"

    # the command line wins over the config file, which wins over the default
    if options.statedir is not None:
        state_dir = options.statedir
        _ensure_directory(state_dir, \
            "State directory (%s) is not a directory\n", \
            "Couldn't create statedir %s\n")
    elif scp.has_option("general", "state_dir"):
        state_dir = scp.get("general", "state_dir")
        _ensure_directory(state_dir, \
            "State directory (%s) is not a directory\n", \
            "Couldn't create state directory %s\n")
    else:
        _ensure_directory(state_dir, \
            "State directory %s is not a directory\n", \
            "State directory %s could not be created\n")

    if scp.has_option("general", "maildir_root"):
        maildir_root = scp.get("general", "maildir_root")

    _ensure_directory(maildir_root, \
        "Maildir Root %s is not a directory\n", \
        "Couldn't create Maildir Root %s\n")

    # every section apart from "general" describes a feed, the section name
    # being the url of the feed
    feeds = scp.sections()
    try:
        feeds.remove("general")
    except ValueError:
        pass

    for section in feeds:
        # work out which maildir this feed gets delivered to
        if scp.has_option(section, "maildir"):
            maildir = scp.get(section, "maildir")
        else:
            maildir = section

        # url-encode the name so that it is safe to use as a directory name
        maildir = urllib.urlencode(((section, maildir),)).split("=")[1]
        maildir = os.path.join(maildir_root, maildir)

        _ensure_maildir(maildir, section)

        # right - we've got the directories, we've got the section, we know the
        # url... lets play!

        parse_and_deliver(maildir, section, state_dir)