* serious reworking of the HTML2Text parser
[rss2maildir.git] / rss2maildir.py
1 #!/usr/bin/python
2 # coding=utf-8
3
4 # rss2maildir.py - RSS feeds to Maildir 1 email per item
5 # Copyright (C) 2007  Brett Parker <iDunno@sommitrealweird.co.uk>
6
7 # This program is free software: you can redistribute it and/or modify
8 # it under the terms of the GNU General Public License as published by
9 # the Free Software Foundation, either version 3 of the License, or
10 # (at your option) any later version.
11
12 # This program is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 # GNU General Public License for more details.
16
17 # You should have received a copy of the GNU General Public License
18 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
19
20 import sys
21 import os
22 import stat
23 import httplib
24 import urllib
25
26 import feedparser
27
28 from email.MIMEMultipart import MIMEMultipart
29 from email.MIMEText import MIMEText
30
31 import datetime
32 import random
33 import string
34 import textwrap
35
36 import socket
37
38 from optparse import OptionParser
39 from ConfigParser import SafeConfigParser
40
41 from base64 import b64encode
42 import md5
43
44 import cgi
45 import dbm
46
47 from HTMLParser import HTMLParser
48
class HTML2Text(HTMLParser):
    """Render an HTML fragment as wrapped plain text.

    Character data is buffered in ``curdata`` and flushed into ``text`` by
    handle_curdata(), which formats it according to the innermost open tag
    (the last entry of ``opentags``).  Feed markup with feed() and collect
    the result with gettext().

    All internal text is kept as unicode; byte strings handed to
    handle_data() are decoded as UTF-8.
    """

    # Named entities translated by handle_entityref(); any other name is
    # passed through verbatim as "&name;".
    entities = {
        "amp": u"&",
        "lt": u"<",
        "gt": u">",
        "pound": u"\xa3",   # pound sign
        "copy": u"\xa9",    # copyright sign
        "apos": u"'",
        "quot": u"\"",
        "nbsp": u" ",
        }

    # Tags that start a new block of output.
    blockleveltags = [
        "h1",
        "h2",
        "h3",
        "h4",
        "h5",
        "h6",
        "pre",
        "p",
        "ul",
        "ol",
        "dl",
        "br",
        ]

    # Tags that open a list; they produce no output of their own.
    liststarttags = [
        "ul",
        "ol",
        "dl",
        ]

    # Tags that may contain other blocks (not consulted by the code below).
    cancontainflow = [
        "div",
        "li",
        "dd",
        "blockquote",
    ]

    def __init__(self, textwidth=70):
        """Create a converter that wraps its output at *textwidth* columns."""
        self.text = u''
        self.curdata = u''
        self.textwidth = textwidth
        self.opentags = []
        self.indentlevel = 0
        HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        """Flush pending data and push *tag* onto the open-tag stack."""
        tag_name = tag.lower()
        if tag_name in self.blockleveltags:
            # handle starting a new block - unless we're in a block element
            # that can contain other blocks, we'll assume that we want to
            # close the container
            if tag_name == u'br':
                self.handle_curdata()

            if len(self.opentags) > 0:
                self.handle_curdata()
                self.opentags.pop()
            self.opentags.append(tag_name)
        else:
            self.handle_curdata()
            self.opentags.append(tag_name)

    def handle_startendtag(self, tag, attrs):
        """Handle a self-closing tag; only ``<br/>`` has any effect."""
        if tag.lower() == u'br':
            # Push the tag only for the duration of the flush so that
            # handle_curdata() sees it as the innermost tag.
            self.opentags.append(u'br')
            self.handle_curdata() # just handle the data, don't do anything else
            self.opentags.pop()

    def handle_curdata(self):
        """Format the buffered data for the innermost open tag.

        The formatted text is appended to ``self.text`` and the buffer is
        emptied.  Nothing happens (and the buffer is kept) when no tag is
        open.
        """
        if len(self.opentags) == 0:
            return

        tag_thats_done = self.opentags[-1]

        # Block-level output is separated from what came before by a blank
        # line.
        if tag_thats_done in self.blockleveltags and self.text != u'':
            self.text = self.text + u'\n\n'

        if tag_thats_done in ("h1", "h2", "h3", "h4", "h5", "h6"):
            headingtext = u'\n'.join(
                textwrap.wrap(self.curdata.strip(), self.textwidth))

            if tag_thats_done == u'h1':
                underlinechar = u'='
            elif tag_thats_done == u'h2':
                underlinechar = u'-'
            else:
                underlinechar = u'~'

            # A wrapped heading is underlined across the full width,
            # a single-line one only under its own text.
            if u'\n' in headingtext:
                underline = underlinechar * self.textwidth
            else:
                underline = underlinechar * len(headingtext)
            self.text = self.text + headingtext + u'\n' + underline
        elif tag_thats_done == "p":
            self.text = self.text + u'\n'.join(
                textwrap.wrap(self.curdata.strip(), self.textwidth))
        elif tag_thats_done == "pre":
            # preformatted text is copied through untouched
            self.text = self.text + self.curdata
        elif tag_thats_done == "blockquote":
            # every wrapped line gets its own "> " prefix
            self.text = self.text + u'> ' + u'\n> '.join(
                textwrap.wrap(self.curdata.strip(), self.textwidth - 2))
        elif tag_thats_done == "li":
            if len(self.text) > 0 and self.text[-1] != u'\n':
                self.text = self.text + u'\n'
            # continuation lines are indented to line up after the bullet
            self.text = self.text + u' * ' + u'\n   '.join(
                textwrap.wrap(self.curdata.strip(), self.textwidth - 3))
        elif tag_thats_done in self.liststarttags:
            pass
        else:
            # we've got no idea what this tag does, so we'll
            # make an assumption that we're not going to know later
            if len(self.curdata) > 0:
                self.text = self.text + u' ... ' + u'\n ... '.join(
                    textwrap.wrap(self.curdata, self.textwidth - 5))

        # The data has been consumed whichever branch ran.
        self.curdata = u''

    def handle_endtag(self, tag):
        """Close *tag*, discarding everything opened since its last opening.

        NOTE(review): when *tag* is the innermost open tag nothing is
        flushed or popped here; its data is flushed by the next start tag
        or by gettext().  An end tag that was never opened is treated as if
        it matched the outermost open tag.  Both behaviours are preserved
        from the original code - confirm they are intended.
        """
        tag = tag.lower()
        # find the most recent opening of this tag
        tagindex = 0
        for position in range(len(self.opentags) - 1, -1, -1):
            if self.opentags[position] == tag:
                tagindex = position
                break

        if tagindex != len(self.opentags) - 1:
            # Assuming the data was for the last opened tag first
            self.handle_curdata()
            # Now kill the list to be a slice before this tag was opened
            self.opentags = self.opentags[:tagindex]

    def handle_data(self, data):
        """Buffer character data, decoding byte strings as UTF-8."""
        if not isinstance(data, unicode):
            data = unicode(data, "utf-8")
        self.curdata = self.curdata + data

    def handle_charref(self, name):
        """Buffer the character for a numeric reference (``&#163;``, ``&#xA3;``).

        References that cannot be converted are kept verbatim.
        """
        try:
            if name[:1] in ("x", "X"):
                character = unichr(int(name[1:], 16))
            else:
                character = unichr(int(name))
        except (ValueError, OverflowError):
            character = u'&#' + name + u';'
        self.curdata = self.curdata + character

    def handle_entityref(self, name):
        """Buffer the replacement for a named entity.

        Names missing from ``entities`` are kept verbatim as ``&name;``.
        """
        if name[:1] == "#":
            # numeric reference that reached us by the wrong route
            self.handle_charref(name[1:])
            return

        entity = HTML2Text.entities.get(name.lower())
        if entity is None:
            entity = u'&' + name + u';'
        self.curdata = self.curdata + entity

    def gettext(self):
        """Flush pending data and return the text, ending in a newline."""
        self.handle_curdata()
        if len(self.text) == 0 or self.text[-1] != u'\n':
            self.text = self.text + u'\n'
        self.opentags = []
        return self.text
228
def open_url(method, url):
    """Issue an HTTP(S) request for *url* and return the response.

    Makes at most three attempts.  A 301/302/303/307 answer switches the
    next attempt to the URL in its ``Location`` header (relative locations
    are resolved against the current URL); any other non-200 answer or a
    network error simply uses up an attempt.

    Parameters:
        method -- HTTP method name, e.g. "GET" or "HEAD"
        url    -- absolute http:// or https:// URL

    Returns the httplib response object of the first 200 answer, or None
    if no attempt succeeded or *url* has no host part.  The connection
    behind a returned response is left open so that the caller can read it.
    """
    import urlparse # stdlib; only needed for resolving redirects

    redirectcount = 0
    while redirectcount < 3:
        (scheme, rest) = urllib.splittype(url)
        (host, path) = urllib.splithost(rest or "")
        if not host:
            # not an absolute URL - there is nothing we can connect to
            return None
        (host, port) = urllib.splitport(host)

        if scheme == "https":
            connectionclass = httplib.HTTPSConnection
            defaultport = 443
        else:
            connectionclass = httplib.HTTPConnection
            defaultport = 80
        if port is None:
            port = defaultport
        if not path:
            # "http://example.org" has an empty path; request the root
            path = "/"

        conn = None
        try:
            conn = connectionclass("%s:%s" %(host, port))
            conn.request(method, path)
            response = conn.getresponse()
            if response.status == 200:
                return response
            if response.status in [301, 302, 303, 307]:
                location = response.getheader("location")
                if location:
                    url = urlparse.urljoin(url, location)
            # this response is not going to be used, so free the socket
            conn.close()
        except (httplib.HTTPException, socket.error):
            # best effort: a failed attempt just counts against the limit
            if conn is not None:
                conn.close()
        redirectcount = redirectcount + 1
    return None
252
def parse_and_deliver(maildir, url, statedir):
    """Fetch the feed at *url* and deliver its new items into *maildir*.

    Two dbm databases under *statedir* carry state between runs:

    "feeds" -- per feed URL, the change-detection headers (content-md5,
               etag, last-modified, content-length) of the last fetch.
               For a known feed a HEAD request is made first and the feed
               is only downloaded when one of those headers differs or the
               comparison cannot be made.
    "seen"  -- per item (keyed by feed URL plus guid and/or link), the
               message-id, creation date and md5 of the delivered content.
               An item whose content md5 is unchanged is skipped; a changed
               item is delivered again with a References header naming the
               earlier message.

    Each delivered item becomes one multipart/alternative message (plain
    text and HTML) written to <maildir>/tmp and then linked into
    <maildir>/new.

    Returns None.  A failed download is reported on stderr.  Both databases
    are closed on every exit path.
    """
    feedhandle = None
    headers = None
    # first check if we know about this feed already
    feeddb = dbm.open(os.path.join(statedir, "feeds"), "c")
    try:
        if feeddb.has_key(url):
            data = cgi.parse_qs(feeddb[url])
            response = open_url("HEAD", url)
            ischanged = False
            if response is None:
                # can't tell, so go and fetch it
                ischanged = True
            else:
                headers = response.getheaders()
                try:
                    for (name, value) in headers:
                        if name in ("content-length", "etag", \
                            "last-modified", "content-md5"):
                            if value != data[name][0]:
                                ischanged = True
                except (KeyError, IndexError):
                    # a header we have no stored value for
                    ischanged = True
            if not ischanged:
                return # don't need to do anything, nothings changed.

        response = open_url("GET", url)
        if response is None:
            sys.stderr.write("Failed to fetch feed: %s\n" %(url))
            return
        headers = response.getheaders()
        feedhandle = response

        fp = feedparser.parse(feedhandle)
        db = dbm.open(os.path.join(statedir, "seen"), "c")
        try:
            for item in fp["items"]:
                # have we seen it before?
                # need to work out what the content is first...

                if item.has_key("content"):
                    content = item["content"][0]["value"]
                else:
                    content = item["summary"]

                md5sum = md5.md5(content.encode("utf-8")).hexdigest()

                prevmessageid = None

                # check if there's a guid too - if that exists and we match
                # the md5, skip the item
                if item.has_key("guid"):
                    if db.has_key(url + "|" + item["guid"]):
                        data = cgi.parse_qs(db[url + "|" + item["guid"]])
                        if data["contentmd5"][0] == md5sum:
                            continue

                if db.has_key(url + "|" + item["link"]):
                    data = cgi.parse_qs(db[url + "|" + item["link"]])
                    if data.has_key("message-id"):
                        prevmessageid = data["message-id"][0]
                    if data["contentmd5"][0] == md5sum:
                        continue

                try:
                    author = item["author"]
                except KeyError:
                    author = url

                # create a basic email message
                msg = MIMEMultipart("alternative")
                messageid = "<" \
                    + datetime.datetime.now().strftime("%Y%m%d%H%M") \
                    + "." \
                    + "".join( \
                        [random.choice( \
                            string.ascii_letters + string.digits \
                            ) for a in range(0,6) \
                        ]) + "@" + socket.gethostname() + ">"
                msg.add_header("Message-ID", messageid)
                msg.set_unixfrom("\"%s\" <rss2maildir@localhost>" %(url))
                msg.add_header("From", \
                    "\"%s\" <rss2maildir@localhost>" %(author))
                msg.add_header("To", "\"%s\" <rss2maildir@localhost>" %(url))
                if prevmessageid:
                    msg.add_header("References", prevmessageid)
                # use the item's own timestamp when it has a usable one,
                # otherwise the time of delivery
                createddate = datetime.datetime.now() \
                    .strftime("%a, %e %b %Y %T -0000")
                try:
                    createddate = datetime.datetime( \
                        *item["updated_parsed"][0:6]) \
                        .strftime("%a, %e %b %Y %T -0000")
                except (KeyError, TypeError, ValueError):
                    pass
                msg.add_header("Date", createddate)
                msg.add_header("Subject", item["title"])
                msg.set_default_type("text/plain")

                htmlcontent = "%s\n\n<p>Item URL: <a href='%s'>%s</a></p>" %( \
                    content, \
                    item["link"], \
                    item["link"] )
                htmlpart = MIMEText( \
                    htmlcontent.encode("utf-8"), "html", "utf-8")
                textparser = HTML2Text()
                textparser.feed(content.encode("utf-8"))
                textcontent = textparser.gettext()
                textcontent = "%s\n\nItem URL: %s" %( \
                    textcontent, \
                    item["link"] )
                textpart = MIMEText( \
                    textcontent.encode("utf-8"), "plain", "utf-8")
                msg.attach(textpart)
                msg.attach(htmlpart)

                # start by working out the filename we should be writing
                # to, we do this following the normal maildir style rules
                fname = str(os.getpid()) \
                    + "." + socket.gethostname() \
                    + "." + "".join( \
                        [random.choice( \
                            string.ascii_letters + string.digits \
                            ) for a in range(0,10) \
                        ]) + "." \
                    + datetime.datetime.now().strftime('%s')
                fn = os.path.join(maildir, "tmp", fname)
                fh = open(fn, "w")
                try:
                    fh.write(msg.as_string())
                finally:
                    fh.close()
                # now move it in to the new directory
                newfn = os.path.join(maildir, "new", fname)
                os.link(fn, newfn)
                os.unlink(fn)

                # now add to the database about the item
                if prevmessageid:
                    messageid = prevmessageid + " " + messageid
                data = urllib.urlencode(( \
                    ("message-id", messageid), \
                    ("created", createddate), \
                    ("contentmd5", md5sum) \
                    ))
                if item.has_key("guid") and item["guid"] != item["link"]:
                    db[url + "|" + item["guid"]] = data
                    # the link record keeps its own created/md5 values and
                    # only picks up the new message-id chain; if there is
                    # no usable link record yet it gets the new data
                    try:
                        olddata = cgi.parse_qs(db[url + "|" + item["link"]])
                        linkdata = urllib.urlencode(( \
                            ("message-id", messageid), \
                            ("created", olddata["created"][0]), \
                            ("contentmd5", olddata["contentmd5"][0]) \
                            ))
                    except (KeyError, IndexError):
                        linkdata = data
                    db[url + "|" + item["link"]] = linkdata
                else:
                    db[url + "|" + item["link"]] = data
        finally:
            db.close()

        # remember the headers of this fetch for the next run's HEAD check
        if headers:
            data = []
            for header in headers:
                if header[0] in ["content-md5", "etag", \
                    "last-modified", "content-length"]:
                    data.append((header[0], header[1]))
            if len(data) > 0:
                data = urllib.urlencode(data)
                feeddb[url] = data
    finally:
        feeddb.close()
440
def ensure_directory(path, notdir_message, create_message):
    """Make sure *path* is a directory, creating it if it is missing.

    Both messages are format strings taking the path as their single
    argument.  *notdir_message* is written to stderr when *path* exists but
    is not a directory, *create_message* when it cannot be created; in
    either case the program exits with status 1.
    """
    try:
        mode = os.stat(path)[stat.ST_MODE]
    except OSError:
        try:
            os.mkdir(path)
        except OSError:
            sys.stderr.write(create_message %(path,))
            sys.exit(1)
        return

    if not stat.S_ISDIR(mode):
        sys.stderr.write(notdir_message %(path,))
        sys.exit(1)


def ensure_maildir(maildir, section):
    """Make sure *maildir* has the maildir layout (new, cur and tmp).

    A missing maildir is created together with its three subdirectories,
    and missing subdirectories of an existing maildir are added; a failure
    to create any of them is fatal (exit status 1).

    Returns True when the maildir can be delivered to.  Returns False,
    after reporting "Broken maildir" on stderr, when *maildir* or one of
    its subdirectories exists but is not a directory.
    """
    try:
        mode = os.stat(maildir)[stat.ST_MODE]
    except OSError:
        # nothing there yet - build the whole thing
        try:
            os.mkdir(maildir)
        except OSError:
            sys.stderr.write("Couldn't create root maildir %s\n" \
                %(maildir))
            sys.exit(1)
        try:
            os.mkdir(os.path.join(maildir, "new"))
            os.mkdir(os.path.join(maildir, "cur"))
            os.mkdir(os.path.join(maildir, "tmp"))
        except OSError:
            sys.stderr.write( \
                "Couldn't create required maildir directories for %s\n" \
                %(section,))
            sys.exit(1)
        return True

    if not stat.S_ISDIR(mode):
        sys.stderr.write("Broken maildir: %s\n" %(maildir))
        return False

    # check if there's a new, cur and tmp directory
    usable = True
    for subdirectory in ("cur", "tmp", "new"):
        path = os.path.join(maildir, subdirectory)
        try:
            mode = os.stat(path)[stat.ST_MODE]
        except OSError:
            try:
                os.mkdir(path)
            except OSError:
                sys.stderr.write( \
                    "Couldn't create required maildir directories for %s\n" \
                    %(section,))
                sys.exit(1)
            continue
        if not stat.S_ISDIR(mode):
            sys.stderr.write("Broken maildir: %s\n" %(maildir))
            usable = False
    return usable


if __name__ == "__main__":
    # This only gets executed if we really called the program
    # first off, parse the command line arguments

    oparser = OptionParser()
    oparser.add_option(
        "-c", "--conf", dest="conf",
        help="location of config file"
        )
    oparser.add_option(
        "-s", "--statedir", dest="statedir",
        help="location of directory to store state in"
        )

    (options, args) = oparser.parse_args()

    # check for the configfile

    configfile = None

    if options.conf != None:
        # does the file exist?
        if not os.path.exists(options.conf):
            sys.stderr.write( \
                "Config file %s does not exist. Exiting.\n" %(options.conf,))
            sys.exit(2)
        configfile = options.conf
    else:
        # check through the default locations, the user's own file first
        candidates = []
        if "HOME" in os.environ:
            candidates.append( \
                "%s/.rss2maildir.conf" %(os.environ["HOME"],))
        candidates.append("/etc/rss2maildir.conf")
        for candidate in candidates:
            if os.path.exists(candidate):
                configfile = candidate
                break
        if configfile == None:
            sys.stderr.write("No config file found. Exiting.\n")
            sys.exit(2)

    # Right - if we've got this far, we've got a config file, now for the hard
    # bits...

    scp = SafeConfigParser()
    scp.read(configfile)

    maildir_root = "RSSMaildir"
    state_dir = "state"

    # the command line wins over the config file, which wins over the
    # built-in default
    if options.statedir != None:
        state_dir = options.statedir
        ensure_directory(state_dir, \
            "State directory (%s) is not a directory\n", \
            "Couldn't create statedir %s\n")
    elif scp.has_option("general", "state_dir"):
        state_dir = scp.get("general", "state_dir")
        ensure_directory(state_dir, \
            "State directory (%s) is not a directory\n", \
            "Couldn't create state directory %s\n")
    else:
        ensure_directory(state_dir, \
            "State directory %s is not a directory\n", \
            "State directory %s could not be created\n")

    if scp.has_option("general", "maildir_root"):
        maildir_root = scp.get("general", "maildir_root")

    ensure_directory(maildir_root, \
        "Maildir Root %s is not a directory\n", \
        "Couldn't create Maildir Root %s\n")

    # every section other than "general" names a feed by its URL
    feeds = scp.sections()
    try:
        feeds.remove("general")
    except ValueError:
        pass

    for section in feeds:
        # the maildir defaults to being named after the feed
        if scp.has_option(section, "maildir"):
            maildir = scp.get(section, "maildir")
        else:
            maildir = section

        # quote the name so that it is a single safe path component
        maildir = urllib.quote_plus(maildir)
        maildir = os.path.join(maildir_root, maildir)

        if not ensure_maildir(maildir, section):
            # already reported; there is nowhere to deliver this feed to
            continue

        # right - we've got the directories, we've got the section, we know the
        # url... lets play!

        parse_and_deliver(maildir, section, state_dir)