* Serious reworking of HTML2Text to handle nested lists reasonably
[rss2maildir.git] / rss2maildir.py
1 #!/usr/bin/python
2 # coding=utf-8
3
4 # rss2maildir.py - RSS feeds to Maildir 1 email per item
5 # Copyright (C) 2007  Brett Parker <iDunno@sommitrealweird.co.uk>
6
7 # This program is free software: you can redistribute it and/or modify
8 # it under the terms of the GNU General Public License as published by
9 # the Free Software Foundation, either version 3 of the License, or
10 # (at your option) any later version.
11
12 # This program is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 # GNU General Public License for more details.
16
17 # You should have received a copy of the GNU General Public License
18 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
19
20 import sys
21 import os
22 import stat
23 import httplib
24 import urllib
25
26 import feedparser
27
28 from email.MIMEMultipart import MIMEMultipart
29 from email.MIMEText import MIMEText
30
31 import datetime
32 import random
33 import string
34 import textwrap
35
36 import socket
37
38 from optparse import OptionParser
39 from ConfigParser import SafeConfigParser
40
41 from base64 import b64encode
42 import md5
43
44 import cgi
45 import dbm
46
47 from HTMLParser import HTMLParser
48
class HTML2Text(HTMLParser):
    """Render an HTML fragment as wrapped plain text.

    Feed markup with feed() and collect the result with gettext().  The
    parser keeps a stack of open tags (opentags) and buffers character data
    (curdata) until a tag boundary, at which point handle_curdata() formats
    the buffer according to the innermost open tag:

    * h1-h6 are underlined with '=', '-' or '~'
    * p is wrapped to textwidth
    * pre is copied verbatim
    * blockquote lines are prefixed with '> '
    * li gets a ' * ' bullet or a running number, indented for nested lists
    * dt / dd become 'term::' followed by an indented definition

    Text inside any other element (inline markup included) is emitted on
    its own, prefixed with ' ... '.

    All text is handled as unicode internally; UTF-8 byte strings passed in
    by the parser are decoded on arrival.
    """

    # Named entities that are replaced in the text; anything else is left
    # in the output as "&name;".
    entities = {
        "amp": "&",
        "lt": "<",
        "gt": ">",
        "pound": "£",
        "copy": "©",
        "apos": "'",
        "quot": "\"",
        "nbsp": " ",
        }

    # Elements that start a new block of output.
    blockleveltags = [
        "h1",
        "h2",
        "h3",
        "h4",
        "h5",
        "h6",
        "pre",
        "p",
        "ul",
        "ol",
        "dl",
        "br",
        ]

    # Elements that open a list and therefore affect indentation.
    liststarttags = [
        "ul",
        "ol",
        "dl",
        ]

    # Elements that may contain block-level children; opening a block
    # inside one of these does not implicitly close it.
    cancontainflow = [
        "div",
        "li",
        "dd",
        "blockquote",
    ]

    def __init__(self,textwidth=70):
        """Create a converter that wraps its output at textwidth columns."""
        self.text = u''
        self.curdata = u''
        self.textwidth = textwidth
        self.opentags = []
        self.indentlevel = 0
        self.listcount = []
        HTMLParser.__init__(self)

    def _to_unicode(self, value):
        """Return value as unicode text, decoding UTF-8 byte strings."""
        # type(u'') is the text type on both Python 2 and Python 3.
        if isinstance(value, type(u'')):
            return value
        return value.decode("utf-8")

    def _unichr(self, codepoint):
        """Return the one-character text for codepoint."""
        try:
            return unichr(codepoint)
        except NameError:
            # Python 3 spells this chr().
            return chr(codepoint)

    def _enclosing_list_indent(self, tags):
        """Return the indent added by the innermost list in tags.

        A list nested inside an ol or dl is indented by 4 columns, one
        nested inside a ul by 3; with no enclosing list the result is 0.
        """
        for name in reversed(tags):
            if name in (u'ol', u'dl'):
                return 4
            if name == u'ul':
                return 3
        return 0

    def handle_starttag(self, tag, attrs):
        """Record an opening tag, flushing the text collected before it."""
        tag_name = tag.lower()

        # The buffered text belongs to the element that is still innermost,
        # so it has to be emitted before the stack or the indentation change
        # (otherwise a parent list item is rendered at its child's indent).
        self.handle_curdata()

        if tag_name == u'br':
            # <br> is a void element: it only ends the current run of text
            # and must never be left on the stack of open tags.
            return

        if tag_name not in self.blockleveltags:
            self.opentags.append(tag_name)
            return

        if tag_name == u'ol':
            self.listcount.append(1)
            # Not read anywhere in this class; kept for existing users.
            self.listlevel = len(self.listcount) - 1

        if tag_name in self.liststarttags:
            self.indentlevel = self.indentlevel \
                + self._enclosing_list_indent(self.opentags)

        # A new block implicitly closes the innermost element unless that
        # element may contain blocks.  Lists are only ever closed by their
        # own end tag so that handle_endtag can undo their indentation.
        if len(self.opentags) > 0 \
            and self.opentags[-1] not in self.cancontainflow \
            and self.opentags[-1] not in self.liststarttags:
            self.opentags.pop()
        self.opentags.append(tag_name)

    def handle_startendtag(self, tag, attrs):
        """Handle a self-closing tag; only <br/> has any effect."""
        if tag.lower() == u'br':
            # just handle the data, don't do anything else
            self.handle_curdata()

    def handle_curdata(self):
        """Format the buffered text for the innermost open tag.

        Does nothing when no tag is open or the buffer holds only
        whitespace.  Otherwise the rendered text is appended to self.text
        and the buffer is cleared.
        """
        if len(self.opentags) == 0:
            return

        if len(self.curdata.strip()) == 0:
            return

        tag_thats_done = self.opentags[-1]
        indent = u' ' * self.indentlevel
        width = self.textwidth - self.indentlevel

        if tag_thats_done in self.blockleveltags:
            # separate this block from earlier output with a blank line
            if len(self.text) > 0 \
                and self.text[-1] != u'\n' \
                and self.text[-2:-1] != u'\n':
                self.text = self.text + u'\n\n'

        if tag_thats_done in ["h1", "h2", "h3", "h4", "h5", "h6"]:
            seperator = u'\n' + indent
            headingtext = seperator.join( \
                textwrap.wrap(self.curdata.strip(), width))

            underlinechar = u'~'
            if tag_thats_done == u'h1':
                underlinechar = u'='
            elif tag_thats_done == u'h2':
                underlinechar = u'-'

            if u'\n' in headingtext:
                # wrapped headings are underlined across the full width
                underline = indent + underlinechar * width
            else:
                underline = indent + underlinechar * len(headingtext)
            self.text = self.text + headingtext + u'\n' + underline
        elif tag_thats_done == "p":
            seperator = u'\n' + indent
            self.text = self.text \
                + indent \
                + seperator.join(textwrap.wrap(self.curdata.strip(), width))
        elif tag_thats_done == "pre":
            self.text = self.text + self.curdata
        elif tag_thats_done == "blockquote":
            seperator = u'\n' + indent + u'> '
            self.text = self.text \
                + u'> ' \
                + seperator.join( \
                    textwrap.wrap(self.curdata.strip(), width - 2))
        elif tag_thats_done == "li":
            item = self.curdata.strip()
            if len(self.text) > 0 and self.text[-1] != u'\n':
                self.text = self.text + u'\n'

            # the innermost enclosing list decides bullet versus number
            isul = False
            for thing in reversed(self.opentags):
                if thing == u'ul':
                    isul = True
                    break
                elif thing == u'ol':
                    break
            if not isul and len(self.listcount) == 0:
                # an <li> outside any <ol> has no counter; use a bullet
                isul = True

            if isul:
                listindent = 3
                listmarker = u' * '
            else:
                listindent = 4
                listmarker = u' %2d. ' %(self.listcount[-1])
                self.listcount[-1] = self.listcount[-1] + 1

            seperator = u'\n' + indent + u' ' * listindent
            self.text = self.text \
                + indent \
                + listmarker \
                + seperator.join(textwrap.wrap(item, width - listindent))
        elif tag_thats_done == "dt":
            definition = self.curdata.strip()
            if len(self.text) > 0 and self.text[-1] != u'\n':
                self.text = self.text + u'\n\n'
            elif len(self.text) > 1 and self.text[-2] != u'\n':
                self.text = self.text + u'\n'
            definition = definition + u'::'
            self.text = self.text \
                + u'\n '.join(
                    textwrap.wrap(definition, self.textwidth - 1))
        elif tag_thats_done == "dd":
            definition = self.curdata.strip()
            if len(self.text) > 0 and self.text[-1] != u'\n':
                self.text = self.text + u'\n'
            self.text = self.text \
                + u'    ' \
                + u'\n    '.join(textwrap.wrap(definition, width - 4))
        elif tag_thats_done in self.liststarttags:
            # text directly inside a list, outside any item, is dropped
            pass
        else:
            # we've got no idea what this tag does, so we'll
            # make an assumption that we're not going to know later
            self.text = self.text \
                + u' ... ' \
                + u'\n ... '.join( \
                    textwrap.wrap(self.curdata, self.textwidth - 5))

        # Every branch consumes the buffer; leaving it behind would emit
        # the same text again with the next element.
        self.curdata = u''

    def handle_endtag(self, tag):
        """Close the most recently opened tag with this name.

        Pending text is flushed first, while the stack, list counters and
        indentation still describe the element it belongs to.  Elements
        left open inside the closed one are closed with it.  An end tag
        with no matching open tag only flushes the pending text.
        """
        tag = tag.lower()

        self.handle_curdata()

        tagindex = None
        for position, name in enumerate(self.opentags):
            if name == tag:
                tagindex = position
        if tagindex is None:
            # closing tag we know nothing about.
            return

        # Undo, innermost first, the list state of everything being closed.
        for position in range(len(self.opentags) - 1, tagindex - 1, -1):
            closing = self.opentags[position]
            if closing in self.liststarttags:
                self.indentlevel = self.indentlevel \
                    - self._enclosing_list_indent(self.opentags[:position])
            if closing == u'ol':
                self.listcount = self.listcount[:-1]

        # Now kill the list to be a slice before this tag was opened
        self.opentags = self.opentags[:tagindex]

    def handle_data(self, data):
        """Append character data to the buffer."""
        self.curdata = self.curdata + self._to_unicode(data)

    def handle_entityref(self, name):
        """Append a named entity, or the literal reference if unknown."""
        key = name.lower()
        if key in HTML2Text.entities:
            entity = HTML2Text.entities[key]
        else:
            entity = "&" + name + ";"

        self.curdata = self.curdata + self._to_unicode(entity)

    def handle_charref(self, name):
        """Append the character for a numeric reference (&#65; or &#x41;).

        References that do not name a valid character are kept literally.
        """
        try:
            if name[:1] in (u'x', u'X'):
                codepoint = int(name[1:], 16)
            else:
                codepoint = int(name)
            char = self._unichr(codepoint)
        except (ValueError, OverflowError):
            char = u'&#' + self._to_unicode(name) + u';'

        self.curdata = self.curdata + char

    def gettext(self):
        """Flush pending text and return the output ending in one newline.

        The stack of open tags is reset as a side effect.
        """
        self.handle_curdata()
        if len(self.text) == 0 or self.text[-1] != u'\n':
            self.text = self.text + u'\n'
        self.opentags = []
        if len(self.text) > 0:
            while len(self.text) > 1 and self.text[-1] == u'\n':
                self.text = self.text[:-1]
            self.text = self.text + u'\n'
        return self.text
356
def open_url(method, url):
    """Issue an HTTP(S) request for url and return the 200 response.

    method is the HTTP verb ("GET", "HEAD", ...).  Redirects (301, 302,
    303, 307) are followed, with relative Location values resolved against
    the current URL.  At most three requests are made in total; network
    errors and other status codes count as an attempt and are retried.

    Returns the httplib response object on status 200, otherwise None.
    None is also returned when url has no host part, or when it is an
    https URL and this Python build has no SSL support.
    """
    redirectcount = 0
    while redirectcount < 3:
        (scheme, rest) = urllib.splittype(url)
        (host, path) = urllib.splithost(rest)
        if not host:
            # nothing to connect to (e.g. a URL without "scheme://host")
            return None
        (host, port) = urllib.splitport(host)

        if scheme == "https":
            connection_class = getattr(httplib, "HTTPSConnection", None)
            if connection_class is None:
                return None
            default_port = 443
        else:
            connection_class = httplib.HTTPConnection
            default_port = 80
        if port is None:
            port = default_port

        # "http://host" and "http://host?query" have no leading slash, but
        # the request line needs an absolute path
        if not path.startswith("/"):
            path = "/" + path

        conn = None
        try:
            conn = connection_class("%s:%s" %(host, port))
            conn.request(method, path)
            response = conn.getresponse()
            if response.status == 200:
                # the caller reads from the response, so leave it open
                return response
            if response.status in [301, 302, 303, 307]:
                location = response.getheader("location")
                if location:
                    url = urllib.basejoin(url, location)
            conn.close()
        except (httplib.HTTPException, socket.error):
            if conn is not None:
                conn.close()
        redirectcount = redirectcount + 1
    return None
380
# Response headers used to decide whether a feed changed since the last run.
_VALIDATOR_HEADERS = ("content-md5", "etag", "last-modified", "content-length")


def _feed_changed(headers, known):
    """Return True unless headers prove the feed matches the stored state.

    headers is a list of (name, value) pairs from a HEAD request, known the
    parsed query-string dict stored for the feed.  A response that carries
    none of the validator headers proves nothing and counts as changed.
    """
    compared = False
    for (name, value) in headers:
        if name in _VALIDATOR_HEADERS:
            compared = True
            if known.get(name, [None])[0] != value:
                return True
    return not compared


def _random_token(length):
    """Return length random ASCII letters and digits."""
    return "".join( \
        [random.choice(string.ascii_letters + string.digits) \
            for a in range(0, length)])


def parse_and_deliver(maildir, url, statedir):
    """Fetch the feed at url and deliver new or changed items to maildir.

    statedir holds two dbm files: "feeds" remembers the validator headers
    of each feed so unchanged feeds are skipped after a HEAD request, and
    "seen" remembers the content md5 and Message-ID of each item, keyed by
    feed url plus item guid and/or link.  Every item whose content is new
    is written as a multipart (plain text + HTML) message to maildir/tmp
    and then linked into maildir/new; an updated item references the
    Message-ID of its previous version.

    Failure to fetch the feed is reported on stderr and is not fatal.
    """
    feeddb = dbm.open(os.path.join(statedir, "feeds"), "c")
    try:
        # first check if we know about this feed already
        if feeddb.has_key(url):
            known = cgi.parse_qs(feeddb[url])
            response = open_url("HEAD", url)
            # a failed HEAD tells us nothing, so fall through to the GET
            if response is not None \
                and not _feed_changed(response.getheaders(), known):
                return # don't need to do anything, nothings changed.

        response = open_url("GET", url)
        if response is None:
            sys.stderr.write("Failed to fetch feed: %s\n" %(url))
            return
        headers = response.getheaders()

        fp = feedparser.parse(response)
        db = dbm.open(os.path.join(statedir, "seen"), "c")
        try:
            for item in fp["items"]:
                # have we seen it before?
                # need to work out what the content is first...
                if item.has_key("content"):
                    content = item["content"][0]["value"]
                else:
                    content = item["summary"]

                md5sum = md5.md5(content.encode("utf-8")).hexdigest()

                prevmessageid = None

                # check if there's a guid too - if that exists and we match
                # the md5, skip the item
                guidkey = None
                if item.has_key("guid"):
                    guidkey = url + "|" + item["guid"]
                    if db.has_key(guidkey):
                        data = cgi.parse_qs(db[guidkey])
                        if data["contentmd5"][0] == md5sum:
                            continue

                linkkey = url + "|" + item["link"]
                if db.has_key(linkkey):
                    data = cgi.parse_qs(db[linkkey])
                    if data.has_key("message-id"):
                        prevmessageid = data["message-id"][0]
                    if data["contentmd5"][0] == md5sum:
                        continue

                try:
                    author = item["author"]
                except KeyError:
                    author = url

                # create a basic email message
                msg = MIMEMultipart("alternative")
                messageid = "<" \
                    + datetime.datetime.now().strftime("%Y%m%d%H%M") \
                    + "." + _random_token(6) \
                    + "@" + socket.gethostname() + ">"
                msg.add_header("Message-ID", messageid)
                msg.set_unixfrom("\"%s\" <rss2maildir@localhost>" %(url))
                msg.add_header( \
                    "From", "\"%s\" <rss2maildir@localhost>" %(author))
                msg.add_header("To", "\"%s\" <rss2maildir@localhost>" %(url))
                if prevmessageid:
                    msg.add_header("References", prevmessageid)
                # The header is labelled -0000, so the fallback must be UTC
                # just like the feed's own parsed timestamp.
                try:
                    created = datetime.datetime(*item["updated_parsed"][0:6])
                except (KeyError, TypeError, ValueError):
                    created = datetime.datetime.utcnow()
                createddate = created.strftime("%a, %e %b %Y %T -0000")
                msg.add_header("Date", createddate)
                msg.add_header("Subject", item["title"])
                msg.set_default_type("text/plain")

                htmlcontent = "%s\n\n<p>Item URL: <a href='%s'>%s</a></p>" %( \
                    content, \
                    item["link"], \
                    item["link"] )
                htmlpart = MIMEText( \
                    htmlcontent.encode("utf-8"), "html", "utf-8")
                textparser = HTML2Text()
                textparser.feed(content.encode("utf-8"))
                textcontent = textparser.gettext()
                textcontent = "%s\n\nItem URL: %s" %( \
                    textcontent, \
                    item["link"] )
                textpart = MIMEText( \
                    textcontent.encode("utf-8"), "plain", "utf-8")
                msg.attach(textpart)
                msg.attach(htmlpart)

                # start by working out the filename we should be writing
                # to, we do this following the normal maildir style rules
                fname = str(os.getpid()) \
                    + "." + socket.gethostname() \
                    + "." + _random_token(10) \
                    + "." + datetime.datetime.now().strftime('%s')
                fn = os.path.join(maildir, "tmp", fname)
                fh = open(fn, "w")
                try:
                    fh.write(msg.as_string())
                finally:
                    fh.close()
                # now move it in to the new directory
                newfn = os.path.join(maildir, "new", fname)
                os.link(fn, newfn)
                os.unlink(fn)

                # now add to the database about the item
                if prevmessageid:
                    messageid = prevmessageid + " " + messageid
                record = urllib.urlencode(( \
                    ("message-id", messageid), \
                    ("created", createddate), \
                    ("contentmd5", md5sum) \
                    ))
                if guidkey is not None and item["guid"] != item["link"]:
                    db[guidkey] = record
                    # the link entry keeps its own creation date and md5 and
                    # only learns the new message id; without a usable old
                    # entry it gets the same record as the guid
                    try:
                        old = cgi.parse_qs(db[linkkey])
                        linkrecord = urllib.urlencode(( \
                            ("message-id", messageid), \
                            ("created", old["created"][0]), \
                            ("contentmd5", old["contentmd5"][0]) \
                            ))
                    except KeyError:
                        linkrecord = record
                    db[linkkey] = linkrecord
                else:
                    db[linkkey] = record
        finally:
            db.close()

        # remember the validators for the next run's HEAD comparison
        data = []
        for header in headers:
            if header[0] in _VALIDATOR_HEADERS:
                data.append((header[0], header[1]))
        if len(data) > 0:
            feeddb[url] = urllib.urlencode(data)
    finally:
        feeddb.close()
568
def ensure_directory(path, notdir_msg, mkdir_msg):
    """Make sure path is a directory, creating it when it does not exist.

    notdir_msg and mkdir_msg are format strings taking the path; the
    matching one is written to stderr and the program exits with status 1
    when path exists but is not a directory, or cannot be created.
    """
    try:
        mode = os.stat(path)[stat.ST_MODE]
    except OSError:
        try:
            os.mkdir(path)
        except OSError:
            sys.stderr.write(mkdir_msg %(path,))
            sys.exit(1)
        return
    if not stat.S_ISDIR(mode):
        sys.stderr.write(notdir_msg %(path,))
        sys.exit(1)


def prepare_maildir(maildir, section):
    """Make sure maildir exists with its new, cur and tmp subdirectories.

    Missing directories are created; the program exits with status 1 when
    that fails.  Returns True when the maildir can be delivered to, False
    (after reporting "Broken maildir" on stderr) when maildir or one of
    its subdirectories exists but is not a directory.  section is only
    used in error messages.
    """
    try:
        mode = os.stat(maildir)[stat.ST_MODE]
    except OSError:
        try:
            os.mkdir(maildir)
        except OSError:
            sys.stderr.write("Couldn't create root maildir %s\n" \
                %(maildir))
            sys.exit(1)
        try:
            os.mkdir(os.path.join(maildir, "new"))
            os.mkdir(os.path.join(maildir, "cur"))
            os.mkdir(os.path.join(maildir, "tmp"))
        except OSError:
            sys.stderr.write( \
                "Couldn't create required maildir directories for %s\n" \
                %(section,))
            sys.exit(1)
        return True

    if not stat.S_ISDIR(mode):
        sys.stderr.write("Broken maildir: %s\n" %(maildir))
        return False

    # check if there's a new, cur and tmp directory
    usable = True
    for name in ("cur", "tmp", "new"):
        subdir = os.path.join(maildir, name)
        try:
            submode = os.stat(subdir)[stat.ST_MODE]
        except OSError:
            try:
                os.mkdir(subdir)
            except OSError:
                sys.stderr.write( \
                    "Couldn't create required maildir directories for %s\n" \
                    %(section,))
                sys.exit(1)
            continue
        if not stat.S_ISDIR(submode):
            sys.stderr.write("Broken maildir: %s\n" %(maildir))
            usable = False
    return usable


if __name__ == "__main__":
    # This only gets executed if we really called the program
    # first off, parse the command line arguments

    oparser = OptionParser()
    oparser.add_option(
        "-c", "--conf", dest="conf",
        help="location of config file"
        )
    oparser.add_option(
        "-s", "--statedir", dest="statedir",
        help="location of directory to store state in"
        )

    (options, args) = oparser.parse_args()

    # check for the configfile

    configfile = None

    if options.conf is not None:
        # does the file exist?
        if not os.path.exists(options.conf):
            sys.stderr.write( \
                "Config file %s does not exist. Exiting.\n" %(options.conf,))
            sys.exit(2)
        configfile = options.conf
    else:
        # check through the default locations
        candidates = []
        if "HOME" in os.environ:
            candidates.append( \
                "%s/.rss2maildir.conf" %(os.environ["HOME"],))
        candidates.append("/etc/rss2maildir.conf")
        for candidate in candidates:
            if os.path.exists(candidate):
                configfile = candidate
                break
        if configfile is None:
            sys.stderr.write("No config file found. Exiting.\n")
            sys.exit(2)

    # Right - if we've got this far, we've got a config file, now for the hard
    # bits...

    scp = SafeConfigParser()
    scp.read(configfile)

    maildir_root = "RSSMaildir"
    state_dir = "state"

    # the command line wins over the config file, which wins over the default
    if options.statedir is not None:
        state_dir = options.statedir
        ensure_directory( \
            state_dir, \
            "State directory (%s) is not a directory\n", \
            "Couldn't create statedir %s\n")
    elif scp.has_option("general", "state_dir"):
        state_dir = scp.get("general", "state_dir")
        ensure_directory( \
            state_dir, \
            "State directory (%s) is not a directory\n", \
            "Couldn't create state directory %s\n")
    else:
        ensure_directory( \
            state_dir, \
            "State directory %s is not a directory\n", \
            "State directory %s could not be created\n")

    if scp.has_option("general", "maildir_root"):
        maildir_root = scp.get("general", "maildir_root")

    ensure_directory( \
        maildir_root, \
        "Maildir Root %s is not a directory\n", \
        "Couldn't create Maildir Root %s\n")

    # every section other than "general" is a feed, named by its url
    for section in scp.sections():
        if section == "general":
            continue

        if scp.has_option(section, "maildir"):
            maildir = scp.get(section, "maildir")
        else:
            maildir = section

        # quote the name so a url can safely be used as a directory name
        maildir = urllib.quote_plus(maildir)
        maildir = os.path.join(maildir_root, maildir)

        if not prepare_maildir(maildir, section):
            # already reported as broken; delivering into it would only fail
            continue

        # right - we've got the directories, we've got the section, we know the
        # url... lets play!

        parse_and_deliver(maildir, section, state_dir)