* Add unittest for unordered list
[rss2maildir.git] / rss2maildir.py
1 #!/usr/bin/python
2 # coding=utf-8
3
4 # rss2maildir.py - RSS feeds to Maildir 1 email per item
5 # Copyright (C) 2007  Brett Parker <iDunno@sommitrealweird.co.uk>
6
7 # This program is free software: you can redistribute it and/or modify
8 # it under the terms of the GNU General Public License as published by
9 # the Free Software Foundation, either version 3 of the License, or
10 # (at your option) any later version.
11
12 # This program is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 # GNU General Public License for more details.
16
17 # You should have received a copy of the GNU General Public License
18 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
19
20 import sys
21 import os
22 import stat
23 import httplib
24 import urllib
25
26 import feedparser
27
28 from email.MIMEMultipart import MIMEMultipart
29 from email.MIMEText import MIMEText
30
31 import datetime
32 import random
33 import string
34 import textwrap
35
36 import socket
37
38 from optparse import OptionParser
39 from ConfigParser import SafeConfigParser
40
41 from base64 import b64encode
42 import md5
43
44 import cgi
45 import dbm
46
47 from HTMLParser import HTMLParser
48
class HTML2Text(HTMLParser):
    """Convert an HTML fragment into wrapped plain text.

    Feed markup with feed() and collect the result with gettext().
    Paragraphs are wrapped at 70 columns, block quotes are prefixed with
    "> " and wrapped at 68, unordered list items are prefixed with " * "
    and wrapped at 67, and headings are underlined with "=" (h1), "-" (h2)
    or "~" (h3-h6).  Text inside <pre> is copied through unchanged.

    All internal buffers hold unicode text; byte strings handed to the
    handlers are decoded as UTF-8 exactly once, on the way in.
    """

    entities = {
        "amp": "&",
        "lt": "<",
        "gt": ">",
        "pound": "£",
        "copy": "©",
        "apos": "'",
        "quot": "\"",
        "nbsp": " ",
        }

    # unichr() only exists on Python 2; chr() is its equivalent elsewhere.
    try:
        _unichr = staticmethod(unichr)
    except NameError:
        _unichr = staticmethod(chr)

    def __init__(self):
        self.inheadingone = False
        self.inheadingtwo = False
        self.inotherheading = False
        # text seen before any block level tag is treated as a paragraph
        self.inparagraph = True
        self.inblockquote = False
        self.inlink = False
        self.text = u''
        self.currentparagraph = u''
        self.headingtext = u''
        self.blockquote = u''
        self.inpre = False
        self.inul = False
        self.initem = False
        self.item = u''
        HTMLParser.__init__(self)

    def _to_text(self, data):
        """Return data as unicode text, decoding byte strings as UTF-8."""
        if isinstance(data, type(u'')):
            return data
        return data.decode("utf-8")

    def _spaced(self, data):
        """Strip data and add one trailing space; u'' if nothing is left.

        Whitespace-only text nodes (typically the newlines between tags)
        therefore add nothing instead of a stray space.
        """
        stripped = data.strip()
        if not stripped:
            return u''
        return stripped + u' '

    def _inheading(self):
        """Return True while inside any of h1-h6."""
        return self.inheadingone or self.inheadingtwo or self.inotherheading

    def _flush_paragraph(self):
        """Append the pending paragraph, wrapped at 70 columns, to the text."""
        self.text = self.text \
            + u'\n'.join(textwrap.wrap(self.currentparagraph, 70))
        self.currentparagraph = u''

    def _flush_blockquote(self):
        """Append the pending block quote as "> " prefixed lines."""
        self.text = self.text \
            + u'\n> ' \
            + u'\n> '.join( \
                [a.strip() for a in textwrap.wrap(self.blockquote, 68)]) \
            + u'\n'
        self.blockquote = u''

    def _flush_item(self):
        """Append the pending list item as a " * " bullet, if it has text."""
        if self.item.strip() != u'':
            self.text = self.text \
                + u' * ' \
                + u'\n   '.join( \
                    [a.strip() for a in textwrap.wrap(self.item, 67)]) \
                + u'\n'
        self.item = u''

    def _end_heading(self, underline):
        """Append the pending heading, underlined with the given character.

        The underline length is the number of characters in the heading,
        not the number of bytes in its UTF-8 encoding.
        """
        self.text = self.text \
            + u'\n\n' \
            + self.headingtext \
            + u'\n' \
            + underline * len(self.headingtext.strip())
        self.headingtext = u''

    def _append_inline(self, fragment):
        """Append a resolved entity to whichever buffer is collecting text.

        The buffers are tried in the same order handle_data uses, so an
        entity always lands next to the text that surrounds it.
        """
        if self._inheading():
            self.headingtext = self.headingtext + fragment
        elif self.inblockquote:
            self.blockquote = self.blockquote + fragment
        elif self.inul and self.initem:
            self.item = self.item + fragment
        elif self.inparagraph:
            self.currentparagraph = self.currentparagraph + fragment
        else:
            self.text = self.text + fragment

    def handle_starttag(self, tag, attrs):
        tag = tag.lower()
        if tag == "h1":
            # emit text that was waiting in the paragraph buffer so it is
            # neither lost nor reordered behind the heading
            self._flush_paragraph()
            self.inheadingone = True
            self.inparagraph = False
        elif tag == "h2":
            self._flush_paragraph()
            self.inheadingtwo = True
            self.inparagraph = False
        elif tag in ["h3", "h4", "h5", "h6"]:
            self._flush_paragraph()
            self.inotherheading = True
            self.inparagraph = False
        elif tag == "a":
            self.inlink = True
        elif tag == "br":
            self.handle_br()
        elif tag == "blockquote":
            self.inblockquote = True
            self.text = self.text + u'\n'
        elif tag == "p":
            # finish an unclosed previous paragraph *before* adding the
            # separator, otherwise it is glued onto the new paragraph
            self._flush_paragraph()
            if self.text != u'':
                self.text = self.text + u'\n\n'
            self.inparagraph = True
        elif tag == "pre":
            self._flush_paragraph()
            self.text = self.text + u'\n'
            self.inpre = True
            self.inparagraph = False
            self.inblockquote = False
        elif tag == "ul":
            self._flush_paragraph()
            self.item = u''
            self.inul = True
            self.text = self.text + u'\n'
        elif tag == "li" and self.inul:
            if self.initem:
                # the previous <li> was never closed; emit it now
                self._flush_item()
            self.initem = True
            self.item = u''

    def handle_startendtag(self, tag, attrs):
        # only <br/> matters among the self closing tags
        if tag.lower() == "br":
            self.handle_br()

    def handle_br(self):
        """Handle a line break by flushing whichever buffer is active."""
        if self.inparagraph:
            self._flush_paragraph()
            self.text = self.text + u'\n'
        elif self.inblockquote:
            self._flush_blockquote()
        else:
            self.text = self.text + u'\n'

    def handle_endtag(self, tag):
        tag = tag.lower()
        if tag == "h1":
            self.inheadingone = False
            self._end_heading(u'=')
        elif tag == "h2":
            self.inheadingtwo = False
            self._end_heading(u'-')
        elif tag in ["h3", "h4", "h5", "h6"]:
            self.inotherheading = False
            self._end_heading(u'~')
        elif tag == "p":
            self._flush_paragraph()
            self.inparagraph = False
        elif tag == "blockquote":
            self._flush_blockquote()
            self.inblockquote = False
        elif tag == "pre":
            self.inpre = False
        elif tag == "li":
            self.initem = False
            self._flush_item()
        elif tag == "ul":
            # an <li> left open at the end of the list still has its text
            self._flush_item()
            self.initem = False
            self.inul = False

    def handle_data(self, data):
        data = self._to_text(data)
        if self._inheading():
            self.headingtext = self.headingtext + self._spaced(data)
        elif self.inblockquote:
            self.blockquote = self.blockquote + self._spaced(data)
        elif self.inul and self.initem:
            # checked before inparagraph: that flag starts out True and
            # would otherwise swallow the text of every list item
            self.item = self.item + data
        elif self.inparagraph:
            self.currentparagraph = self.currentparagraph \
                + self._spaced(data)
        elif self.inpre:
            self.text = self.text + data
        else:
            self.text = self.text + self._spaced(data)

    def handle_entityref(self, name):
        """Resolve a named entity; unknown names are kept as "&name;"."""
        if name[:1] == "#":
            # the parser reports numeric references through
            # handle_charref; accept them here too for direct callers
            self.handle_charref(name[1:])
            return
        key = name.lower()
        if key in HTML2Text.entities:
            entity = self._to_text(HTML2Text.entities[key])
        else:
            entity = u'&' + self._to_text(name) + u';'
        self._append_inline(entity)

    def handle_charref(self, name):
        """Resolve a numeric reference such as "233" or "xe9".

        References that cannot be converted to a character are kept
        literally as "&#...;".
        """
        try:
            if name[:1] in ("x", "X"):
                codepoint = int(name[1:], 16)
            else:
                codepoint = int(name)
            entity = self._unichr(codepoint)
        except (ValueError, OverflowError):
            entity = u'&#' + self._to_text(name) + u';'
        self._append_inline(entity)

    def gettext(self):
        """Return the converted text, always terminated by a newline.

        A paragraph that is still open is included; the parser state is
        left untouched so more markup can be fed afterwards.
        """
        data = self.text
        if self.inparagraph:
            data = data \
                + u'\n'.join(textwrap.wrap(self.currentparagraph, 70))
        if not data.endswith(u'\n'):
            data = data + u'\n'
        return data
255
def open_url(method, url):
    """Issue an HTTP(S) request for url, following redirects.

    method is the HTTP verb ("GET" or "HEAD" in this file).  At most three
    requests are made in total; redirects (301, 302, 303, 307) and failed
    attempts both count towards that limit.

    Returns the httplib response object of the first 200 reply, or None
    when no attempt succeeded or the URL has no host part.  The connection
    behind a returned response is left open so the caller can read the
    body; every other connection is closed here.
    """
    # local import: only needed to resolve relative Location headers
    import urlparse

    redirectcount = 0
    while redirectcount < 3:
        redirectcount = redirectcount + 1

        (scheme, rest) = urllib.splittype(url)
        (host, path) = urllib.splithost(rest)
        if not host:
            # not an absolute URL - nothing we could connect to
            return None
        (host, port) = urllib.splitport(host)

        if scheme == "https":
            connection_class = httplib.HTTPSConnection
        else:
            connection_class = httplib.HTTPConnection

        conn = None
        try:
            if port is None:
                # let the connection class pick its default (80 or 443)
                conn = connection_class(host)
            else:
                conn = connection_class(host, int(port))
            # "http://host" has an empty path; the request line needs "/"
            conn.request(method, path or "/")
            response = conn.getresponse()
        except (httplib.HTTPException, socket.error, ValueError):
            # best effort: count the failed attempt and try again
            if conn is not None:
                conn.close()
            continue

        if response.status == 200:
            return response

        if response.status in [301, 302, 303, 307]:
            location = response.getheader("location")
            if location:
                # Location may be relative to the URL just requested
                url = urlparse.urljoin(url, location)
        conn.close()
    return None
279
def _random_token(length):
    """Return a string of `length` random ASCII letters and digits."""
    return "".join( \
        [random.choice(string.ascii_letters + string.digits) \
            for a in range(0, length)])


def _seen_key(url, identifier):
    """Build the "seen" database key for one item of the feed at url.

    The key is "<url>|<identifier>"; unicode keys are stored as UTF-8 so
    non-ASCII guids and links can be used as dbm keys.
    """
    key = url + "|" + identifier
    if isinstance(key, unicode):
        key = key.encode("utf-8")
    return key


def _feed_has_changed(url, stored):
    """Return True when the feed at url has to be fetched again.

    stored is the parsed (cgi.parse_qs) record of the validator headers
    saved after the previous fetch.  A HEAD request is made and the
    content-length, etag, last-modified and content-md5 headers of the
    reply are compared with it.  A failed request, or a header that has
    no stored counterpart, counts as changed.
    """
    response = open_url("HEAD", url)
    if response is None:
        return True
    for (name, value) in response.getheaders():
        if name in ["content-length", "etag", "last-modified", \
                "content-md5"]:
            previous = stored.get(name)
            if previous is None or previous[0] != value:
                return True
    return False


def _deliver_item(maildir, url, item, db):
    """Write one feed item into maildir unless it was delivered before.

    db is the open "seen" database.  An item is skipped when the record
    stored under its guid or its link carries the md5 of the current
    content.  Otherwise a multipart/alternative (plain text + HTML)
    message is written to maildir/tmp, moved to maildir/new, and the
    database records are updated.
    """
    guid = None
    if item.has_key("guid"):
        guid = item["guid"]
    if item.has_key("link"):
        link = item["link"]
    else:
        # no link: fall back to the guid so the item can still be tracked
        link = guid
    if link is None:
        sys.stderr.write( \
            "Skipping item without link or guid in feed: %s\n" %(url))
        return

    # need to work out what the content is first...
    if item.has_key("content"):
        content = item["content"][0]["value"]
    elif item.has_key("summary"):
        content = item["summary"]
    else:
        content = u''

    md5sum = md5.md5(content.encode("utf-8")).hexdigest()

    prevmessageid = None

    # check if there's a guid too - if that exists and we match the md5,
    # we have already delivered this version of the item
    if guid is not None:
        guidkey = _seen_key(url, guid)
        if db.has_key(guidkey):
            data = cgi.parse_qs(db[guidkey])
            if data.get("contentmd5", [None])[0] == md5sum:
                return

    linkkey = _seen_key(url, link)
    if db.has_key(linkkey):
        data = cgi.parse_qs(db[linkkey])
        if data.has_key("message-id"):
            prevmessageid = data["message-id"][0]
        if data.get("contentmd5", [None])[0] == md5sum:
            return

    if item.has_key("author"):
        author = item["author"]
    else:
        author = url

    if item.has_key("title"):
        title = item["title"]
    else:
        title = link

    # create a basic email message
    msg = MIMEMultipart("alternative")
    messageid = "<" \
        + datetime.datetime.now().strftime("%Y%m%d%H%M") \
        + "." \
        + _random_token(6) \
        + "@" + socket.gethostname() + ">"
    msg.add_header("Message-ID", messageid)
    msg.set_unixfrom("\"%s\" <rss2maildir@localhost>" %(url))
    msg.add_header("From", "\"%s\" <rss2maildir@localhost>" %(author))
    msg.add_header("To", "\"%s\" <rss2maildir@localhost>" %(url))
    if prevmessageid:
        # thread updated versions of an item under the earlier deliveries
        msg.add_header("References", prevmessageid)
    # the date is labelled -0000, so the fallback has to be UTC as well
    createddate = datetime.datetime.utcnow() \
        .strftime("%a, %e %b %Y %T -0000")
    try:
        createddate = datetime.datetime(*item["updated_parsed"][0:6]) \
            .strftime("%a, %e %b %Y %T -0000")
    except (KeyError, TypeError, ValueError):
        # no usable update time on the item - keep the current time
        pass
    msg.add_header("Date", createddate)
    msg.add_header("Subject", title)
    msg.set_default_type("text/plain")

    htmlcontent = "%s\n\n<p>Item URL: <a href='%s'>%s</a></p>" %( \
        content, \
        link, \
        link )
    htmlpart = MIMEText(htmlcontent.encode("utf-8"), "html", "utf-8")
    textparser = HTML2Text()
    textparser.feed(content.encode("utf-8"))
    textcontent = textparser.gettext()
    textcontent = "%s\n\nItem URL: %s" %( \
        textcontent, \
        link )
    textpart = MIMEText(textcontent.encode("utf-8"), "plain", "utf-8")
    msg.attach(textpart)
    msg.attach(htmlpart)

    # start by working out the filename we should be writing to, we do
    # this following the normal maildir style rules
    fname = str(os.getpid()) \
        + "." + socket.gethostname() \
        + "." + _random_token(10) \
        + "." \
        + datetime.datetime.now().strftime('%s')
    fn = os.path.join(maildir, "tmp", fname)
    fh = open(fn, "w")
    try:
        fh.write(msg.as_string())
    finally:
        fh.close()
    # now move it in to the new directory
    newfn = os.path.join(maildir, "new", fname)
    os.link(fn, newfn)
    os.unlink(fn)

    # now add to the database about the item
    if prevmessageid:
        messageid = prevmessageid + " " + messageid
    data = urllib.urlencode(( \
        ("message-id", messageid), \
        ("created", createddate), \
        ("contentmd5", md5sum) \
        ))
    if guid is not None and guid != link:
        db[_seen_key(url, guid)] = data
        # the link record keeps its own created/contentmd5 values when it
        # already exists; only its message-id chain is brought up to date
        if db.has_key(linkkey):
            try:
                old = cgi.parse_qs(db[linkkey])
                data = urllib.urlencode(( \
                    ("message-id", messageid), \
                    ("created", old["created"][0]), \
                    ("contentmd5", old["contentmd5"][0]) \
                    ))
            except KeyError:
                # incomplete old record - replace it with the new data
                pass
        db[linkkey] = data
    else:
        db[linkkey] = data


def parse_and_deliver(maildir, url, statedir):
    """Fetch the feed at url and deliver its new items into maildir.

    maildir is the path of an existing Maildir (with tmp and new
    subdirectories), url the address of the feed and statedir the
    directory holding the "feeds" and "seen" dbm databases.

    A feed that was fetched before is first probed with a HEAD request and
    left alone when its validator headers are unchanged.  After delivery
    the validator headers of the GET reply are remembered for next time.
    Both databases are closed on every way out of the function.
    """
    feeddb = dbm.open(os.path.join(statedir, "feeds"), "c")
    try:
        # first check if we know about this feed already
        if feeddb.has_key(url):
            stored = cgi.parse_qs(feeddb[url])
            if not _feed_has_changed(url, stored):
                return # don't need to do anything, nothing's changed.

        response = open_url("GET", url)
        if response is None:
            sys.stderr.write("Failed to fetch feed: %s\n" %(url))
            return
        headers = response.getheaders()

        fp = feedparser.parse(response)
        db = dbm.open(os.path.join(statedir, "seen"), "c")
        try:
            for item in fp["items"]:
                _deliver_item(maildir, url, item, db)
        finally:
            db.close()

        if headers:
            data = []
            for header in headers:
                if header[0] in ["content-md5", "etag", "last-modified", \
                        "content-length"]:
                    data.append((header[0], header[1]))
            if len(data) > 0:
                feeddb[url] = urllib.urlencode(data)
    finally:
        feeddb.close()
467
def _ensure_directory(path, notdir_message, create_message):
    """Make sure path is a directory, creating it when it is missing.

    notdir_message and create_message are format strings taking the path;
    the matching one is written to stderr before exiting with status 1
    when path exists but is not a directory, or cannot be created.
    """
    if os.path.isdir(path):
        return
    if os.path.exists(path):
        sys.stderr.write(notdir_message %(path,))
        sys.exit(1)
    try:
        os.mkdir(path)
    except OSError:
        sys.stderr.write(create_message %(path,))
        sys.exit(1)


def _prepare_maildir(maildir, section):
    """Make sure maildir exists and has new, cur and tmp subdirectories.

    Returns True when the maildir is usable.  Returns False, after
    reporting "Broken maildir", when maildir or one of the subdirectories
    exists but is not a directory.  Exits with status 1 when a directory
    that is missing cannot be created.
    """
    if not os.path.exists(maildir):
        try:
            os.mkdir(maildir)
        except OSError:
            sys.stderr.write("Couldn't create root maildir %s\n" \
                %(maildir))
            sys.exit(1)
    elif not os.path.isdir(maildir):
        sys.stderr.write("Broken maildir: %s\n" %(maildir))
        return False

    for name in ["new", "cur", "tmp"]:
        subdir = os.path.join(maildir, name)
        if os.path.isdir(subdir):
            continue
        if os.path.exists(subdir):
            sys.stderr.write("Broken maildir: %s\n" %(maildir))
            return False
        try:
            os.mkdir(subdir)
        except OSError:
            sys.stderr.write( \
                "Couldn't create required maildir directories for %s\n" \
                %(section,))
            sys.exit(1)
    return True


if __name__ == "__main__":
    # This only gets executed if we really called the program
    # first off, parse the command line arguments

    oparser = OptionParser()
    oparser.add_option(
        "-c", "--conf", dest="conf",
        help="location of config file"
        )
    oparser.add_option(
        "-s", "--statedir", dest="statedir",
        help="location of directory to store state in"
        )

    (options, args) = oparser.parse_args()

    # check for the configfile

    configfile = None

    if options.conf is not None:
        # does the file exist?
        if not os.path.exists(options.conf):
            # exit here as the specified file doesn't exist
            sys.stderr.write( \
                "Config file %s does not exist. Exiting.\n" %(options.conf,))
            sys.exit(2)
        configfile = options.conf
    else:
        # check through the default locations
        candidates = []
        if "HOME" in os.environ:
            candidates.append( \
                "%s/.rss2maildir.conf" %(os.environ["HOME"],))
        candidates.append("/etc/rss2maildir.conf")
        for candidate in candidates:
            if os.path.exists(candidate):
                configfile = candidate
                break
        if configfile is None:
            sys.stderr.write("No config file found. Exiting.\n")
            sys.exit(2)

    # Right - if we've got this far, we've got a config file, now for the hard
    # bits...

    scp = SafeConfigParser()
    scp.read(configfile)

    maildir_root = "RSSMaildir"
    state_dir = "state"

    # the command line wins over the config file, which wins over the
    # built in default; whichever directory is chosen is then checked
    if options.statedir is not None:
        state_dir = options.statedir
        notdir_message = "State directory (%s) is not a directory\n"
        create_message = "Couldn't create statedir %s\n"
    elif scp.has_option("general", "state_dir"):
        state_dir = scp.get("general", "state_dir")
        notdir_message = "State directory (%s) is not a directory\n"
        create_message = "Couldn't create state directory %s\n"
    else:
        notdir_message = "State directory %s is not a directory\n"
        create_message = "State directory %s could not be created\n"
    _ensure_directory(state_dir, notdir_message, create_message)

    if scp.has_option("general", "maildir_root"):
        maildir_root = scp.get("general", "maildir_root")

    _ensure_directory( \
        maildir_root, \
        "Maildir Root %s is not a directory\n", \
        "Couldn't create Maildir Root %s\n")

    # every section except [general] describes one feed; the section name
    # is the feed url
    feeds = scp.sections()
    try:
        feeds.remove("general")
    except ValueError:
        pass

    for section in feeds:
        # the maildir defaults to the section name when the section has no
        # usable "maildir" option
        try:
            maildir = scp.get(section, "maildir")
        except Exception:
            maildir = section

        # quote the name so it is safe to use as a single path component
        maildir = urllib.quote_plus(maildir)
        maildir = os.path.join(maildir_root, maildir)

        if not _prepare_maildir(maildir, section):
            # already reported; delivering into it could only fail
            continue

        # right - we've got the directories, we've got the section, we know the
        # url... lets play!

        parse_and_deliver(maildir, section, state_dir)