Update li handling a bit, and make the expected test results be what we'd
[rss2maildir.git] / rss2maildir.py
1 #!/usr/bin/python
2 # coding=utf-8
3
4 # rss2maildir.py - RSS feeds to Maildir 1 email per item
5 # Copyright (C) 2007  Brett Parker <iDunno@sommitrealweird.co.uk>
6
7 # This program is free software: you can redistribute it and/or modify
8 # it under the terms of the GNU General Public License as published by
9 # the Free Software Foundation, either version 3 of the License, or
10 # (at your option) any later version.
11
12 # This program is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 # GNU General Public License for more details.
16
17 # You should have received a copy of the GNU General Public License
18 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
19
20 import sys
21 import os
22 import stat
23 import httplib
24 import urllib
25
26 import feedparser
27
28 from email.MIMEMultipart import MIMEMultipart
29 from email.MIMEText import MIMEText
30
31 import datetime
32 import random
33 import string
34 import textwrap
35
36 import socket
37
38 from optparse import OptionParser
39 from ConfigParser import SafeConfigParser
40
41 from base64 import b64encode
42 import md5
43
44 import cgi
45 import dbm
46
47 from HTMLParser import HTMLParser
48
class HTML2Text(HTMLParser):
    """Convert an HTML fragment into wrapped plain text.

    Feed markup with feed() and read the result with gettext().

    * h1 headings are underlined with ``=``, h2 with ``-`` and h3-h6
      with ``~``.
    * Paragraph text is wrapped at 70 columns.
    * Blockquote text is prefixed with ``> `` and wrapped at 68 columns.
    * ``<li>`` items are emitted as `` * `` bullets wrapped at 67 columns.
    * Text inside ``<pre>`` is copied through unchanged.

    Input may be UTF-8 encoded byte strings or unicode strings; every
    internal buffer is kept as unicode so that the pieces can always be
    concatenated safely.
    """

    # Named entities that are replaced by a literal character; any other
    # named entity is passed through as "&name;".
    entities = {
        "amp": "&",
        "lt": "<",
        "gt": ">",
        "pound": "£",
        "copy": "©",
        "apos": "'",
        "quot": "\"",
        "nbsp": " ",
        }

    # Underline character for each heading tag.
    _heading_underlines = {
        "h1": u'=',
        "h2": u'-',
        "h3": u'~',
        "h4": u'~',
        "h5": u'~',
        "h6": u'~',
        }

    def __init__(self):
        self.inheadingone = False
        self.inheadingtwo = False
        self.inotherheading = False
        # Text that appears before any block level tag is treated as a
        # paragraph, hence the parser starts "in a paragraph".
        self.inparagraph = True
        self.inblockquote = False
        self.inlink = False
        self.text = u''
        self.currentparagraph = u''
        self.headingtext = u''
        self.blockquote = u''
        self.inpre = False
        self.inul = False
        self.initem = False
        self.item = u''
        HTMLParser.__init__(self)

    def _to_unicode(self, data):
        """Return data as unicode, decoding byte strings as UTF-8."""
        if isinstance(data, type(u'')):
            return data
        return data.decode("utf-8")

    def _append_inline(self, fragment):
        """Append fragment verbatim to the buffer of the active block.

        The priority order is the same one handle_data() uses, so that
        entities end up next to the text that surrounds them.
        """
        if self.inheadingone or self.inheadingtwo or self.inotherheading:
            self.headingtext = self.headingtext + fragment
        elif self.inblockquote:
            self.blockquote = self.blockquote + fragment
        elif self.initem:
            self.item = self.item + fragment
        elif self.inparagraph:
            self.currentparagraph = self.currentparagraph + fragment
        else:
            self.text = self.text + fragment

    def _emit_item(self):
        """Write the collected list item to the output as a bullet."""
        lines = [line.strip() for line in textwrap.wrap(self.item, 67)]
        self.text = self.text + u' * ' + u'\n   '.join(lines) + u'\n'

    def handle_starttag(self, tag, attrs):
        tag = tag.lower()
        if tag == "h1":
            self.inheadingone = True
            self.inparagraph = False
        elif tag == "h2":
            self.inheadingtwo = True
            self.inparagraph = False
        elif tag in ("h3", "h4", "h5", "h6"):
            self.inotherheading = True
            self.inparagraph = False
        elif tag == "a":
            self.inlink = True
        elif tag == "br":
            self.handle_br()
        elif tag == "blockquote":
            self.inblockquote = True
            self.text = self.text + u'\n'
        elif tag == "p":
            if self.text != u'':
                self.text = self.text + u'\n\n'
            if self.inparagraph:
                # flush whatever was collected for the previous paragraph
                self.text = self.text \
                    + u'\n'.join(textwrap.wrap(self.currentparagraph, 70))
            self.currentparagraph = u''
            self.inparagraph = True
        elif tag == "pre":
            self.text = self.text + u'\n'
            self.inpre = True
            self.inparagraph = False
            self.inblockquote = False
        elif tag == "ul":
            self.item = u''
            self.inul = True
            self.text = self.text + u'\n'
        elif tag == "li":
            if self.initem:
                # the previous <li> was never closed: emit it first
                self._emit_item()
            self.initem = True
            self.item = u''

    def handle_startendtag(self, tag, attrs):
        # only <br/> is meaningful among the self-closing tags
        if tag.lower() == "br":
            self.handle_br()

    def handle_br(self):
        """Handle a line break by flushing the block collected so far."""
        if self.inparagraph:
            self.text = self.text \
                + u'\n'.join(textwrap.wrap(self.currentparagraph, 70)) \
                + u'\n'
            self.currentparagraph = u''
        elif self.inblockquote:
            self.text = self.text \
                + u'\n> ' \
                + u'\n> '.join(textwrap.wrap(self.blockquote, 68)) \
                + u'\n'
            self.blockquote = u''
        else:
            self.text = self.text + u'\n'

    def handle_endtag(self, tag):
        tag = tag.lower()
        if tag in self._heading_underlines:
            if tag == "h1":
                self.inheadingone = False
            elif tag == "h2":
                self.inheadingtwo = False
            else:
                self.inotherheading = False
            # The underline is as long as the visible heading, counted in
            # characters (not encoded bytes).
            underline = self._heading_underlines[tag] \
                * len(self.headingtext.strip())
            self.text = self.text \
                + u'\n\n' \
                + self.headingtext \
                + u'\n' \
                + underline
            self.headingtext = u''
        elif tag == "a":
            self.inlink = False
        elif tag == "p":
            self.text = self.text \
                + u'\n'.join(textwrap.wrap(self.currentparagraph, 70))
            self.inparagraph = False
            self.currentparagraph = u''
        elif tag == "blockquote":
            lines = [line.strip() \
                for line in textwrap.wrap(self.blockquote, 68)]
            self.text = self.text \
                + u'\n> ' \
                + u'\n> '.join(lines) \
                + u'\n'
            self.inblockquote = False
            self.blockquote = u''
        elif tag == "pre":
            self.inpre = False
        elif tag == "li":
            self.initem = False
            if self.item != u'':
                self._emit_item()
            self.item = u''
        elif tag == "ul":
            self.inul = False

    def handle_data(self, data):
        data = self._to_unicode(data)
        if self.inheadingone or self.inheadingtwo or self.inotherheading:
            self.headingtext = self.headingtext + data.strip() + u' '
        elif self.inblockquote:
            self.blockquote = self.blockquote + data.strip() + u' '
        elif self.initem:
            self.item = self.item + data
        elif self.inparagraph:
            self.currentparagraph = self.currentparagraph \
                + data.strip() \
                + u' '
        elif self.inpre:
            self.text = self.text + data
        else:
            # Loose text outside of any block is only kept at the start of
            # a line; an empty output counts as the start of a line.
            stray = data.strip()
            if stray != u'' \
                and (self.text == u'' or self.text.endswith(u'\n')):
                self.text = self.text + stray + u' '

    def handle_entityref(self, name):
        """Replace a named entity, or pass it through if it is unknown."""
        key = name.lower()
        if key in HTML2Text.entities:
            replacement = HTML2Text.entities[key]
        else:
            replacement = "&" + name + ";"
        self._append_inline(self._to_unicode(replacement))

    def handle_charref(self, name):
        """Replace a numeric reference ("8217" or "x2019") by its character.

        References that cannot be converted are passed through literally.
        """
        try:
            to_char = unichr
        except NameError:
            # interpreters without unichr() return text from chr()
            to_char = chr
        try:
            if name[:1] in ("x", "X"):
                character = to_char(int(name[1:], 16))
            else:
                character = to_char(int(name))
        except (ValueError, OverflowError):
            character = u'&#' + self._to_unicode(name) + u';'
        self._append_inline(character)

    def gettext(self):
        """Return the converted text, always ending in a newline."""
        data = self.text
        if self.inparagraph:
            data = data \
                + u'\n'.join(textwrap.wrap(self.currentparagraph, 70))
        if not data.endswith(u'\n'):
            data = data + u'\n'
        return data
258
def open_url(method, url, max_attempts=3):
    """Request url with the given HTTP method, following redirects.

    At most max_attempts requests are made in total; a redirect, a
    connection error and an unexpected status each use up one attempt.

    Arguments:
        method -- HTTP method name, e.g. "GET" or "HEAD"
        url -- absolute URL to request
        max_attempts -- upper bound on the number of requests made

    Returns the response object of the first request answered with status
    200, or None if no attempt succeeded or the URL has no host part.  The
    caller is responsible for reading the returned response.
    """
    # local import: only needed here, to resolve redirect targets
    import urlparse

    redirect_statuses = (301, 302, 303, 307)
    attempts = 0
    while attempts < max_attempts:
        attempts = attempts + 1
        (scheme, rest) = urllib.splittype(url)
        (host, path) = urllib.splithost(rest)
        if not host:
            # not an absolute URL - there is nothing we could connect to
            return None
        (host, port) = urllib.splitport(host)
        if scheme == "https" and hasattr(httplib, "HTTPSConnection"):
            connection_class = httplib.HTTPSConnection
            default_port = 443
        else:
            connection_class = httplib.HTTPConnection
            default_port = 80
        if not port:
            port = default_port
        if not path:
            # "http://host" has an empty path, but a request needs one
            path = "/"

        conn = None
        try:
            conn = connection_class(host, int(port))
            conn.request(method, path)
            response = conn.getresponse()
        except (socket.error, httplib.HTTPException):
            # best effort: give up on this attempt and try again
            if conn is not None:
                conn.close()
            continue

        if response.status == 200:
            # the connection stays open so the caller can read the body
            return response
        if response.status in redirect_statuses:
            location = response.getheader("location")
            if location:
                # Location may be relative to the URL just requested
                url = urlparse.urljoin(url, location)
        conn.close()
    return None
282
# Response headers that are stored per feed and compared against a HEAD
# request on the next run to decide whether the feed has to be fetched.
_TRACKED_HEADERS = ("content-md5", "etag", "last-modified", "content-length")


def _random_token(length):
    """Return length random characters drawn from ASCII letters and digits."""
    alphabet = string.ascii_letters + string.digits
    return "".join([random.choice(alphabet) for unused in range(length)])


def _state_key(url, identifier):
    """Build the key under which an item of the feed url is remembered."""
    key = url + "|" + identifier
    if isinstance(key, unicode):
        # dbm keys are byte strings; for ASCII keys the result is the same
        # as the implicit conversion, so existing state remains valid
        key = key.encode("utf-8")
    return key


def _feed_has_changed(url, stored):
    """Compare the headers of a HEAD request with the stored ones.

    stored is the parsed (cgi.parse_qs) record of the previous fetch.
    Returns True if the HEAD request failed, or if any tracked header the
    server sent is missing from stored or has a different value.
    """
    response = open_url("HEAD", url)
    if not response:
        return True
    headers = response.getheaders()
    response.close()
    for (name, value) in headers:
        if name in _TRACKED_HEADERS:
            previous = stored.get(name)
            if previous is None or previous[0] != value:
                return True
    return False


def _deliver_item(maildir, url, db, item):
    """Write one feed item into maildir unless it was delivered before.

    db is the open "seen" database; it is consulted to skip items whose
    content is unchanged and updated after a message has been written.
    """
    # need to work out what the content is first...
    if item.has_key("content"):
        content = item["content"][0]["value"]
    else:
        content = item["summary"]

    md5sum = md5.md5(content.encode("utf-8")).hexdigest()

    # check if there's a guid too - if that exists and we match the md5,
    # there is nothing to do
    if item.has_key("guid"):
        guid_key = _state_key(url, item["guid"])
        if db.has_key(guid_key):
            seen = cgi.parse_qs(db[guid_key])
            if seen["contentmd5"][0] == md5sum:
                return

    prevmessageid = None
    link_key = _state_key(url, item["link"])
    if db.has_key(link_key):
        seen = cgi.parse_qs(db[link_key])
        if seen.has_key("message-id"):
            prevmessageid = seen["message-id"][0]
        if seen["contentmd5"][0] == md5sum:
            return

    try:
        author = item["author"]
    except KeyError:
        author = url

    # create a basic email message
    msg = MIMEMultipart("alternative")
    messageid = "<" \
        + datetime.datetime.now().strftime("%Y%m%d%H%M") \
        + "." \
        + _random_token(6) \
        + "@" + socket.gethostname() + ">"
    msg.add_header("Message-ID", messageid)
    msg.set_unixfrom("\"%s\" <rss2maildir@localhost>" %(url))
    msg.add_header("From", "\"%s\" <rss2maildir@localhost>" %(author))
    msg.add_header("To", "\"%s\" <rss2maildir@localhost>" %(url))
    if prevmessageid:
        # an updated item is threaded below the message sent earlier
        msg.add_header("References", prevmessageid)
    try:
        created = datetime.datetime(*item["updated_parsed"][0:6])
    except (KeyError, TypeError, ValueError):
        # No usable date in the feed.  The header is labelled -0000, so
        # the fallback has to be UTC rather than local time.
        created = datetime.datetime.utcnow()
    createddate = created.strftime("%a, %e %b %Y %T -0000")
    msg.add_header("Date", createddate)
    msg.add_header("Subject", item["title"])
    msg.set_default_type("text/plain")

    htmlcontent = "%s\n\n<p>Item URL: <a href='%s'>%s</a></p>" %( \
        content, \
        item["link"], \
        item["link"] )
    htmlpart = MIMEText(htmlcontent.encode("utf-8"), "html", "utf-8")
    textparser = HTML2Text()
    textparser.feed(content.encode("utf-8"))
    textcontent = textparser.gettext()
    textcontent = "%s\n\nItem URL: %s" %( \
        textcontent, \
        item["link"] )
    textpart = MIMEText(textcontent.encode("utf-8"), "plain", "utf-8")
    msg.attach(textpart)
    msg.attach(htmlpart)

    # start by working out the filename we should be writing to, we do
    # this following the normal maildir style rules
    fname = str(os.getpid()) \
        + "." + socket.gethostname() \
        + "." + _random_token(10) \
        + "." + datetime.datetime.now().strftime('%s')
    fn = os.path.join(maildir, "tmp", fname)
    fh = open(fn, "w")
    try:
        fh.write(msg.as_string())
    finally:
        fh.close()
    # now move it in to the new directory
    newfn = os.path.join(maildir, "new", fname)
    os.link(fn, newfn)
    os.unlink(fn)

    # now add to the database about the item
    if prevmessageid:
        messageid = prevmessageid + " " + messageid
    record = urllib.urlencode(( \
        ("message-id", messageid), \
        ("created", createddate), \
        ("contentmd5", md5sum) \
        ))
    if item.has_key("guid") and item["guid"] != item["link"]:
        db[_state_key(url, item["guid"])] = record
        # The record stored under the link keeps its own creation date and
        # checksum if it has them and only gets the new message id.
        link_record = record
        if db.has_key(link_key):
            previous = cgi.parse_qs(db[link_key])
            if previous.has_key("created") \
                and previous.has_key("contentmd5"):
                link_record = urllib.urlencode(( \
                    ("message-id", messageid), \
                    ("created", previous["created"][0]), \
                    ("contentmd5", previous["contentmd5"][0]) \
                    ))
        db[link_key] = link_record
    else:
        db[link_key] = record


def parse_and_deliver(maildir, url, statedir):
    """Fetch the feed at url and deliver its new items into maildir.

    Arguments:
        maildir -- maildir directory; its tmp and new subdirectories are
            written to
        url -- URL of the feed
        statedir -- directory holding the "feeds" and "seen" databases

    A feed that is already known is only fetched again when a HEAD request
    indicates a change.  If the feed cannot be fetched a message is written
    to stderr and nothing is delivered.  Both databases are closed again
    in every case, including early returns and errors.
    """
    feeddb = dbm.open(os.path.join(statedir, "feeds"), "c")
    try:
        # first check if we know about this feed already
        if feeddb.has_key(url):
            stored = cgi.parse_qs(feeddb[url])
            if not _feed_has_changed(url, stored):
                return # don't need to do anything, nothing has changed.

        response = open_url("GET", url)
        if response is None:
            sys.stderr.write("Failed to fetch feed: %s\n" %(url))
            return
        headers = response.getheaders()

        fp = feedparser.parse(response)
        db = dbm.open(os.path.join(statedir, "seen"), "c")
        try:
            for item in fp["items"]:
                _deliver_item(maildir, url, db, item)
        finally:
            db.close()

        # remember the headers of this fetch for the next comparison
        if headers:
            data = [(header[0], header[1]) for header in headers \
                if header[0] in _TRACKED_HEADERS]
            if len(data) > 0:
                feeddb[url] = urllib.urlencode(data)
    finally:
        feeddb.close()
470
def _ensure_directory(path, not_directory_message, create_failed_message):
    """Make sure path is a directory, creating it if it does not exist.

    Writes the matching message to stderr and exits with status 1 if path
    exists but is not a directory, or if it cannot be created.
    """
    try:
        mode = os.stat(path)[stat.ST_MODE]
    except OSError:
        # try to make the directory
        try:
            os.mkdir(path)
        except OSError:
            sys.stderr.write(create_failed_message)
            sys.exit(1)
        return
    if not stat.S_ISDIR(mode):
        sys.stderr.write(not_directory_message)
        sys.exit(1)


def _prepare_maildir(maildir, section):
    """Create maildir and its new, cur and tmp directories where missing.

    Returns True if the maildir can be used.  Returns False, after writing
    "Broken maildir" to stderr, if maildir or one of the three
    subdirectories exists but is not a directory.  Exits with status 1 if
    a missing directory cannot be created.
    """
    try:
        mode = os.stat(maildir)[stat.ST_MODE]
    except OSError:
        try:
            os.mkdir(maildir)
        except OSError:
            sys.stderr.write("Couldn't create root maildir %s\n" \
                %(maildir))
            sys.exit(1)
    else:
        if not stat.S_ISDIR(mode):
            sys.stderr.write("Broken maildir: %s\n" %(maildir))
            return False

    usable = True
    for name in ("new", "cur", "tmp"):
        subdir = os.path.join(maildir, name)
        try:
            mode = os.stat(subdir)[stat.ST_MODE]
        except OSError:
            try:
                os.mkdir(subdir)
            except OSError:
                sys.stderr.write( \
                    "Couldn't create required maildir directories for %s\n" \
                    %(section,))
                sys.exit(1)
        else:
            if not stat.S_ISDIR(mode):
                sys.stderr.write("Broken maildir: %s\n" %(maildir))
                usable = False
    return usable


if __name__ == "__main__":
    # This only gets executed if we really called the program
    # first off, parse the command line arguments

    # local import: only needed to recognise configuration errors below
    from ConfigParser import Error as ConfigError

    oparser = OptionParser()
    oparser.add_option(
        "-c", "--conf", dest="conf",
        help="location of config file"
        )
    oparser.add_option(
        "-s", "--statedir", dest="statedir",
        help="location of directory to store state in"
        )

    (options, args) = oparser.parse_args()

    # check for the configfile

    configfile = None

    if options.conf is not None:
        # does the file exist?
        if not os.path.exists(options.conf):
            # exit here as the specified file doesn't exist
            sys.stderr.write( \
                "Config file %s does not exist. Exiting.\n" %(options.conf,))
            sys.exit(2)
        configfile = options.conf
    else:
        # check through the default locations
        candidates = []
        home = os.environ.get("HOME")
        if home is not None:
            candidates.append("%s/.rss2maildir.conf" %(home,))
        candidates.append("/etc/rss2maildir.conf")
        for candidate in candidates:
            if os.path.exists(candidate):
                configfile = candidate
                break
        if configfile is None:
            sys.stderr.write("No config file found. Exiting.\n")
            sys.exit(2)

    # Right - if we've got this far, we've got a config file, now for the hard
    # bits...

    scp = SafeConfigParser()
    scp.read(configfile)

    maildir_root = "RSSMaildir"
    state_dir = "state"

    # the command line takes precedence over the config file, which takes
    # precedence over the built-in default
    if options.statedir is not None:
        state_dir = options.statedir
        _ensure_directory(state_dir, \
            "State directory (%s) is not a directory\n" %(state_dir), \
            "Couldn't create statedir %s\n" %(state_dir))
    elif scp.has_option("general", "state_dir"):
        state_dir = scp.get("general", "state_dir")
        _ensure_directory(state_dir, \
            "State directory (%s) is not a directory\n" %(state_dir), \
            "Couldn't create state directory %s\n" %(state_dir))
    else:
        _ensure_directory(state_dir, \
            "State directory %s is not a directory\n" %(state_dir), \
            "State directory %s could not be created\n" %(state_dir))

    if scp.has_option("general", "maildir_root"):
        maildir_root = scp.get("general", "maildir_root")

    _ensure_directory(maildir_root, \
        "Maildir Root %s is not a directory\n" %(maildir_root), \
        "Couldn't create Maildir Root %s\n" %(maildir_root))

    # every section apart from "general" describes one feed; the section
    # name is the URL of the feed
    feeds = [section for section in scp.sections() if section != "general"]

    for section in feeds:
        # work out which maildir the feed is delivered to
        try:
            maildir = scp.get(section, "maildir")
        except ConfigError:
            # no (readable) maildir option: name it after the feed
            maildir = section

        # quote the name so that it is a single, safe path component
        maildir = urllib.quote_plus(maildir)
        maildir = os.path.join(maildir_root, maildir)

        if not _prepare_maildir(maildir, section):
            # delivering into a broken maildir could only fail, so leave
            # this feed alone and carry on with the others
            continue

        # right - we've got the directories, we've got the section, we know the
        # url... lets play!

        parse_and_deliver(maildir, section, state_dir)