Reformat code ready for adding test suite
[rss2maildir.git] / rss2maildir.py
1 #!/usr/bin/python
2 # coding=utf-8
3
4 # rss2maildir.py - RSS feeds to Maildir 1 email per item
5 # Copyright (C) 2007  Brett Parker <iDunno@sommitrealweird.co.uk>
6
7 # This program is free software: you can redistribute it and/or modify
8 # it under the terms of the GNU General Public License as published by
9 # the Free Software Foundation, either version 3 of the License, or
10 # (at your option) any later version.
11
12 # This program is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 # GNU General Public License for more details.
16
17 # You should have received a copy of the GNU General Public License
18 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
19
20 import sys
21 import os
22 import stat
23 import httplib
24 import urllib
25
26 import feedparser
27
28 from email.MIMEMultipart import MIMEMultipart
29 from email.MIMEText import MIMEText
30
31 import datetime
32 import random
33 import string
34 import textwrap
35
36 import socket
37
38 from optparse import OptionParser
39 from ConfigParser import SafeConfigParser
40
41 from base64 import b64encode
42 import md5
43
44 import cgi
45 import dbm
46
47 from HTMLParser import HTMLParser
48
class HTML2Text(HTMLParser):
    """Render an HTML fragment as wrapped plain text.

    Feed markup with ``feed()`` and collect the result with ``gettext()``.

    * ``h1`` headings are underlined with ``=``, ``h2`` with ``-`` and
      ``h3``-``h6`` with ``~``.
    * Paragraph text is wrapped at 70 columns.
    * Blockquotes are prefixed with ``> `` and wrapped at 68 columns.
    * ``<ul>`` items become `` * `` bullets wrapped at 67 columns.
    * ``<pre>`` content is copied through unchanged.

    Everything is accumulated as unicode.  Byte strings passed to the
    handlers are decoded as UTF-8; unicode strings are used as they are.
    """

    # Named entities that are translated; anything else is kept literally.
    entities = {
        "amp": "&",
        "lt": "<",
        "gt": ">",
        "pound": "£",
        "copy": "©",
        "apos": "'",
        "quot": "\"",
        "nbsp": " ",
        }

    def __init__(self):
        self.inheadingone = False
        self.inheadingtwo = False
        self.inotherheading = False
        # Text seen before any block level tag is treated as a paragraph.
        self.inparagraph = True
        self.inblockquote = False
        self.inlink = False
        self.text = u''
        self.currentparagraph = u''
        self.headingtext = u''
        self.blockquote = u''
        self.inpre = False
        self.inul = False
        self.initem = False
        self.item = u''
        HTMLParser.__init__(self)

    # -- helpers -----------------------------------------------------------

    @staticmethod
    def _to_unicode(value):
        """Return value as unicode, decoding UTF-8 byte strings."""
        if isinstance(value, type(u'')):
            return value
        return value.decode("utf-8")

    @staticmethod
    def _char_from_codepoint(codepoint):
        """Return the unicode character for an integer code point."""
        try:
            return unichr(codepoint)
        except NameError:
            # Interpreters without unichr() have a unicode-aware chr().
            return chr(codepoint)

    def _append(self, chunk, pad):
        """Add chunk to whichever buffer is currently being filled.

        With pad set the chunk is stripped and followed by a single space
        (the treatment given to ordinary text); list items and <pre>
        content always receive the chunk unchanged.
        """
        if pad:
            padded = chunk.strip() + u' '
        else:
            padded = chunk

        if self.inheadingone or self.inheadingtwo or self.inotherheading:
            self.headingtext = self.headingtext + padded
        elif self.inblockquote:
            self.blockquote = self.blockquote + padded
        elif self.inparagraph:
            self.currentparagraph = self.currentparagraph + padded
        elif self.inul and self.initem:
            self.item = self.item + chunk
        elif self.inpre:
            self.text = self.text + chunk
        else:
            self.text = self.text + padded

    def _flush_paragraph(self):
        """Move the wrapped pending paragraph text onto the output."""
        self.text = self.text \
            + u'\n'.join(textwrap.wrap(self.currentparagraph, 70))
        self.currentparagraph = u''

    def _close_paragraph(self):
        """Flush and leave the current paragraph, if there is one.

        Called when another block level element starts so that pending
        paragraph text is neither lost nor emitted after that element.
        """
        if self.inparagraph:
            self._flush_paragraph()
        self.inparagraph = False

    def _flush_blockquote(self):
        """Emit the pending blockquote text as '> ' prefixed lines."""
        lines = [a.strip() for a in textwrap.wrap(self.blockquote, 68)]
        self.text = self.text + u'\n> ' + u'\n> '.join(lines) + u'\n'
        self.blockquote = u''

    def _flush_item(self):
        """Emit the pending list item as a ' * ' bullet."""
        lines = [a.strip() for a in textwrap.wrap(self.item, 67)]
        self.text = self.text + u' * ' + u'\n   '.join(lines) + u'\n'
        self.item = u''

    def _end_heading(self, underline):
        """Emit the pending heading underlined with the given character."""
        heading = self.headingtext.strip()
        # The length is measured in characters so that the underline
        # matches non-ASCII headings too.
        self.text = self.text \
            + u'\n\n' \
            + heading \
            + u'\n' \
            + underline * len(heading)
        self.headingtext = u''

    # -- HTMLParser callbacks ----------------------------------------------

    def handle_starttag(self, tag, attrs):
        tag = tag.lower()
        if tag == "h1":
            self._close_paragraph()
            self.inheadingone = True
        elif tag == "h2":
            self._close_paragraph()
            self.inheadingtwo = True
        elif tag in ["h3", "h4", "h5", "h6"]:
            self._close_paragraph()
            self.inotherheading = True
        elif tag == "a":
            self.inlink = True
        elif tag == "br":
            self.handle_br()
        elif tag == "blockquote":
            self._close_paragraph()
            self.inblockquote = True
            self.text = self.text + u'\n'
        elif tag == "p":
            # Flush an unclosed paragraph *before* adding the separator,
            # otherwise it runs straight into the paragraph starting here.
            if self.inparagraph:
                self._flush_paragraph()
            if self.text != "":
                self.text = self.text + u'\n\n'
            self.currentparagraph = u''
            self.inparagraph = True
        elif tag == "pre":
            self._close_paragraph()
            self.text = self.text + u'\n'
            self.inpre = True
            self.inblockquote = False
        elif tag == "ul":
            # Leave paragraph mode, otherwise item text would be collected
            # as paragraph text instead of as list items.
            self._close_paragraph()
            self.item = u''
            self.inul = True
            self.text = self.text + u'\n'
        elif tag == "li" and self.inul:
            if not self.initem:
                self.initem = True
                self.item = u''
            else:
                # A new <li> without a closing </li> ends the previous item.
                self._flush_item()

    def handle_startendtag(self, tag, attrs):
        if tag.lower() == "br":
            self.handle_br()

    def handle_br(self):
        """Break the line in whichever buffer ordinary text is going to."""
        if self.inblockquote:
            self._flush_blockquote()
        elif self.inparagraph:
            self._flush_paragraph()
            self.text = self.text + u'\n'
        else:
            self.text = self.text + u'\n'

    def handle_endtag(self, tag):
        tag = tag.lower()
        if tag == "h1":
            self.inheadingone = False
            self._end_heading(u'=')
        elif tag == "h2":
            self.inheadingtwo = False
            self._end_heading(u'-')
        elif tag in ["h3", "h4", "h5", "h6"]:
            self.inotherheading = False
            self._end_heading(u'~')
        elif tag == "p":
            self._flush_paragraph()
            self.inparagraph = False
        elif tag == "blockquote":
            self._flush_blockquote()
            self.inblockquote = False
        elif tag == "pre":
            self.inpre = False
        elif tag == "li":
            self.initem = False
            if self.item != "":
                self._flush_item()
            self.item = u''
        elif tag == "ul":
            self.inul = False

    def handle_data(self, data):
        self._append(self._to_unicode(data), True)

    def handle_charref(self, name):
        """Translate a numeric reference such as &#233; or &#xE9;."""
        try:
            if name[:1] in ("x", "X"):
                codepoint = int(name[1:], 16)
            else:
                codepoint = int(name)
            char = self._char_from_codepoint(codepoint)
        except (ValueError, OverflowError):
            # Not a usable code point: keep the reference as written.
            char = u'&#' + self._to_unicode(name) + u';'
        self._append(char, False)

    def handle_entityref(self, name):
        """Translate a named entity; unknown names are kept literally."""
        key = name.lower()
        if key in HTML2Text.entities:
            entity = self._to_unicode(HTML2Text.entities[key])
        elif name.startswith("#"):
            # HTMLParser itself never passes numeric references here, but
            # direct callers that do are still served.
            self.handle_charref(name[1:])
            return
        else:
            entity = u'&' + self._to_unicode(name) + u';'
        self._append(entity, False)

    def gettext(self):
        """Return the text produced so far, including an open paragraph."""
        data = self.text
        if self.inparagraph:
            data = data + u'\n'.join(textwrap.wrap(self.currentparagraph, 70))
        return data
253
def open_url(method, url):
    """Issue an HTTP(S) request and return the response once it is a 200.

    method is the HTTP verb ("HEAD" or "GET" in this file) and url the
    absolute http:// or https:// address to request.

    Up to three requests are made.  A 301, 302, 303 or 307 response moves
    on to the address in its Location header (resolved against the
    current URL, so relative redirects work); any other non-200 response,
    or a connection level failure, uses up one attempt and the same
    address is tried again.

    Returns the httplib response object for the first 200 response.  Its
    connection is left open so that the caller can read the body.
    Returns None when no attempt produced a 200.
    """
    # Local import: urlparse is only needed for resolving redirects.
    import urlparse

    redirectcount = 0
    while redirectcount < 3:
        (scheme, rest) = urllib.splittype(url)
        (host, path) = urllib.splithost(rest)
        (host, port) = urllib.splitport(host)
        # splittype() returns the scheme in lower case.
        if scheme == "https":
            connection_class = httplib.HTTPSConnection
            default_port = 443
        else:
            connection_class = httplib.HTTPConnection
            default_port = 80
        if port is None:
            port = default_port

        conn = None
        try:
            conn = connection_class("%s:%s" %(host, port))
            # "http://host" has an empty path; the request target must
            # still be "/".
            conn.request(method, path or "/")
            response = conn.getresponse()
            if response.status == 200:
                return response
            if response.status in [301, 302, 303, 307]:
                location = response.getheader("location")
                if location:
                    url = urlparse.urljoin(url, location)
            # This response will not be handed back, so release the socket.
            conn.close()
        except (httplib.HTTPException, socket.error):
            # Best effort: a failed attempt just counts against the limit.
            if conn is not None:
                conn.close()
        redirectcount = redirectcount + 1
    return None
277
# Response headers remembered per feed to detect an unchanged feed cheaply.
_CACHE_HEADERS = ("content-md5", "etag", "last-modified", "content-length")


def _random_token(length):
    """Return length random ASCII letters and digits."""
    alphabet = string.ascii_letters + string.digits
    return "".join([random.choice(alphabet) for unused in range(length)])


def _feed_changed(stored, headers):
    """Tell whether a feed has to be fetched again.

    stored is the parsed (cgi.parse_qs) record remembered for the feed and
    headers the (name, value) pairs of a fresh HEAD response, or None when
    that request failed.  The feed counts as unchanged only when at least
    one of the remembered headers could be compared and every compared
    header still has its remembered value.
    """
    if not headers:
        return True
    compared = False
    for (name, value) in headers:
        if name not in _CACHE_HEADERS:
            continue
        remembered = stored.get(name)
        if not remembered or remembered[0] != value:
            return True
        compared = True
    return not compared


def _write_to_maildir(maildir, msg):
    """Write msg into maildir/tmp and then move it into maildir/new."""
    import time

    fname = str(os.getpid()) \
        + "." + socket.gethostname() \
        + "." + _random_token(10) \
        + "." + str(int(time.time()))
    fn = os.path.join(maildir, "tmp", fname)
    fh = open(fn, "w")
    try:
        fh.write(msg.as_string())
    finally:
        fh.close()
    # link + unlink rather than rename, so that an existing file in new/
    # is never overwritten.
    os.link(fn, os.path.join(maildir, "new", fname))
    os.unlink(fn)


def _process_item(db, maildir, url, item):
    """Deliver one feed item unless the same content was delivered before.

    db is the open "seen" database.  Records are stored under
    "<feed url>|<guid>" and "<feed url>|<link>" and hold the message id(s),
    the creation date and the md5 of the content of the last delivery.
    """
    if item.has_key("content"):
        content = item["content"][0]["value"]
    else:
        content = item.get("summary", u"")
    link = item.get("link", u"")

    md5sum = md5.md5(content.encode("utf-8")).hexdigest()

    # Items without a link are tracked by their content instead, so that
    # they do not all share one record.
    linkkey = url + "|" + (link or md5sum)

    # if there is a guid and its record has the same md5 there is nothing
    # to do
    hasguid = item.has_key("guid")
    guidkey = None
    if hasguid:
        guidkey = url + "|" + item["guid"]
        if db.has_key(guidkey):
            seen = cgi.parse_qs(db[guidkey])
            if seen.get("contentmd5", [None])[0] == md5sum:
                return

    prevmessageid = None
    if db.has_key(linkkey):
        seen = cgi.parse_qs(db[linkkey])
        if seen.has_key("message-id"):
            prevmessageid = seen["message-id"][0]
        if seen.get("contentmd5", [None])[0] == md5sum:
            return

    try:
        author = item["author"]
    except KeyError:
        author = url

    # create a basic email message
    msg = MIMEMultipart("alternative")
    messageid = "<" \
        + datetime.datetime.now().strftime("%Y%m%d%H%M") \
        + "." + _random_token(6) \
        + "@" + socket.gethostname() + ">"
    msg.add_header("Message-ID", messageid)
    msg.set_unixfrom("\"%s\" <rss2maildir@localhost>" %(url))
    msg.add_header("From", "\"%s\" <rss2maildir@localhost>" %(author))
    msg.add_header("To", "\"%s\" <rss2maildir@localhost>" %(url))
    if prevmessageid:
        # thread an updated item below the message delivered earlier
        msg.add_header("References", prevmessageid)

    # The header is stamped -0000, so the fallback has to be UTC as well.
    dateformat = "%a, %e %b %Y %T -0000"
    createddate = datetime.datetime.utcnow().strftime(dateformat)
    try:
        createddate = datetime.datetime(*item["updated_parsed"][0:6]) \
            .strftime(dateformat)
    except (KeyError, TypeError, ValueError):
        # no usable timestamp on the item: keep the current time
        pass
    msg.add_header("Date", createddate)
    msg.add_header("Subject", item.get("title", u""))
    msg.set_default_type("text/plain")

    htmlcontent = "%s\n\n<p>Item URL: <a href='%s'>%s</a></p>" %( \
        content, \
        link, \
        link )
    htmlpart = MIMEText(htmlcontent.encode("utf-8"), "html", "utf-8")
    textparser = HTML2Text()
    textparser.feed(content.encode("utf-8"))
    textcontent = "%s\n\nItem URL: %s" %( \
        textparser.gettext(), \
        link )
    textpart = MIMEText(textcontent.encode("utf-8"), "plain", "utf-8")
    msg.attach(textpart)
    msg.attach(htmlpart)

    # work out the filename we should be writing to following the normal
    # maildir rules, and deliver
    _write_to_maildir(maildir, msg)

    # now add to the database about the item
    if prevmessageid:
        messageid = prevmessageid + " " + messageid
    record = urllib.urlencode(( \
        ("message-id", messageid), \
        ("created", createddate), \
        ("contentmd5", md5sum) \
        ))
    if hasguid and item["guid"] != link:
        db[guidkey] = record
        # An existing link record keeps its own date and md5 and only
        # learns about the new message id.
        linkrecord = record
        if db.has_key(linkkey):
            old = cgi.parse_qs(db[linkkey])
            try:
                linkrecord = urllib.urlencode(( \
                    ("message-id", messageid), \
                    ("created", old["created"][0]), \
                    ("contentmd5", old["contentmd5"][0]) \
                    ))
            except (KeyError, IndexError):
                pass
        db[linkkey] = linkrecord
    else:
        db[linkkey] = record


def parse_and_deliver(maildir, url, statedir):
    """Fetch the feed at url and deliver its new or changed items.

    maildir is the Maildir (with tmp/ and new/) that receives one message
    per item, url the address of the feed and statedir the directory
    holding the "feeds" and "seen" dbm databases.

    A feed that was fetched before is first probed with a HEAD request
    and skipped when its remembered headers are unchanged.  A failure to
    fetch the feed is reported on stderr.  Returns None in every case.
    """
    feeddb = dbm.open(os.path.join(statedir, "feeds"), "c")
    try:
        # first check if we know about this feed already
        if feeddb.has_key(url):
            stored = cgi.parse_qs(feeddb[url])
            response = open_url("HEAD", url)
            headers = None
            if response is not None:
                headers = response.getheaders()
            if not _feed_changed(stored, headers):
                return # don't need to do anything, nothing has changed.

        response = open_url("GET", url)
        if response is None:
            sys.stderr.write("Failed to fetch feed: %s\n" %(url))
            return
        headers = response.getheaders()

        fp = feedparser.parse(response)
        db = dbm.open(os.path.join(statedir, "seen"), "c")
        try:
            for item in fp["items"]:
                _process_item(db, maildir, url, item)
        finally:
            db.close()

        # remember the headers that let the next run skip an unchanged feed
        tracked = [(name, value) for (name, value) in headers \
            if name in _CACHE_HEADERS]
        if tracked:
            feeddb[url] = urllib.urlencode(tracked)
    finally:
        feeddb.close()
465
def _ensure_directory(path, notdir_message, failed_message):
    """Make sure path is a directory, creating it when it is missing.

    Writes notdir_message to stderr and exits with status 1 when path
    exists but is not a directory; writes failed_message and exits with
    status 1 when the directory cannot be created.
    """
    if os.path.exists(path):
        if not os.path.isdir(path):
            sys.stderr.write(notdir_message)
            sys.exit(1)
        return
    try:
        os.mkdir(path)
    except OSError:
        sys.stderr.write(failed_message)
        sys.exit(1)


if __name__ == "__main__":
    # This only gets executed if we really called the program
    # first off, parse the command line arguments

    oparser = OptionParser()
    oparser.add_option(
        "-c", "--conf", dest="conf",
        help="location of config file"
        )
    oparser.add_option(
        "-s", "--statedir", dest="statedir",
        help="location of directory to store state in"
        )

    (options, args) = oparser.parse_args()

    # check for the configfile

    configfile = None

    if options.conf is not None:
        # an explicitly requested file has to exist
        if not os.path.exists(options.conf):
            sys.stderr.write( \
                "Config file %s does not exist. Exiting.\n" %(options.conf,))
            sys.exit(2)
        configfile = options.conf
    else:
        # check through the default locations
        candidates = []
        if "HOME" in os.environ:
            candidates.append( \
                "%s/.rss2maildir.conf" %(os.environ["HOME"],))
        candidates.append("/etc/rss2maildir.conf")
        for candidate in candidates:
            if os.path.exists(candidate):
                configfile = candidate
                break
        if configfile is None:
            sys.stderr.write("No config file found. Exiting.\n")
            sys.exit(2)

    # Right - if we've got this far, we've got a config file, now for the hard
    # bits...

    scp = SafeConfigParser()
    scp.read(configfile)

    maildir_root = "RSSMaildir"
    state_dir = "state"

    # The command line wins over the config file, which wins over the
    # default.
    if options.statedir is not None:
        state_dir = options.statedir
        _ensure_directory( \
            state_dir, \
            "State directory (%s) is not a directory\n" %(state_dir), \
            "Couldn't create statedir %s\n" %(state_dir))
    elif scp.has_option("general", "state_dir"):
        # The configured directory is used whether it already exists or
        # has to be created.
        state_dir = scp.get("general", "state_dir")
        _ensure_directory( \
            state_dir, \
            "State directory (%s) is not a directory\n" %(state_dir), \
            "Couldn't create state directory %s\n" %(state_dir))
    else:
        _ensure_directory( \
            state_dir, \
            "State directory %s is not a directory\n" %(state_dir), \
            "State directory %s could not be created\n" %(state_dir))

    if scp.has_option("general", "maildir_root"):
        maildir_root = scp.get("general", "maildir_root")

    _ensure_directory( \
        maildir_root, \
        "Maildir Root %s is not a directory\n" %(maildir_root), \
        "Couldn't create Maildir Root %s\n" %(maildir_root))

    # every section other than "general" describes one feed; the section
    # name is the url of the feed
    feeds = scp.sections()
    if "general" in feeds:
        feeds.remove("general")

    for section in feeds:
        # the maildir defaults to the (quoted) url of the feed
        try:
            maildir = scp.get(section, "maildir")
        except Exception:
            # missing or unusable option: fall back to the section name
            maildir = section

        # quote the name so that it is a single safe path component
        maildir = urllib.quote_plus(str(maildir))
        maildir = os.path.join(maildir_root, maildir)

        if os.path.exists(maildir):
            if not os.path.isdir(maildir):
                sys.stderr.write("Broken maildir: %s\n" %(maildir))
                continue
        else:
            try:
                os.mkdir(maildir)
            except OSError:
                sys.stderr.write("Couldn't create root maildir %s\n" \
                    %(maildir))
                sys.exit(1)

        # check that there's a new, cur and tmp directory
        broken = False
        for subdir in ("cur", "tmp", "new"):
            path = os.path.join(maildir, subdir)
            if os.path.isdir(path):
                continue
            if os.path.exists(path):
                sys.stderr.write("Broken maildir: %s\n" %(maildir))
                broken = True
                break
            try:
                os.mkdir(path)
            except OSError:
                sys.stderr.write( \
                    "Couldn't create required maildir directories for %s\n" \
                    %(section,))
                sys.exit(1)
        if broken:
            # nothing can be delivered here, carry on with the next feed
            continue

        # right - we've got the directories, we've got the section, we know the
        # url... lets play!

        parse_and_deliver(maildir, section, state_dir)