* Move some of the list handling above the paragraph handling so that it
[rss2maildir.git] / rss2maildir.py
1 #!/usr/bin/python
2 # coding=utf-8
3
4 # rss2maildir.py - RSS feeds to Maildir 1 email per item
5 # Copyright (C) 2007  Brett Parker <iDunno@sommitrealweird.co.uk>
6
7 # This program is free software: you can redistribute it and/or modify
8 # it under the terms of the GNU General Public License as published by
9 # the Free Software Foundation, either version 3 of the License, or
10 # (at your option) any later version.
11
12 # This program is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 # GNU General Public License for more details.
16
17 # You should have received a copy of the GNU General Public License
18 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
19
20 import sys
21 import os
22 import stat
23 import httplib
24 import urllib
25
26 import feedparser
27
28 from email.MIMEMultipart import MIMEMultipart
29 from email.MIMEText import MIMEText
30
31 import datetime
32 import random
33 import string
34 import textwrap
35
36 import socket
37
38 from optparse import OptionParser
39 from ConfigParser import SafeConfigParser
40
41 from base64 import b64encode
42 import md5
43
44 import cgi
45 import dbm
46
47 from HTMLParser import HTMLParser
48
class HTML2Text(HTMLParser):
    """Convert an HTML fragment into wrapped plain text.

    Feed markup with ``feed()`` and collect the result with ``gettext()``.
    Paragraphs are wrapped at 70 columns, blockquotes are prefixed with
    ``> `` (wrapped at 68), list items are rendered as `` * `` bullets
    (wrapped at 67) and headings are underlined with ``=`` (h1), ``-`` (h2)
    or ``~`` (h3-h6).  ``<pre>`` content is copied through unwrapped.

    All text is accumulated as unicode; byte strings handed to the parser
    are decoded as UTF-8.

    State attributes:
      inheadingone / inheadingtwo / inotherheading -- inside h1 / h2 / h3-h6
      inparagraph   -- text is currently collected into ``currentparagraph``
                       (starts True so that bare text forms a paragraph)
      inblockquote, inpre, inul, initem, inlink -- inside the matching tag
      text          -- finished output so far
      currentparagraph, headingtext, blockquote, item -- pending buffers
    """

    entities = {
        "amp": u"&",
        "lt": u"<",
        "gt": u">",
        "pound": u"\u00a3",
        "copy": u"\u00a9",
        "apos": u"'",
        "quot": u"\"",
        "nbsp": u" ",
        }

    # The unicode text type (``unicode`` on Python 2), obtained without
    # naming it so this class does not depend on a version-specific builtin.
    _text_type = type(u'')

    def __init__(self):
        HTMLParser.__init__(self)
        self.inheadingone = False
        self.inheadingtwo = False
        self.inotherheading = False
        self.inparagraph = True
        self.inblockquote = False
        self.inlink = False
        self.text = u''
        self.currentparagraph = u''
        self.headingtext = u''
        self.blockquote = u''
        self.inpre = False
        self.inul = False
        self.initem = False
        self.item = u''

    def _to_unicode(self, data):
        """Return ``data`` as unicode, decoding byte strings as UTF-8."""
        if isinstance(data, self._text_type):
            return data
        return data.decode("utf-8")

    @staticmethod
    def _wrap(text, width, separator=u'\n', strip=False):
        """Wrap ``text`` to ``width`` columns and join lines with ``separator``.

        With ``strip`` set, surrounding whitespace is removed from each line.
        """
        lines = textwrap.wrap(text, width)
        if strip:
            lines = [line.strip() for line in lines]
        return separator.join(lines)

    def _format_item(self):
        """Return the pending list item rendered as a wrapped bullet."""
        return u' * ' \
            + self._wrap(self.item, 67, u'\n   ', strip=True) \
            + u'\n'

    def _end_heading(self, underline):
        """Emit the pending heading underlined with ``underline``."""
        # The underline length is counted in characters (not encoded bytes)
        # so that non-ASCII headings are underlined to their visible width.
        self.text = self.text \
            + u'\n\n' \
            + self.headingtext \
            + u'\n' \
            + underline * len(self.headingtext.strip())
        self.headingtext = u''

    def _append_inline(self, chars):
        """Append ``chars`` verbatim to whichever buffer is collecting text.

        Uses the same priority as ``handle_data`` so that entities end up
        next to the text surrounding them.
        """
        if self.inheadingone or self.inheadingtwo or self.inotherheading:
            self.headingtext = self.headingtext + chars
        elif self.inblockquote:
            self.blockquote = self.blockquote + chars
        elif self.initem:
            self.item = self.item + chars
        elif self.inparagraph:
            self.currentparagraph = self.currentparagraph + chars
        else:
            self.text = self.text + chars

    def handle_starttag(self, tag, attrs):
        """Update parser state (and flush pending text) for an opening tag."""
        tag = tag.lower()
        if tag == "h1":
            self.inheadingone = True
            self.inparagraph = False
        elif tag == "h2":
            self.inheadingtwo = True
            self.inparagraph = False
        elif tag in ["h3", "h4", "h5", "h6"]:
            self.inotherheading = True
            self.inparagraph = False
        elif tag == "a":
            self.inlink = True
        elif tag == "br":
            self.handle_br()
        elif tag == "blockquote":
            self.inblockquote = True
            self.text = self.text + u'\n'
        elif tag == "p":
            # Flush any paragraph still being collected *before* adding the
            # blank-line separator, otherwise the previous paragraph and the
            # new one end up glued together.
            if self.inparagraph:
                self.text = self.text + self._wrap(self.currentparagraph, 70)
            if self.text != u'':
                self.text = self.text + u'\n\n'
            self.currentparagraph = u''
            self.inparagraph = True
        elif tag == "pre":
            self.text = self.text + u'\n'
            self.inpre = True
            self.inparagraph = False
            self.inblockquote = False
        elif tag == "ul":
            self.item = u''
            self.inul = True
            self.text = self.text + u'\n'
        elif tag == "li":
            # An <li> while another item is still open (no </li> seen)
            # implicitly ends the previous item.
            if self.initem:
                self.text = self.text + self._format_item()
            self.initem = True
            self.item = u''

    def handle_startendtag(self, tag, attrs):
        """Handle self-closing tags; only ``<br/>`` has any effect."""
        if tag.lower() == "br":
            self.handle_br()

    def handle_br(self):
        """Flush the pending paragraph or blockquote and start a new line."""
        if self.inparagraph:
            self.text = self.text \
                + self._wrap(self.currentparagraph, 70) \
                + u'\n'
            self.currentparagraph = u''
        elif self.inblockquote:
            self.text = self.text \
                + u'\n> ' \
                + self._wrap(self.blockquote, 68, u'\n> ', strip=True) \
                + u'\n'
            self.blockquote = u''
        else:
            self.text = self.text + u'\n'

    def handle_endtag(self, tag):
        """Emit the text collected for the element that is being closed."""
        tag = tag.lower()
        if tag == "h1":
            self.inheadingone = False
            self._end_heading(u'=')
        elif tag == "h2":
            self.inheadingtwo = False
            self._end_heading(u'-')
        elif tag in ["h3", "h4", "h5", "h6"]:
            self.inotherheading = False
            self._end_heading(u'~')
        elif tag == "p":
            self.text = self.text + self._wrap(self.currentparagraph, 70)
            self.inparagraph = False
            self.currentparagraph = u''
        elif tag == "blockquote":
            self.text = self.text \
                + u'\n> ' \
                + self._wrap(self.blockquote, 68, u'\n> ', strip=True) \
                + u'\n'
            self.inblockquote = False
            self.blockquote = u''
        elif tag == "pre":
            self.inpre = False
        elif tag == "li":
            self.initem = False
            if self.item != u'':
                self.text = self.text + self._format_item()
            self.item = u''
        elif tag == "ul":
            self.inul = False

    def handle_data(self, data):
        """Collect character data into the buffer for the current context."""
        data = self._to_unicode(data)
        if self.inheadingone or self.inheadingtwo or self.inotherheading:
            self.headingtext = self.headingtext + data.strip() + u' '
        elif self.inblockquote:
            self.blockquote = self.blockquote + data.strip() + u' '
        elif self.initem:
            self.item = self.item + data
        elif self.inparagraph:
            self.currentparagraph = self.currentparagraph \
                + data.strip() \
                + u' '
        elif self.inpre:
            # preformatted text is passed through untouched
            self.text = self.text + data
        else:
            self.text = self.text + data.strip() + u' '

    def handle_entityref(self, name):
        """Translate a named entity (``&name;``) into its character.

        Unknown entities are kept literally as ``&name;``.
        """
        if name.startswith("#"):
            # Not produced by HTMLParser itself, but kept so that direct
            # callers passing "#NNN" still get the character.
            self.handle_charref(name[1:])
            return
        key = name.lower()
        if key in self.entities:
            entity = self.entities[key]
        else:
            entity = u'&' + self._to_unicode(name) + u';'
        self._append_inline(entity)

    def handle_charref(self, name):
        """Translate a numeric reference (``&#163;`` / ``&#xa3;``).

        References that cannot be converted are kept literally.
        """
        try:
            if name[:1] in ("x", "X"):
                code = int(name[1:], 16)
            else:
                code = int(name)
            try:
                char = unichr(code)
            except NameError:
                # interpreters without unichr() have a unicode-aware chr()
                char = chr(code)
        except (ValueError, OverflowError):
            char = u'&#' + self._to_unicode(name) + u';'
        self._append_inline(char)

    def gettext(self):
        """Return the converted text, always terminated by a newline.

        A paragraph that is still being collected is included in the result
        without being consumed, so the parser can continue to be fed.
        """
        data = self.text
        if self.inparagraph:
            data = data + self._wrap(self.currentparagraph, 70)
        # endswith() rather than data[-1]: the output may be empty
        if not data.endswith(u'\n'):
            data = data + u'\n'
        return data
256
def open_url(method, url):
    """Issue an HTTP(S) request and return the response on success.

    Makes at most three attempts.  A 301/302/303/307 response switches to
    the URL given in its ``Location`` header (resolved against the current
    URL, so relative redirects work); any other non-200 status or a
    network/protocol error simply uses up an attempt.

    Arguments:
      method -- HTTP method name, e.g. "GET" or "HEAD"
      url    -- absolute http:// or https:// URL

    Returns the ``httplib`` response object for a 200 reply (its connection
    is left open so the caller can read the body), or None when no attempt
    succeeded.
    """
    # local import: only this function needs it
    import urlparse

    attempts = 0
    while attempts < 3:
        attempts = attempts + 1
        (scheme, rest) = urllib.splittype(url)
        (host, path) = urllib.splithost(rest)
        (host, port) = urllib.splitport(host)
        if scheme == "https":
            connection_class = httplib.HTTPSConnection
            default_port = 443
        else:
            connection_class = httplib.HTTPConnection
            default_port = 80
        if port is None:
            port = default_port
        if not path:
            # "http://host" has no path component; request the root
            path = "/"

        conn = None
        try:
            conn = connection_class("%s:%s" %(host, port))
            conn.request(method, path)
            response = conn.getresponse()
            if response.status == 200:
                return response
            if response.status in [301, 302, 303, 307]:
                location = response.getheader("location")
                if location:
                    url = urlparse.urljoin(url, location)
        except (httplib.HTTPException, socket.error):
            # best effort: a failed attempt just counts against the limit
            pass
        # this connection is not handed to the caller, so release it
        if conn is not None:
            conn.close()
    return None
280
def _random_token(length):
    """Return ``length`` random ASCII letters and digits."""
    alphabet = string.ascii_letters + string.digits
    return "".join([random.choice(alphabet) for _ in range(length)])


def _encode_record(messageid, created, contentmd5):
    """Serialise the per-item state stored in the "seen" database."""
    return urllib.urlencode((
        ("message-id", messageid),
        ("created", created),
        ("contentmd5", contentmd5),
        ))


def _feed_changed(url, stored):
    """Return True if a HEAD request suggests the feed needs refetching.

    ``stored`` is the parsed record of validator headers remembered from
    the previous fetch.  A failed HEAD request, or a validator header that
    was not remembered, counts as "changed".
    """
    response = open_url("HEAD", url)
    if response is None:
        return True
    try:
        for (name, value) in response.getheaders():
            if name in ["content-length", "etag", "last-modified", \
                    "content-md5"]:
                if value != stored[name][0]:
                    return True
    except (KeyError, IndexError):
        return True
    return False


def _deliver_item(maildir, url, db, item):
    """Write one feed item to ``maildir`` unless it was already delivered.

    ``db`` is the open "seen" database; it is consulted to skip unchanged
    items and updated after the message has been written.
    """
    # need to work out what the content is first...
    if item.has_key("content"):
        content = item["content"][0]["value"]
    else:
        content = item["summary"]

    md5sum = md5.md5(content.encode("utf-8")).hexdigest()

    prevmessageid = None

    # check if there's a guid too - if that exists and we match the md5,
    # there is nothing to do
    if item.has_key("guid"):
        guid_key = url + "|" + item["guid"]
        if db.has_key(guid_key):
            data = cgi.parse_qs(db[guid_key])
            if data["contentmd5"][0] == md5sum:
                return

    link_key = url + "|" + item["link"]
    if db.has_key(link_key):
        data = cgi.parse_qs(db[link_key])
        if "message-id" in data:
            prevmessageid = data["message-id"][0]
        if data["contentmd5"][0] == md5sum:
            return

    try:
        author = item["author"]
    except KeyError:
        author = url

    # create a basic email message
    msg = MIMEMultipart("alternative")
    messageid = "<" \
        + datetime.datetime.now().strftime("%Y%m%d%H%M") \
        + "." + _random_token(6) \
        + "@" + socket.gethostname() + ">"
    msg.add_header("Message-ID", messageid)
    msg.set_unixfrom("\"%s\" <rss2maildir@localhost>" %(url))
    msg.add_header("From", "\"%s\" <rss2maildir@localhost>" %(author))
    msg.add_header("To", "\"%s\" <rss2maildir@localhost>" %(url))
    if prevmessageid:
        # thread an updated item under the message(s) sent for it before
        msg.add_header("References", prevmessageid)
    createddate = datetime.datetime.now() \
        .strftime("%a, %e %b %Y %T -0000")
    try:
        createddate = datetime.datetime(*item["updated_parsed"][0:6]) \
            .strftime("%a, %e %b %Y %T -0000")
    except (KeyError, TypeError, ValueError):
        # no usable update time in the feed; keep the current time
        pass
    msg.add_header("Date", createddate)
    msg.add_header("Subject", item["title"])
    msg.set_default_type("text/plain")

    htmlcontent = "%s\n\n<p>Item URL: <a href='%s'>%s</a></p>" %( \
        content, \
        item["link"], \
        item["link"] )
    htmlpart = MIMEText(htmlcontent.encode("utf-8"), "html", "utf-8")
    textparser = HTML2Text()
    textparser.feed(content.encode("utf-8"))
    textcontent = textparser.gettext()
    textcontent = "%s\n\nItem URL: %s" %( \
        textcontent, \
        item["link"] )
    textpart = MIMEText(textcontent.encode("utf-8"), "plain", "utf-8")
    msg.attach(textpart)
    msg.attach(htmlpart)

    # write the message into tmp/ first and then link it into new/, so a
    # partially written file never shows up in new/
    fname = str(os.getpid()) \
        + "." + socket.gethostname() \
        + "." + _random_token(10) \
        + "." + datetime.datetime.now().strftime('%s')
    fn = os.path.join(maildir, "tmp", fname)
    fh = open(fn, "w")
    try:
        fh.write(msg.as_string())
    finally:
        fh.close()
    # now move it in to the new directory
    newfn = os.path.join(maildir, "new", fname)
    os.link(fn, newfn)
    os.unlink(fn)

    # now add to the database about the item
    if prevmessageid:
        messageid = prevmessageid + " " + messageid
    record = _encode_record(messageid, createddate, md5sum)
    if item.has_key("guid") and item["guid"] != item["link"]:
        db[url + "|" + item["guid"]] = record
        # An existing link record keeps its creation date and checksum and
        # only has its message-id chain extended.
        link_record = record
        if db.has_key(link_key):
            previous = cgi.parse_qs(db[link_key])
            try:
                link_record = _encode_record( \
                    messageid, \
                    previous["created"][0], \
                    previous["contentmd5"][0])
            except (KeyError, IndexError):
                # incomplete old record: replace it with the fresh one
                pass
        db[link_key] = link_record
    else:
        db[link_key] = record


def parse_and_deliver(maildir, url, statedir):
    """Fetch the feed at ``url`` and deliver new or changed items.

    Arguments:
      maildir  -- maildir to deliver to (must contain tmp/ and new/)
      url      -- URL of the feed
      statedir -- directory holding the "feeds" and "seen" dbm databases

    If the feed is already known, a HEAD request is made first and the
    feed is only downloaded when one of the remembered validator headers
    (content-length, etag, last-modified, content-md5) differs.  A failed
    download is reported on stderr.  Both databases are closed on every
    exit path, including errors.
    """
    feeddb = dbm.open(os.path.join(statedir, "feeds"), "c")
    try:
        # first check if we know about this feed already
        if feeddb.has_key(url):
            stored = cgi.parse_qs(feeddb[url])
            if not _feed_changed(url, stored):
                return # don't need to do anything, nothing's changed.

        response = open_url("GET", url)
        if response is None:
            sys.stderr.write("Failed to fetch feed: %s\n" %(url))
            return
        headers = response.getheaders()

        fp = feedparser.parse(response)
        db = dbm.open(os.path.join(statedir, "seen"), "c")
        try:
            for item in fp["items"]:
                _deliver_item(maildir, url, db, item)
        finally:
            db.close()

        # remember the validator headers for the next run's HEAD check
        remembered = [(name, value) for (name, value) in headers \
            if name in ["content-md5", "etag", "last-modified", \
                "content-length"]]
        if len(remembered) > 0:
            feeddb[url] = urllib.urlencode(remembered)
    finally:
        feeddb.close()
468
def _ensure_directory(path, not_a_directory_msg, create_failed_msg):
    """Make sure ``path`` is a directory, creating it if it is missing.

    Both messages are ``%``-templates taking the path.  If ``path`` exists
    but is not a directory, or cannot be created, the matching message is
    written to stderr and the program exits with status 1.
    """
    if os.path.isdir(path):
        return
    if os.path.exists(path):
        sys.stderr.write(not_a_directory_msg %(path,))
        sys.exit(1)
    try:
        os.mkdir(path)
    except OSError:
        sys.stderr.write(create_failed_msg %(path,))
        sys.exit(1)


if __name__ == "__main__":
    # This only gets executed if we really called the program
    # first off, parse the command line arguments

    oparser = OptionParser()
    oparser.add_option(
        "-c", "--conf", dest="conf",
        help="location of config file"
        )
    oparser.add_option(
        "-s", "--statedir", dest="statedir",
        help="location of directory to store state in"
        )

    (options, args) = oparser.parse_args()

    # check for the configfile

    configfile = None

    if options.conf is not None:
        # an explicitly requested config file has to exist
        if not os.path.exists(options.conf):
            sys.stderr.write( \
                "Config file %s does not exist. Exiting.\n" %(options.conf,))
            sys.exit(2)
        configfile = options.conf
    else:
        # check through the default locations
        candidates = []
        if "HOME" in os.environ:
            candidates.append( \
                "%s/.rss2maildir.conf" %(os.environ["HOME"],))
        candidates.append("/etc/rss2maildir.conf")
        for candidate in candidates:
            if os.path.exists(candidate):
                configfile = candidate
                break
        if configfile is None:
            sys.stderr.write("No config file found. Exiting.\n")
            sys.exit(2)

    # Right - if we've got this far, we've got a config file, now for the hard
    # bits...

    scp = SafeConfigParser()
    scp.read(configfile)

    maildir_root = "RSSMaildir"
    state_dir = "state"

    # The command line wins over the config file, which wins over the
    # built-in default.
    if options.statedir is not None:
        state_dir = options.statedir
        _ensure_directory(state_dir, \
            "State directory (%s) is not a directory\n", \
            "Couldn't create statedir %s\n")
    elif scp.has_option("general", "state_dir"):
        state_dir = scp.get("general", "state_dir")
        _ensure_directory(state_dir, \
            "State directory (%s) is not a directory\n", \
            "Couldn't create state directory %s\n")
    else:
        _ensure_directory(state_dir, \
            "State directory %s is not a directory\n", \
            "State directory %s could not be created\n")

    if scp.has_option("general", "maildir_root"):
        maildir_root = scp.get("general", "maildir_root")

    _ensure_directory(maildir_root, \
        "Maildir Root %s is not a directory\n", \
        "Couldn't create Maildir Root %s\n")

    # every section other than "general" names a feed URL
    feeds = scp.sections()
    try:
        feeds.remove("general")
    except ValueError:
        pass

    for section in feeds:
        # the maildir defaults to the section name (the feed URL)
        try:
            maildir = scp.get(section, "maildir")
        except Exception:
            # missing option or a value that cannot be interpolated
            maildir = section

        # quote it so that it is usable as a single directory name
        maildir = urllib.quote_plus(maildir)
        maildir = os.path.join(maildir_root, maildir)

        if os.path.isdir(maildir):
            # check if there's a new, cur and tmp directory
            for subdir in ["cur", "tmp", "new"]:
                subpath = os.path.join(maildir, subdir)
                if os.path.isdir(subpath):
                    continue
                if os.path.exists(subpath):
                    sys.stderr.write("Broken maildir: %s\n" %(maildir))
                    continue
                try:
                    os.mkdir(subpath)
                except OSError:
                    sys.stderr.write( \
                        "Couldn't create required maildir directories " \
                        "for %s\n" %(section,))
                    sys.exit(1)
        elif os.path.exists(maildir):
            # something that is not a directory is in the way; nothing can
            # be delivered there, so move on to the next feed
            sys.stderr.write("Broken maildir: %s\n" %(maildir))
            continue
        else:
            try:
                os.mkdir(maildir)
            except OSError:
                sys.stderr.write("Couldn't create root maildir %s\n" \
                    %(maildir))
                sys.exit(1)
            try:
                os.mkdir(os.path.join(maildir, "new"))
                os.mkdir(os.path.join(maildir, "cur"))
                os.mkdir(os.path.join(maildir, "tmp"))
            except OSError:
                sys.stderr.write( \
                    "Couldn't create required maildir directories for %s\n" \
                    %(section,))
                sys.exit(1)

        # right - we've got the directories, we've got the section, we know the
        # url... lets play!

        parse_and_deliver(maildir, section, state_dir)