* fix typo for a particular entity
[rss2maildir.git] / rss2maildir.py
1 #!/usr/bin/python
2 # coding=utf-8
3
4 # rss2maildir.py - RSS feeds to Maildir 1 email per item
5 # Copyright (C) 2007  Brett Parker <iDunno@sommitrealweird.co.uk>
6
7 # This program is free software: you can redistribute it and/or modify
8 # it under the terms of the GNU General Public License as published by
9 # the Free Software Foundation, either version 3 of the License, or
10 # (at your option) any later version.
11
12 # This program is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 # GNU General Public License for more details.
16
17 # You should have received a copy of the GNU General Public License
18 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
19
20 import sys
21 import os
22 import stat
23 import httplib
24 import urllib
25
26 import feedparser
27
28 from email.MIMEMultipart import MIMEMultipart
29 from email.MIMEText import MIMEText
30
31 import datetime
32 import random
33 import string
34 import textwrap
35
36 import socket
37
38 from optparse import OptionParser
39 from ConfigParser import SafeConfigParser
40
41 from base64 import b64encode
42 import md5
43
44 import cgi
45 import dbm
46
47 from HTMLParser import HTMLParser
48
# Named HTML entities that HTML2Text replaces with a literal character.
# Keys are the lower-cased entity names without the surrounding "&" and ";".
entities = dict(
    amp="&",
    lt="<",
    gt=">",
    pound="£",
    copy="©",
    apos="'",
    quot="\"",
    nbsp=" ",
)
59
60 class HTML2Text(HTMLParser):
61
62     def __init__(self):
63         self.inheadingone = False
64         self.inheadingtwo = False
65         self.inotherheading = False
66         self.inparagraph = True
67         self.inblockquote = False
68         self.inlink = False
69         self.text = u''
70         self.currentparagraph = u''
71         self.headingtext = u''
72         self.blockquote = u''
73         self.inpre = False
74         self.inul = False
75         self.initem = False
76         self.item = u''
77         HTMLParser.__init__(self)
78
79     def handle_starttag(self, tag, attrs):
80         if tag.lower() == "h1":
81             self.inheadingone = True
82             self.inparagraph = False
83         elif tag.lower() == "h2":
84             self.inheadingtwo = True
85             self.inparagraph = False
86         elif tag.lower() in ["h3", "h4", "h5", "h6"]:
87             self.inotherheading = True
88             self.inparagraph = False
89         elif tag.lower() == "a":
90             self.inlink = True
91         elif tag.lower() == "br":
92             self.handle_br()
93         elif tag.lower() == "blockquote":
94             self.inblockquote = True
95             self.text = self.text + u'\n'
96         elif tag.lower() == "p":
97             if self.text != "":
98                 self.text = self.text + u'\n\n'
99             if self.inparagraph:
100                 self.text = self.text \
101                     + u'\n'.join(textwrap.wrap(self.currentparagraph, 70))
102             self.currentparagraph = u''
103             self.inparagraph = True
104         elif tag.lower() == "pre":
105             self.text = self.text + "\n"
106             self.inpre = True
107             self.inparagraph = False
108             self.inblockquote = False
109         elif tag.lower() == "ul":
110             self.item = u''
111             self.inul = True
112             self.text = self.text + "\n"
113         elif tag.lower() == "li" and self.inul:
114             if not self.initem:
115                 self.initem = True
116                 self.item = u''
117             else:
118                 self.text = self.text \
119                     + u' * ' \
120                     + u'\n   '.join([a.strip() for a in \
121                         textwrap.wrap(self.item, 67)]) \
122                     + u'\n'
123                 self.item = u''
124
125     def handle_startendtag(self, tag, attrs):
126         if tag.lower() == "br":
127             self.handle_br()
128
129     def handle_br(self):
130             if self.inparagraph:
131                 self.text = self.text \
132                 + u'\n'.join( \
133                     [a \
134                         for a in textwrap.wrap( \
135                             self.currentparagraph, 70) \
136                     ] \
137                 ) \
138                 + u'\n'
139                 self.currentparagraph = u''
140             elif self.inblockquote:
141                 self.text = self.text \
142                     + u'\n> ' \
143                     + u'\n> '.join( \
144                         [a \
145                             for a in textwrap.wrap( \
146                                 self.blockquote.encode("utf-8") \
147                                 , 68) \
148                         ] \
149                     ) \
150                     + u'\n'
151                 self.blockquote = u''
152             else:
153                 self.text = self.text + "\n"
154
155     def handle_endtag(self, tag):
156         if tag.lower() == "h1":
157             self.inheadingone = False
158             self.text = self.text \
159                 + u'\n\n' \
160                 + self.headingtext.encode("utf-8") \
161                 + u'\n' \
162                 + u'=' * len(self.headingtext.encode("utf-8").strip())
163             self.headingtext = u''
164         elif tag.lower() == "h2":
165             self.inheadingtwo = False
166             self.text = self.text \
167                 + u'\n\n' \
168                 + self.headingtext.encode("utf-8") \
169                 + u'\n' \
170                 + u'-' * len(self.headingtext.encode("utf-8").strip())
171             self.headingtext = u''
172         elif tag.lower() in ["h3", "h4", "h5", "h6"]:
173             self.inotherheading = False
174             self.text = self.text \
175                 + u'\n\n' \
176                 + self.headingtext.encode("utf-8") \
177                 + u'\n' \
178                 + u'~' * len(self.headingtext.encode("utf-8").strip())
179             self.headingtext = u''
180         elif tag.lower() == "p":
181             self.text = self.text \
182                 + u'\n'.join(textwrap.wrap( \
183                     self.currentparagraph, 70) \
184                 )
185             self.inparagraph = False
186             self.currentparagraph = u''
187         elif tag.lower() == "blockquote":
188             self.text = self.text \
189                 + u'\n> ' \
190                 + u'\n> '.join( \
191                     [a.strip() \
192                         for a in textwrap.wrap( \
193                             self.blockquote, 68)] \
194                     ) \
195                 + u'\n'
196             self.inblockquote = False
197             self.blockquote = u''
198         elif tag.lower() == "pre":
199             self.inpre = False
200         elif tag.lower() == "li":
201             self.initem = False
202             if self.item != "":
203                 self.text = self.text \
204                     + u' * ' \
205                     + u'\n   '.join( \
206                         [a.strip() for a in textwrap.wrap(self.item, 67)]) \
207                     + u'\n'
208             self.item = u''
209         elif tag.lower() == "ul":
210             self.inul = False
211
212     def handle_data(self, data):
213         if self.inheadingone or self.inheadingtwo or self.inotherheading:
214             self.headingtext = self.headingtext \
215                 + unicode(data, "utf-8").strip() \
216                 + u' '
217         elif self.inblockquote:
218             self.blockquote = self.blockquote \
219                 + unicode(data, "utf-8").strip() \
220                 + u' '
221         elif self.inparagraph:
222             self.currentparagraph = self.currentparagraph \
223                 + unicode(data, "utf-8").strip() \
224                 + u' '
225         elif self.inul and self.initem:
226             self.item = self.item + unicode(data, "utf-8")
227         elif self.inpre:
228             self.text = self.text + unicode(data, "utf-8")
229         else:
230             self.text = self.text + unicode(data, "utf-8").strip() + u' '
231
232     def handle_entityref(self, name):
233         entity = name
234         if entities.has_key(name.lower()):
235             entity = entities[name.lower()]
236         elif name[0] == "#":
237             entity = unichr(int(name[1:]))
238         else:
239             entity = "&" + name + ";"
240
241         if self.inparagraph:
242             self.currentparagraph = self.currentparagraph \
243                 + unicode(entity, "utf-8")
244         elif self.inblockquote:
245             self.blockquote = self.blockquote + unicode(entity, "utf-8")
246         else:
247             self.text = self.text + unicode(entity, "utf-8")
248
249     def gettext(self):
250         data = self.text
251         if self.inparagraph:
252             data = data + "\n".join(textwrap.wrap(self.currentparagraph, 70))
253         return data
254
# Response headers that are stored for each feed and compared against a
# later HEAD request to decide whether the feed has to be fetched again.
_FEED_STATE_HEADERS = ("content-md5", "etag", "last-modified", "content-length")


def _random_token(length):
    """Return a string of length random ASCII letters and digits."""
    alphabet = string.ascii_letters + string.digits
    return "".join([random.choice(alphabet) for unused in range(length)])


def _http_request(method, host, port, path):
    """Issue one HTTP request; return the response, or None on failure."""
    try:
        conn = httplib.HTTPConnection("%s:%s" %(host, port))
        conn.request(method, path)
        return conn.getresponse()
    except (httplib.HTTPException, socket.error):
        return None


def _feed_changed(headers, known):
    """Compare HEAD response headers with the values stored for the feed.

    headers is a list of (name, value) pairs; known maps header names to
    lists of values as returned by cgi.parse_qs.
    """
    compared = False
    for (name, value) in headers:
        if name not in _FEED_STATE_HEADERS:
            continue
        compared = True
        if name not in known or value != known[name][0]:
            return True
    # with nothing to compare we cannot tell, so fetch to be safe
    return not compared


def _write_to_maildir(maildir, msg):
    """Write msg into maildir/tmp and then move it into maildir/new."""
    fname = str(os.getpid()) \
        + "." + socket.gethostname() \
        + "." + _random_token(10) \
        + "." + datetime.datetime.now().strftime('%s')
    tmpname = os.path.join(maildir, "tmp", fname)
    fh = open(tmpname, "w")
    try:
        fh.write(msg.as_string())
    finally:
        fh.close()
    # link + unlink rather than rename, so an existing file in new/ is
    # never overwritten
    os.link(tmpname, os.path.join(maildir, "new", fname))
    os.unlink(tmpname)


def _deliver_item(maildir, url, item, db):
    """Deliver one feed item as a message unless it was already delivered.

    db is the open "seen" database.  It is keyed by "<url>|<item link>" and
    stores the message ids, the created date and an md5 of the content; an
    item whose content md5 is unchanged is skipped.
    """
    if item.has_key("content"):
        content = item["content"][0]["value"]
    else:
        content = item["summary"]

    md5sum = md5.md5(content.encode("utf-8")).hexdigest()
    key = url + "|" + item["link"]

    prevmessageid = None
    if db.has_key(key):
        seen = cgi.parse_qs(db[key])
        if "message-id" in seen:
            prevmessageid = seen["message-id"][0]
        if seen["contentmd5"][0] == md5sum:
            return

    try:
        author = item["author"]
    except KeyError:
        author = url

    # create a basic email message
    msg = MIMEMultipart("alternative")
    messageid = "<" \
        + datetime.datetime.now().strftime("%Y%m%d%H%M") \
        + "." + _random_token(6) \
        + "@" + socket.gethostname() + ">"
    msg.add_header("Message-ID", messageid)
    msg.set_unixfrom("\"%s\" <rss2maildir@localhost>" %(url))
    msg.add_header("From", "\"%s\" <rss2maildir@localhost>" %(author))
    msg.add_header("To", "\"%s\" <rss2maildir@localhost>" %(url))
    if prevmessageid:
        # thread an updated item under the message(s) sent for it before
        msg.add_header("References", prevmessageid)
    try:
        created = datetime.datetime(*item["updated_parsed"][0:6])
    except (KeyError, TypeError, ValueError):
        # no usable timestamp on the item: fall back to the current time
        created = datetime.datetime.now()
    createddate = created.strftime("%a, %e %b %Y %T -0000")
    msg.add_header("Date", createddate)
    msg.add_header("Subject", item["title"])
    msg.set_default_type("text/plain")

    htmlpart = MIMEText(content.encode("utf-8"), "html", "utf-8")
    textparser = HTML2Text()
    textparser.feed(content.encode("utf-8"))
    textcontent = textparser.gettext()
    textpart = MIMEText(textcontent.encode("utf-8"), "plain", "utf-8")
    msg.attach(textpart)
    msg.attach(htmlpart)

    _write_to_maildir(maildir, msg)

    # now add to the database about the item
    if prevmessageid:
        messageid = prevmessageid + " " + messageid
    db[key] = urllib.urlencode((
        ("message-id", messageid),
        ("created", createddate),
        ("contentmd5", md5sum),
        ))


def parse_and_deliver(maildir, url, statedir):
    """Fetch the feed at url and deliver new or changed items to maildir.

    State is kept in two dbm databases inside statedir: "feeds" remembers
    selected response headers per feed URL so an unchanged feed can be
    skipped after a HEAD request, and "seen" remembers what was delivered
    for each item.  Fetch failures are reported on stdout and the feed is
    skipped.  Both databases are closed on every exit path.
    """
    # we need all the parts of the url
    rest = urllib.splittype(url)[1]
    (host, path) = urllib.splithost(rest)
    if not host:
        sys.stdout.write("Failed to fetch feed: %s\n" %(url))
        return
    (host, port) = urllib.splitport(host)
    if port is None:
        port = 80
    if not path:
        # "http://host" has no path; an empty request target is invalid
        path = "/"

    feeddb = dbm.open(os.path.join(statedir, "feeds"), "c")
    try:
        # first check if we know about this feed already
        if feeddb.has_key(url):
            known = cgi.parse_qs(feeddb[url])
            # now do a head on the feed to see if it's been updated
            head = _http_request("HEAD", host, port, path)
            if head is None:
                sys.stdout.write("Failed to fetch feed: %s\n" %(url))
                return
            changed = _feed_changed(head.getheaders(), known)
            head.close()
            if not changed:
                return # don't need to do anything, nothing has changed.

        response = _http_request("GET", host, port, path)
        if response is None:
            sys.stdout.write("Failed to fetch feed: %s\n" %(url))
            return
        headers = response.getheaders()

        fp = feedparser.parse(response)
        db = dbm.open(os.path.join(statedir, "seen"), "c")
        try:
            for item in fp["items"]:
                _deliver_item(maildir, url, item, db)
        finally:
            db.close()

        # remember the headers for the next run's HEAD comparison
        state = [(name, value) for (name, value) in headers \
            if name in _FEED_STATE_HEADERS]
        if state:
            feeddb[url] = urllib.urlencode(state)
    finally:
        feeddb.close()
414
# first off, parse the command line arguments

oparser = OptionParser()
oparser.add_option(
    "-c", "--conf", dest="conf",
    help="location of config file"
    )
oparser.add_option(
    "-s", "--statedir", dest="statedir",
    help="location of directory to store state in"
    )

(options, args) = oparser.parse_args()

# Work out which config file to use.  A file named on the command line must
# exist; otherwise try the per-user file and then the system-wide one.

configfile = None

if options.conf is not None:
    if not os.path.exists(options.conf):
        sys.stderr.write( \
            "Config file %s does not exist. Exiting.\n" %(options.conf,))
        sys.exit(2)
    configfile = options.conf
else:
    candidates = []
    # HOME may be unset; in that case only the system-wide file is tried
    if "HOME" in os.environ:
        candidates.append("%s/.rss2maildir.conf" %(os.environ["HOME"],))
    candidates.append("/etc/rss2maildir.conf")
    for candidate in candidates:
        if os.path.exists(candidate):
            configfile = candidate
            break
    if configfile is None:
        sys.stderr.write("No config file found. Exiting.\n")
        sys.exit(2)

# Right - if we've got this far, we've got a config file, now for the hard
# bits...

scp = SafeConfigParser()
# read() returns the list of files it managed to parse; an empty list means
# the file exists but could not be opened, which would otherwise look like
# a configuration with no feeds
if not scp.read(configfile):
    sys.stderr.write( \
        "Config file %s could not be read. Exiting.\n" %(configfile,))
    sys.exit(2)
461
maildir_root = "RSSMaildir"
state_dir = "state"


def _ensure_directory(path, not_a_directory_msg, cannot_create_msg):
    """Make sure path is an existing directory, creating it if missing.

    Both messages are format strings taking the path.  If path exists but
    is not a directory, or cannot be created, the matching message is
    written to stderr and the program exits with status 1.
    """
    if os.path.isdir(path):
        return
    if os.path.exists(path):
        sys.stderr.write(not_a_directory_msg %(path,))
        sys.exit(1)
    try:
        os.mkdir(path)
    except OSError:
        sys.stderr.write(cannot_create_msg %(path,))
        sys.exit(1)


# The state directory comes from the command line, then the config file,
# then the built-in default.
if options.statedir is not None:
    state_dir = options.statedir
    _ensure_directory(state_dir, \
        "State directory (%s) is not a directory\n", \
        "Couldn't create statedir %s\n")
elif scp.has_option("general", "state_dir"):
    # use the configured directory whether or not it already exists
    state_dir = scp.get("general", "state_dir")
    _ensure_directory(state_dir, \
        "State directory (%s) is not a directory\n", \
        "Couldn't create state directory %s\n")
else:
    _ensure_directory(state_dir, \
        "State directory %s is not a directory\n", \
        "State directory %s could not be created\n")

if scp.has_option("general", "maildir_root"):
    maildir_root = scp.get("general", "maildir_root")

_ensure_directory(maildir_root, \
    "Maildir Root %s is not a directory\n", \
    "Couldn't create Maildir Root %s\n")

# every section other than "general" describes one feed
feeds = scp.sections()
if "general" in feeds:
    feeds.remove("general")

for section in feeds:
    # the section name is the feed URL; an optional "maildir" option
    # overrides the folder name used for it
    try:
        maildir = scp.get(section, "maildir")
    except Exception:
        # no usable maildir option: name the folder after the feed
        maildir = section

    # quote the name so it is safe to use as a single path component
    maildir = os.path.join(maildir_root, urllib.quote_plus(maildir))

    if os.path.exists(maildir) and not os.path.isdir(maildir):
        # nothing can be delivered here; carry on with the other feeds
        sys.stderr.write("Broken maildir: %s\n" %(maildir))
        continue

    if not os.path.isdir(maildir):
        try:
            os.mkdir(maildir)
        except OSError:
            sys.stderr.write("Couldn't create root maildir %s\n" %(maildir))
            sys.exit(1)

    # check that there's a new, cur and tmp directory, creating any that
    # are missing
    broken = False
    for subdir in ("new", "cur", "tmp"):
        subpath = os.path.join(maildir, subdir)
        if os.path.isdir(subpath):
            continue
        if os.path.exists(subpath):
            broken = True
            break
        try:
            os.mkdir(subpath)
        except OSError:
            sys.stderr.write( \
                "Couldn't create required maildir directories for %s\n" \
                %(section,))
            sys.exit(1)
    if broken:
        sys.stderr.write("Broken maildir: %s\n" %(maildir))
        continue

    # right - we've got the directories, we've got the section, we know the
    # url... lets play!

    parse_and_deliver(maildir, section, state_dir)