Refactor <br /> handling code so that there's no duplication
[rss2maildir.git] / rss2maildir.py
1 #!/usr/bin/python
2 # coding=utf-8
3
4 # rss2maildir.py - RSS feeds to Maildir 1 email per item
5 # Copyright (C) 2007  Brett Parker <iDunno@sommitrealweird.co.uk>
6
7 # This program is free software: you can redistribute it and/or modify
8 # it under the terms of the GNU General Public License as published by
9 # the Free Software Foundation, either version 3 of the License, or
10 # (at your option) any later version.
11
12 # This program is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 # GNU General Public License for more details.
16
17 # You should have received a copy of the GNU General Public License
18 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
19
20 import sys
21 import os
22 import stat
23 import urllib
24
25 import feedparser
26
27 from email.MIMEMultipart import MIMEMultipart
28 from email.MIMEText import MIMEText
29
30 import datetime
31 import random
32 import string
33 import textwrap
34
35 import socket
36
37 from optparse import OptionParser
38 from ConfigParser import SafeConfigParser
39
40 from base64 import b64encode
41 import md5
42
43 import cgi
44 import dbm
45
46 from HTMLParser import HTMLParser
47
# Named HTML entities that HTML2Text translates, keyed by the lower-cased
# entity name (without the surrounding "&" and ";").  Values are UTF-8
# encoded byte strings (see the coding line at the top of the file).
entities = {
    "amp": "&",
    "lt": "<",
    "gt": ">",
    "pound": "£",
    "copy": "©",
    "apos": "'",
    # "quot" is the actual HTML entity name for a double quote; "quote" is
    # kept as well so existing lookups keep working.
    "quot": "\"",
    "quote": "\"",
    "nbsp": " ",
    }
58
class HTML2Text(HTMLParser):
    """Render a small subset of HTML as wrapped plain text.

    Feed markup with feed() and collect the result with gettext().
    Handled tags: h1-h6 (underlined with "=", "-" or "~"), p, br,
    blockquote ("> " prefixed), pre, and ul/li (" * " bullets).  All
    other tags are ignored and only their text content is kept.

    Every internal buffer holds unicode text; byte strings handed to the
    handlers are decoded as UTF-8.
    """

    # The text type, taken from a literal so the isinstance check in
    # _to_unicode() does not depend on the name of the builtin.
    _text_type = type(u'')

    def __init__(self):
        self.inheadingone = False
        self.inheadingtwo = False
        self.inotherheading = False
        # Text seen before any block level tag is treated as a paragraph.
        self.inparagraph = True
        self.inblockquote = False
        self.inlink = False
        self.text = u''
        self.currentparagraph = u''
        self.headingtext = u''
        self.blockquote = u''
        self.inpre = False
        self.inul = False
        self.initem = False
        self.item = u''
        HTMLParser.__init__(self)

    # -- helpers ---------------------------------------------------------

    def _to_unicode(self, value):
        """Return value as unicode, decoding byte strings as UTF-8."""
        if isinstance(value, self._text_type):
            return value
        return value.decode("utf-8")

    def _append(self, fragment, verbatim=False):
        """Add fragment to the buffer of the element currently open.

        Unless verbatim is set, the fragment is stripped and followed by
        a single space (list items and <pre> text are always kept as they
        are).  Entities are added verbatim.

        NOTE: because data chunks are stripped, an entity is glued to the
        chunk that follows it ("a &amp; b" renders as "a &b").
        """
        if verbatim:
            padded = fragment
        else:
            padded = fragment.strip() + u' '

        if self.inheadingone or self.inheadingtwo or self.inotherheading:
            self.headingtext = self.headingtext + padded
        elif self.inblockquote:
            self.blockquote = self.blockquote + padded
        elif self.inparagraph:
            self.currentparagraph = self.currentparagraph + padded
        elif self.inul and self.initem:
            self.item = self.item + fragment
        elif self.inpre:
            self.text = self.text + fragment
        else:
            self.text = self.text + padded

    def _flush_paragraph(self, terminator=u''):
        """Wrap the pending paragraph to 70 columns and emit it."""
        self.text = self.text \
            + u'\n'.join(textwrap.wrap(self.currentparagraph, 70)) \
            + terminator
        self.currentparagraph = u''

    def _flush_blockquote(self):
        """Emit the pending blockquote, each line prefixed with "> "."""
        lines = [a.strip() for a in textwrap.wrap(self.blockquote, 68)]
        self.text = self.text \
            + u'\n> ' \
            + u'\n> '.join(lines) \
            + u'\n'
        self.blockquote = u''

    def _flush_item(self):
        """Emit the pending list item as a " * " bullet."""
        lines = [a.strip() for a in textwrap.wrap(self.item, 67)]
        self.text = self.text \
            + u' * ' \
            + u'\n   '.join(lines) \
            + u'\n'
        self.item = u''

    def _flush_heading(self, underline):
        """Emit the pending heading underlined with the given character."""
        # The underline is sized in characters of the stripped heading,
        # so multi-byte characters count once.
        self.text = self.text \
            + u'\n\n' \
            + self.headingtext \
            + u'\n' \
            + underline * len(self.headingtext.strip())
        self.headingtext = u''

    # -- HTMLParser callbacks --------------------------------------------

    def handle_starttag(self, tag, attrs):
        tag = tag.lower()
        if tag == "h1":
            self.inheadingone = True
            self.inparagraph = False
        elif tag == "h2":
            self.inheadingtwo = True
            self.inparagraph = False
        elif tag in ["h3", "h4", "h5", "h6"]:
            self.inotherheading = True
            self.inparagraph = False
        elif tag == "a":
            self.inlink = True
        elif tag == "br":
            self.handle_br()
        elif tag == "blockquote":
            self.inblockquote = True
            self.text = self.text + u'\n'
        elif tag == "p":
            if self.text != u'':
                self.text = self.text + u'\n\n'
            if self.inparagraph:
                # an unclosed paragraph is still pending, emit it first
                self._flush_paragraph()
            self.currentparagraph = u''
            self.inparagraph = True
        elif tag == "pre":
            self.text = self.text + u'\n'
            self.inpre = True
            self.inparagraph = False
            self.inblockquote = False
        elif tag == "ul":
            self.item = u''
            self.inul = True
            self.text = self.text + u'\n'
        elif tag == "li" and self.inul:
            if not self.initem:
                self.initem = True
                self.item = u''
            else:
                # previous <li> was never closed, emit it now
                self._flush_item()

    def handle_startendtag(self, tag, attrs):
        if tag.lower() == "br":
            self.handle_br()

    def handle_br(self):
        """Break the line inside whichever element is currently open."""
        if self.inparagraph:
            self._flush_paragraph(u'\n')
        elif self.inblockquote:
            self._flush_blockquote()
        else:
            self.text = self.text + u'\n'

    def handle_endtag(self, tag):
        tag = tag.lower()
        if tag == "h1":
            self.inheadingone = False
            self._flush_heading(u'=')
        elif tag == "h2":
            self.inheadingtwo = False
            self._flush_heading(u'-')
        elif tag in ["h3", "h4", "h5", "h6"]:
            self.inotherheading = False
            self._flush_heading(u'~')
        elif tag == "p":
            self._flush_paragraph()
            self.inparagraph = False
        elif tag == "blockquote":
            self._flush_blockquote()
            self.inblockquote = False
        elif tag == "pre":
            self.inpre = False
        elif tag == "li":
            self.initem = False
            if self.item != u'':
                self._flush_item()
            self.item = u''
        elif tag == "ul":
            self.inul = False

    def handle_data(self, data):
        self._append(self._to_unicode(data))

    def handle_entityref(self, name):
        """Translate a named entity; unknown names are kept as "&name;"."""
        if name[:1] == "#":
            # HTMLParser hands numeric references to handle_charref();
            # kept for callers that pass them here directly.
            self.handle_charref(name[1:])
            return

        if name.lower() in entities:
            entity = self._to_unicode(entities[name.lower()])
        else:
            entity = u'&' + self._to_unicode(name) + u';'
        self._append(entity, verbatim=True)

    def handle_charref(self, name):
        """Translate a numeric reference ("163" or "xa3" style).

        References that are malformed or that unichr() rejects are kept
        in their "&#...;" form.
        """
        try:
            if name[:1] in ("x", "X"):
                codepoint = int(name[1:], 16)
            else:
                codepoint = int(name)
            character = unichr(codepoint)
        except (ValueError, OverflowError):
            character = u'&#' + self._to_unicode(name) + u';'
        self._append(character, verbatim=True)

    def gettext(self):
        """Return the text built so far, including a still open paragraph."""
        data = self.text
        if self.inparagraph:
            data = data \
                + u'\n'.join(textwrap.wrap(self.currentparagraph, 70))
        return data
253
def _random_token(length):
    """Return a string of `length` random ASCII letters and digits."""
    alphabet = string.ascii_letters + string.digits
    return "".join([random.choice(alphabet) for unused in range(length)])


def parse_and_deliver(maildir, url, statedir):
    """Fetch the feed at `url` and deliver new or changed items as mail.

    maildir  -- path of the maildir to deliver into; its "tmp" and "new"
                sub directories must already exist
    url      -- URL of the feed; also used as the To address name and as
                part of the key in the seen database
    statedir -- directory holding the "seen" dbm database

    For every item, the database records the message ids already sent,
    the date used and an md5 of the content.  Items whose content md5 is
    unchanged are skipped; changed items are delivered again with a
    References header pointing at the earlier message(s).
    """
    fp = feedparser.parse(url)
    db = dbm.open(os.path.join(statedir, "seen"), "c")
    try:
        for item in fp["items"]:
            # have we seen it before?
            # need to work out what the content is first...

            if item.has_key("content"):
                content = item["content"][0]["value"]
            else:
                content = item["summary"]

            md5sum = md5.md5(content.encode("utf-8")).hexdigest()
            dbkey = url + "|" + item["link"]

            prevmessageid = None

            if db.has_key(dbkey):
                data = cgi.parse_qs(db[dbkey])
                if data.has_key("message-id"):
                    prevmessageid = data["message-id"][0]
                if data["contentmd5"][0] == md5sum:
                    continue

            try:
                author = item["author"]
            except KeyError:
                author = url

            # Not every feed dates its items; use the delivery time (UTC)
            # when there is no parsed date.
            try:
                updated = item["updated_parsed"]
            except KeyError:
                updated = None
            if updated:
                stamp = datetime.datetime(*updated[0:6])
            else:
                stamp = datetime.datetime.utcnow()
            createddate = stamp.strftime("%a, %e %b %Y %T -0000")

            # create a basic email message
            msg = MIMEMultipart("alternative")
            messageid = "<" \
                + datetime.datetime.now().strftime("%Y%m%d%H%M") \
                + "." + _random_token(6) \
                + "@" + socket.gethostname() + ">"
            msg.add_header("Message-ID", messageid)
            msg.set_unixfrom("\"%s\" <rss2maildir@localhost>" %(url))
            msg.add_header("From", "\"%s\" <rss2maildir@localhost>" %(author))
            msg.add_header("To", "\"%s\" <rss2maildir@localhost>" %(url))
            if prevmessageid:
                msg.add_header("References", prevmessageid)
            msg.add_header("Date", createddate)
            msg.add_header("Subject", item["title"])
            msg.set_default_type("text/plain")

            # plain text rendering first, the original HTML second
            htmlpart = MIMEText(content.encode("utf-8"), "html", "utf-8")
            textparser = HTML2Text()
            textparser.feed(content.encode("utf-8"))
            textcontent = textparser.gettext()
            textpart = MIMEText(textcontent.encode("utf-8"), "plain", "utf-8")
            msg.attach(textpart)
            msg.attach(htmlpart)

            # start by working out the filename we should be writing to, we
            # do this following the normal maildir style rules
            fname = str(os.getpid()) \
                + "." + socket.gethostname() \
                + "." + _random_token(10) \
                + "." + datetime.datetime.now().strftime('%s')
            fn = os.path.join(maildir, "tmp", fname)
            fh = open(fn, "w")
            try:
                fh.write(msg.as_string())
            finally:
                fh.close()
            # now move it in to the new directory
            newfn = os.path.join(maildir, "new", fname)
            os.link(fn, newfn)
            os.unlink(fn)

            # now add to the database about the item
            if prevmessageid:
                messageid = prevmessageid + " " + messageid
            data = urllib.urlencode((
                ("message-id", messageid),
                ("created", createddate),
                ("contentmd5", md5sum),
                ))
            db[dbkey] = data
    finally:
        # always release the database, even when an item fails
        db.close()
343
# Command line handling comes first; both switches are optional and end
# up as options.conf and options.statedir (None when not given).

oparser = OptionParser()
oparser.add_option("-c", "--conf", dest="conf", help="location of config file")
oparser.add_option(
    "-s", "--statedir", dest="statedir",
    help="location of directory to store state in")

options, args = oparser.parse_args()
357
# Work out which config file to use: the one named on the command line,
# else ~/.rss2maildir.conf, else /etc/rss2maildir.conf.  Exit with
# status 2 when none of them can be found.

configfile = None

if options.conf is not None:
    # does the file exist?
    if not os.path.exists(options.conf):
        # should exit here as the specified file doesn't exist
        sys.stderr.write( \
            "Config file %s does not exist. Exiting.\n" %(options.conf,))
        sys.exit(2)
    configfile = options.conf
else:
    # check through the default locations
    candidates = []
    if "HOME" in os.environ:
        candidates.append("%s/.rss2maildir.conf" %(os.environ["HOME"],))
    candidates.append("/etc/rss2maildir.conf")

    for candidate in candidates:
        if os.path.exists(candidate):
            configfile = candidate
            break

    if configfile is None:
        sys.stderr.write("No config file found. Exiting.\n")
        sys.exit(2)
384
# Right - if we've got this far, we've got a config file, now for the hard
# bits...

scp = SafeConfigParser()
scp.read(configfile)

maildir_root = "RSSMaildir"
state_dir = "state"


def _require_directory(path, not_a_directory_message, create_failed_message):
    """Make sure `path` is a directory, creating it when it is missing.

    Both messages are format strings taking the path.  The matching one
    is written to stderr and the program exits with status 1 when `path`
    exists but is not a directory, or when it cannot be created.
    """
    try:
        mode = os.stat(path)[stat.ST_MODE]
    except OSError:
        # nothing there (or not reachable): try to make the directory
        try:
            os.mkdir(path)
        except OSError:
            sys.stderr.write(create_failed_message %(path,))
            sys.exit(1)
    else:
        if not stat.S_ISDIR(mode):
            sys.stderr.write(not_a_directory_message %(path,))
            sys.exit(1)


# The state directory comes from the command line, else from the config
# file, else the default above is used.
if options.statedir is not None:
    state_dir = options.statedir
    _require_directory(
        state_dir,
        "State directory (%s) is not a directory\n",
        "Couldn't create statedir %s\n")
elif scp.has_option("general", "state_dir"):
    # the configured path is the one that gets checked and used, whether
    # or not it already exists
    state_dir = scp.get("general", "state_dir")
    _require_directory(
        state_dir,
        "State directory (%s) is not a directory\n",
        "Couldn't create state directory %s\n")
else:
    _require_directory(
        state_dir,
        "State directory %s is not a directory\n",
        "State directory %s could not be created\n")

if scp.has_option("general", "maildir_root"):
    maildir_root = scp.get("general", "maildir_root")

_require_directory(
    maildir_root,
    "Maildir Root %s is not a directory\n",
    "Couldn't create Maildir Root %s\n")
457
# Every config section other than "general" names a feed; the section
# name is the feed URL.
feeds = scp.sections()
if "general" in feeds:
    feeds.remove("general")


def _check_maildir_subdir(maildir, name):
    """Check one of the "cur"/"tmp"/"new" directories of `maildir`.

    A missing directory is created (OSError from os.mkdir propagates to
    the caller); something that exists but is not a directory is
    reported on stderr as a broken maildir.
    """
    path = os.path.join(maildir, name)
    try:
        mode = os.stat(path)[stat.ST_MODE]
    except OSError:
        os.mkdir(path)
        return
    if not stat.S_ISDIR(mode):
        sys.stderr.write("Broken maildir: %s\n" %(maildir,))


for section in feeds:
    # The maildir name defaults to the section name when the section has
    # no usable "maildir" option.
    try:
        maildir = scp.get(section, "maildir")
    except Exception:
        maildir = section

    # urlencode() quotes the name so it is safe as a single path component
    maildir = urllib.urlencode(((section, maildir),)).split("=")[1]
    maildir = os.path.join(maildir_root, maildir)

    try:
        exists = os.stat(maildir)
    except OSError:
        # no maildir yet, create it along with its sub directories
        try:
            os.mkdir(maildir)
        except OSError:
            sys.stderr.write("Couldn't create root maildir %s\n" %(maildir,))
            sys.exit(1)
        try:
            os.mkdir(os.path.join(maildir, "new"))
            os.mkdir(os.path.join(maildir, "cur"))
            os.mkdir(os.path.join(maildir, "tmp"))
        except OSError:
            sys.stderr.write( \
                "Couldn't create required maildir directories for %s\n" \
                %(section,))
            sys.exit(1)
    else:
        if not stat.S_ISDIR(exists[stat.ST_MODE]):
            # nothing can be delivered below a non-directory, skip the feed
            sys.stderr.write("Broken maildir: %s\n" %(maildir,))
            continue
        # check if there's a new, cur and tmp directory
        try:
            for subdir in ("cur", "tmp", "new"):
                _check_maildir_subdir(maildir, subdir)
        except OSError:
            sys.stderr.write( \
                "Couldn't create required maildir directories for %s\n" \
                %(section,))
            sys.exit(1)

    # right - we've got the directories, we've got the section, we know the
    # url... lets play!

    parse_and_deliver(maildir, section, state_dir)
517
518     parse_and_deliver(maildir, section, state_dir)