* updated posts are now "threaded" - adds a References header with the previous
[rss2maildir.git] / rss2maildir.py
1 #!/usr/bin/python
2 # coding=utf-8
3
4 # rss2maildir.py - RSS feeds to Maildir 1 email per item
5 # Copyright (C) 2007  Brett Parker <iDunno@sommitrealweird.co.uk>
6
7 # This program is free software: you can redistribute it and/or modify
8 # it under the terms of the GNU General Public License as published by
9 # the Free Software Foundation, either version 3 of the License, or
10 # (at your option) any later version.
11
12 # This program is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 # GNU General Public License for more details.
16
17 # You should have received a copy of the GNU General Public License
18 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
19
20 import sys
21 import os
22 import stat
23 import urllib
24
25 import feedparser
26
27 from email.MIMEMultipart import MIMEMultipart
28 from email.MIMEText import MIMEText
29
30 import datetime
31 import random
32 import string
33 import textwrap
34
35 import socket
36
37 from optparse import OptionParser
38 from ConfigParser import SafeConfigParser
39
40 from base64 import b64encode
41 import md5
42
43 import cgi
44 import dbm
45
46 from HTMLParser import HTMLParser
47
# Named HTML entities that HTML2Text.handle_entityref replaces with literal
# characters.  Keys are looked up lower-cased.  Values are plain string
# literals in this file's UTF-8 source encoding; the parser decodes them.
entities = {
    "amp": "&",
    "lt": "<",
    "gt": ">",
    "pound": "£",
    "copy": "©",
    "apos": "'",
    # The HTML entity for a double quote is "quot"; without this entry
    # "&quot;" was passed through untranslated.
    "quot": "\"",
    # Not a real HTML entity name; kept so existing behaviour is unchanged.
    "quote": "\"",
    "nbsp": " ",
    }
58
class HTML2Text(HTMLParser):
    """Convert an HTML fragment into wrapped plain text.

    Feed markup with feed() and read the result with gettext().  Supported
    constructs:

    * h1 / h2 / h3-h6 headings, underlined with '=', '-' and '~'
    * paragraphs, wrapped at 70 columns
    * blockquotes, wrapped at 68 columns and prefixed with '> '
    * unordered list items, wrapped at 67 columns and prefixed with ' * '
    * pre blocks, copied through unwrapped
    * br line breaks

    Text is accumulated as unicode.  Byte strings handed to the handlers are
    decoded as UTF-8.  The parser starts in "paragraph" mode so that text
    appearing before any tag is still collected.
    """

    def __init__(self):
        self.inheadingone = False
        self.inheadingtwo = False
        self.inotherheading = False
        self.inparagraph = True
        self.inblockquote = False
        self.inlink = False
        self.text = u''
        self.currentparagraph = u''
        self.headingtext = u''
        self.blockquote = u''
        self.inpre = False
        self.inul = False
        self.initem = False
        self.item = u''
        HTMLParser.__init__(self)

    # -- helpers ----------------------------------------------------------

    def _to_unicode(self, data):
        """Return data as unicode, decoding byte strings as UTF-8."""
        if isinstance(data, type(u'')):
            return data
        return data.decode("utf-8")

    def _inheading(self):
        """Return True while inside any heading element."""
        return self.inheadingone or self.inheadingtwo or self.inotherheading

    def _flush_paragraph(self, trailer=u''):
        """Append the wrapped pending paragraph (plus trailer) to the text."""
        self.text = self.text \
            + u'\n'.join(textwrap.wrap(self.currentparagraph, 70)) \
            + trailer
        self.currentparagraph = u''

    def _close_paragraph(self):
        """Flush any pending paragraph text and leave paragraph mode."""
        if self.inparagraph:
            self._flush_paragraph()
        self.inparagraph = False

    def _flush_blockquote(self):
        """Append the pending blockquote text as '> ' prefixed lines."""
        lines = [a.strip() for a in textwrap.wrap(self.blockquote, 68)]
        self.text = self.text + u'\n> ' + u'\n> '.join(lines) + u'\n'
        self.blockquote = u''

    def _flush_item(self):
        """Append the pending list item as a ' * ' bullet."""
        lines = [a.strip() for a in textwrap.wrap(self.item, 67)]
        self.text = self.text + u' * ' + u'\n   '.join(lines) + u'\n'
        self.item = u''

    def _emit_heading(self, underline):
        """Append the pending heading, underlined with the given character."""
        # The underline length is counted in characters, not encoded bytes,
        # and everything stays unicode so non-ASCII headings do not raise.
        self.text = self.text \
            + u'\n\n' \
            + self.headingtext \
            + u'\n' \
            + underline * len(self.headingtext.strip())
        self.headingtext = u''

    def _linebreak(self):
        """Handle a br tag, whether written as <br> or <br/>."""
        if self.inparagraph:
            self._flush_paragraph(u'\n')
        elif self.inblockquote:
            self._flush_blockquote()
        else:
            self.text = self.text + u'\n'

    def _append_inline(self, fragment):
        """Append a character fragment to whichever buffer is collecting.

        Uses the same priority as handle_data so that entity and character
        references end up next to the text that surrounds them.
        """
        if self._inheading():
            self.headingtext = self.headingtext + fragment
        elif self.inblockquote:
            self.blockquote = self.blockquote + fragment
        elif self.inparagraph:
            self.currentparagraph = self.currentparagraph + fragment
        elif self.inul and self.initem:
            self.item = self.item + fragment
        else:
            self.text = self.text + fragment

    # -- HTMLParser callbacks ---------------------------------------------

    def handle_starttag(self, tag, attrs):
        tag = tag.lower()
        if tag == "h1":
            # flush first so text collected before the heading is not lost
            self._close_paragraph()
            self.inheadingone = True
        elif tag == "h2":
            self._close_paragraph()
            self.inheadingtwo = True
        elif tag in ["h3", "h4", "h5", "h6"]:
            self._close_paragraph()
            self.inotherheading = True
        elif tag == "a":
            self.inlink = True
        elif tag == "br":
            self._linebreak()
        elif tag == "blockquote":
            self.inblockquote = True
            self.text = self.text + u'\n'
        elif tag == "p":
            # Emit the previous (unclosed) paragraph before the separator,
            # otherwise consecutive paragraphs run together.
            if self.inparagraph:
                self._flush_paragraph()
            if self.text != u'':
                self.text = self.text + u'\n\n'
            self.currentparagraph = u''
            self.inparagraph = True
        elif tag == "pre":
            self._close_paragraph()
            self.text = self.text + u'\n'
            self.inpre = True
            self.inblockquote = False
        elif tag == "ul":
            # Paragraph mode takes priority in handle_data, so it has to be
            # closed here or list item text would never reach self.item.
            self._close_paragraph()
            self.item = u''
            self.inul = True
            self.text = self.text + u'\n'
        elif tag == "li" and self.inul:
            if self.initem:
                # previous <li> was never closed: emit it now
                self._flush_item()
            else:
                self.initem = True
                self.item = u''

    def handle_startendtag(self, tag, attrs):
        # only <br/> is meaningful as a self-closing tag here
        if tag.lower() == "br":
            self._linebreak()

    def handle_endtag(self, tag):
        tag = tag.lower()
        if tag == "h1":
            self.inheadingone = False
            self._emit_heading(u'=')
        elif tag == "h2":
            self.inheadingtwo = False
            self._emit_heading(u'-')
        elif tag in ["h3", "h4", "h5", "h6"]:
            self.inotherheading = False
            self._emit_heading(u'~')
        elif tag == "a":
            self.inlink = False
        elif tag == "p":
            self._flush_paragraph()
            self.inparagraph = False
        elif tag == "blockquote":
            self._flush_blockquote()
            self.inblockquote = False
        elif tag == "pre":
            self.inpre = False
        elif tag == "li":
            self.initem = False
            if self.item != u'':
                self._flush_item()
            self.item = u''
        elif tag == "ul":
            self.inul = False

    def handle_data(self, data):
        data = self._to_unicode(data)
        if self._inheading():
            self.headingtext = self.headingtext + data.strip() + u' '
        elif self.inblockquote:
            self.blockquote = self.blockquote + data.strip() + u' '
        elif self.inparagraph:
            self.currentparagraph = self.currentparagraph \
                + data.strip() \
                + u' '
        elif self.inul and self.initem:
            self.item = self.item + data
        elif self.inpre:
            self.text = self.text + data
        else:
            self.text = self.text + data.strip() + u' '

    def handle_entityref(self, name):
        """Translate a named entity using the module-level entities table.

        Unknown names are passed through as the literal '&name;'.
        """
        if name[:1] == "#":
            # numeric references normally arrive via handle_charref
            self.handle_charref(name[1:])
            return
        key = name.lower()
        if key in entities:
            entity = self._to_unicode(entities[key])
        else:
            entity = u'&' + self._to_unicode(name) + u';'
        self._append_inline(entity)

    def handle_charref(self, name):
        """Translate a numeric reference such as '163' or 'xa3'.

        References that are malformed or out of range are passed through
        as the literal '&#name;'.
        """
        try:
            if name[:1] in ("x", "X"):
                code = int(name[1:], 16)
            else:
                code = int(name)
            try:
                char = unichr(code)
            except NameError:
                # interpreters without unichr have a unicode-aware chr
                char = chr(code)
        except (ValueError, OverflowError):
            char = u'&#' + self._to_unicode(name) + u';'
        self._append_inline(char)

    def gettext(self):
        """Return the text so far, including any still-open paragraph."""
        data = self.text
        if self.inparagraph:
            data = data + u'\n'.join(textwrap.wrap(self.currentparagraph, 70))
        return data
266
def _random_token(length):
    """Return a string of `length` random ASCII letters and digits."""
    alphabet = string.ascii_letters + string.digits
    return "".join([random.choice(alphabet) for _ in range(length)])


def parse_and_deliver(maildir, url, statedir):
    """Fetch the feed at `url` and deliver new or changed items as mail.

    maildir  -- path of an existing Maildir (with tmp/ and new/) to deliver to
    url      -- feed URL; also used in the To header and in the state keys
    statedir -- directory holding the "seen" dbm database

    For every item the content (or, failing that, the summary) is hashed.
    An item whose hash matches the one recorded under "url|link" in the
    state database is skipped.  Otherwise a multipart/alternative message
    (plain text rendered by HTML2Text, plus the original HTML) is written
    to tmp/ and then linked into new/.  When an earlier version of the item
    was delivered, its Message-ID chain is placed in a References header so
    that updates thread under the original.
    """
    fp = feedparser.parse(url)
    db = dbm.open(os.path.join(statedir, "seen"), "c")
    try:
        for item in fp["items"]:
            # have we seen it before?
            # need to work out what the content is first...

            if item.has_key("content"):
                content = item["content"][0]["value"]
            else:
                content = item["summary"]

            md5sum = md5.md5(content.encode("utf-8")).hexdigest()
            dbkey = url + "|" + item["link"]

            prevmessageid = None

            if db.has_key(dbkey):
                data = cgi.parse_qs(db[dbkey])
                if "message-id" in data:
                    prevmessageid = data["message-id"][0]
                # a record without a checksum is treated as "changed"
                if data.get("contentmd5", [None])[0] == md5sum:
                    continue

            try:
                author = item["author"]
            except (KeyError, AttributeError):
                author = url

            # Not every feed supplies a parseable date; fall back to the
            # current UTC time rather than aborting the whole run.
            try:
                updated = datetime.datetime(*item["updated_parsed"][0:6])
            except (KeyError, TypeError):
                updated = datetime.datetime.utcnow()

            # create a basic email message
            msg = MIMEMultipart("alternative")
            messageid = "<" \
                + datetime.datetime.now().strftime("%Y%m%d%H%M") \
                + "." \
                + _random_token(6) \
                + "@" + socket.gethostname() + ">"
            msg.add_header("Message-ID", messageid)
            msg.set_unixfrom("\"%s\" <rss2maildir@localhost>" %(url))
            msg.add_header("From", "\"%s\" <rss2maildir@localhost>" %(author))
            msg.add_header("To", "\"%s\" <rss2maildir@localhost>" %(url))
            if prevmessageid:
                msg.add_header("References", prevmessageid)
            createddate = updated.strftime("%a, %e %b %Y %T -0000")
            msg.add_header("Date", createddate)
            msg.add_header("Subject", item["title"])
            msg.set_default_type("text/plain")

            htmlpart = MIMEText(content.encode("utf-8"), "html", "utf-8")
            textparser = HTML2Text()
            textparser.feed(content.encode("utf-8"))
            textcontent = textparser.gettext()
            textpart = MIMEText(textcontent.encode("utf-8"), "plain", "utf-8")
            msg.attach(textpart)
            msg.attach(htmlpart)

            # start by working out the filename we should be writing to, we
            # do this following the normal maildir style rules
            fname = str(os.getpid()) \
                + "." + socket.gethostname() \
                + "." + _random_token(10) \
                + "." \
                + datetime.datetime.now().strftime('%s')
            fn = os.path.join(maildir, "tmp", fname)
            fh = open(fn, "w")
            try:
                fh.write(msg.as_string())
            finally:
                # close the handle even if serialising the message fails
                fh.close()
            # now move it in to the new directory
            newfn = os.path.join(maildir, "new", fname)
            os.link(fn, newfn)
            os.unlink(fn)

            # now add to the database about the item; the stored message-id
            # is the whole chain so later updates reference every version
            if prevmessageid:
                messageid = prevmessageid + " " + messageid
            data = urllib.urlencode((
                ("message-id", messageid),
                ("created", createddate),
                ("contentmd5", md5sum),
                ))
            db[dbkey] = data
    finally:
        # always release the state database, even when delivery fails
        db.close()
356
# first off, parse the command line arguments

oparser = OptionParser()
oparser.add_option(
    "-c", "--conf", dest="conf",
    help="location of config file"
    )
oparser.add_option(
    "-s", "--statedir", dest="statedir",
    help="location of directory to store state in"
    )

(options, args) = oparser.parse_args()

# Work out which config file to use.  A file named with --conf must exist;
# otherwise $HOME/.rss2maildir.conf is tried, then /etc/rss2maildir.conf.
# Exit status 2 means no usable config file was found.

configfile = None

if options.conf is not None:
    if not os.path.exists(options.conf):
        # the specified file doesn't exist, so there is nothing to do
        sys.stderr.write( \
            "Config file %s does not exist. Exiting.\n" %(options.conf,))
        sys.exit(2)
    configfile = options.conf
else:
    # check through the default locations
    candidates = []
    if "HOME" in os.environ:
        candidates.append("%s/.rss2maildir.conf" %(os.environ["HOME"],))
    candidates.append("/etc/rss2maildir.conf")

    for candidate in candidates:
        if os.path.exists(candidate):
            configfile = candidate
            break

    if configfile is None:
        sys.stderr.write("No config file found. Exiting.\n")
        sys.exit(2)

# Right - if we've got this far, we've got a config file, now for the hard
# bits...

scp = SafeConfigParser()
scp.read(configfile)
403
maildir_root = "RSSMaildir"
state_dir = "state"

# Pick the state directory: --statedir wins, then the "state_dir" option in
# the [general] section of the config file, then the default above.  The
# messages differ per source, so they are chosen alongside the path.
if options.statedir is not None:
    state_dir = options.statedir
    state_notdir_msg = "State directory (%s) is not a directory\n"
    state_create_msg = "Couldn't create statedir %s\n"
elif scp.has_option("general", "state_dir"):
    # use the configured path for both the check and the creation; checking
    # the default path here meant the configured value could be ignored
    state_dir = scp.get("general", "state_dir")
    state_notdir_msg = "State directory (%s) is not a directory\n"
    state_create_msg = "Couldn't create state directory %s\n"
else:
    state_notdir_msg = "State directory %s is not a directory\n"
    state_create_msg = "State directory %s could not be created\n"

if os.path.exists(state_dir):
    if not os.path.isdir(state_dir):
        sys.stderr.write(state_notdir_msg %(state_dir,))
        sys.exit(1)
else:
    # try to make the directory
    try:
        os.mkdir(state_dir)
    except OSError:
        sys.stderr.write(state_create_msg %(state_dir,))
        sys.exit(1)
453
# The [general] section of the config file may override the maildir root.
if scp.has_option("general", "maildir_root"):
    maildir_root = scp.get("general", "maildir_root")

# The root must be a directory; create it when it does not exist yet.  The
# existence test is kept outside any try block so that the exit below is
# not intercepted and followed by a second, misleading message.
if os.path.exists(maildir_root):
    if not os.path.isdir(maildir_root):
        sys.stderr.write( \
            "Maildir Root %s is not a directory\n" \
            %(maildir_root))
        sys.exit(1)
else:
    try:
        os.mkdir(maildir_root)
    except OSError:
        sys.stderr.write("Couldn't create Maildir Root %s\n" %(maildir_root))
        sys.exit(1)
470
# Every config section other than [general] names a feed URL.
feeds = scp.sections()
try:
    feeds.remove("general")
except ValueError:
    pass

for section in feeds:
    # The maildir name defaults to the section name (the feed URL) when the
    # section has no usable "maildir" option.
    maildir = None
    try:
        maildir = scp.get(section, "maildir")
    except Exception:
        maildir = section

    # quote the name so that it is safe to use as a single path component
    maildir = urllib.quote_plus(str(maildir))
    maildir = os.path.join(maildir_root, maildir)

    if os.path.exists(maildir):
        if os.path.isdir(maildir):
            # check if there's a new, cur and tmp directory; create the
            # missing ones and complain about anything that is not a
            # directory
            try:
                for subdir in ("cur", "tmp", "new"):
                    subpath = os.path.join(maildir, subdir)
                    if not os.path.exists(subpath):
                        os.mkdir(subpath)
                    elif not os.path.isdir(subpath):
                        sys.stderr.write("Broken maildir: %s\n" %(maildir))
            except OSError:
                sys.stderr.write( \
                    "Couldn't create required maildir directories for %s\n" \
                    %(section,))
                sys.exit(1)
        else:
            sys.stderr.write("Broken maildir: %s\n" %(maildir))
    else:
        try:
            os.mkdir(maildir)
        except OSError:
            sys.stderr.write("Couldn't create root maildir %s\n" %(maildir))
            sys.exit(1)
        try:
            os.mkdir(os.path.join(maildir, "new"))
            os.mkdir(os.path.join(maildir, "cur"))
            os.mkdir(os.path.join(maildir, "tmp"))
        except OSError:
            sys.stderr.write( \
                "Couldn't create required maildir directories for %s\n" \
                %(section,))
            sys.exit(1)

    # right - we've got the directories, we've got the section, we know the
    # url... lets play!

    parse_and_deliver(maildir, section, state_dir)