Further reformatting to < 80 chars per line
[rss2maildir.git] / rss2maildir.py
1 #!/usr/bin/python
2 # coding=utf-8
3
4 # rss2maildir.py - RSS feeds to Maildir 1 email per item
5 # Copyright (C) 2007  Brett Parker <iDunno@sommitrealweird.co.uk>
6
7 # This program is free software: you can redistribute it and/or modify
8 # it under the terms of the GNU General Public License as published by
9 # the Free Software Foundation, either version 3 of the License, or
10 # (at your option) any later version.
11
12 # This program is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 # GNU General Public License for more details.
16
17 # You should have received a copy of the GNU General Public License
18 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
19
20 import sys
21 import os
22 import stat
23 import urllib
24
25 import feedparser
26
27 from email.MIMEMultipart import MIMEMultipart
28 from email.MIMEText import MIMEText
29
30 import datetime
31 import random
32 import string
33 import textwrap
34
35 import socket
36
37 from optparse import OptionParser
38 from ConfigParser import SafeConfigParser
39
40 from base64 import b64encode
41 import md5
42
43 import cgi
44 import dbm
45
46 from HTMLParser import HTMLParser
47
# Named HTML entities that HTML2Text translates to literal characters.
# Keys are lower-case entity names without the surrounding "&" and ";".
# Values are plain (byte) strings in this file's UTF-8 source encoding;
# HTML2Text decodes them before use.
entities = {
    "amp": "&",
    "lt": "<",
    "gt": ">",
    "pound": "£",
    "copy": "©",
    "apos": "'",
    # "quot" is the real HTML entity name for a double quote.
    "quot": "\"",
    # Historical misspelling, kept so existing lookups keep working.
    "quote": "\"",
    "nbsp": " ",
    }
58
class HTML2Text(HTMLParser):
    """Render a small subset of HTML as wrapped plain text.

    Feed markup with feed() and read the result with gettext().

    Handled elements: h1 (underlined with "="), h2 ("-"), h3-h6 ("~"),
    p (wrapped at 70 columns), blockquote (wrapped at 68 columns, each
    line prefixed with "> "), br, pre (text copied verbatim) and ul/li
    (items prefixed with " * " and wrapped at 67 columns).  Other tags
    are ignored and only their text content is kept.

    Named entities are looked up in the module-level ``entities`` table;
    numeric character references are converted to the character they
    name.

    The parser starts out in "paragraph" state, so text that appears
    before any block-level tag is treated as a paragraph.

    NOTE: each chunk of character data is stripped and followed by a
    single space, so whitespace around inline markup and entities is not
    preserved exactly.
    """

    def __init__(self):
        self.inheadingone = False
        self.inheadingtwo = False
        self.inotherheading = False
        self.inparagraph = True
        self.inblockquote = False
        self.inlink = False
        self.text = u''
        self.currentparagraph = u''
        self.headingtext = u''
        self.blockquote = u''
        self.inpre = False
        self.inul = False
        self.initem = False
        self.item = u''
        HTMLParser.__init__(self)

    @staticmethod
    def _to_unicode(value):
        """Return value as unicode text, decoding byte strings as UTF-8."""
        # type(u'') is the unicode text type; anything else is treated as
        # UTF-8 encoded bytes.  Text that is already unicode must not be
        # decoded a second time.
        if isinstance(value, type(u'')):
            return value
        return value.decode("utf-8")

    def _in_heading(self):
        """Return True while inside any h1-h6 element."""
        return self.inheadingone or self.inheadingtwo \
            or self.inotherheading

    def _wrapped_paragraph(self):
        """Return the pending paragraph text wrapped at 70 columns."""
        return u'\n'.join(textwrap.wrap(self.currentparagraph, 70))

    def _quoted_lines(self):
        """Return the pending blockquote text as "> " prefixed lines."""
        lines = [a.strip() for a in textwrap.wrap(self.blockquote, 68)]
        return u'\n> ' + u'\n> '.join(lines) + u'\n'

    def _item_lines(self):
        """Return the pending list item as a " * " bullet."""
        lines = [a.strip() for a in textwrap.wrap(self.item, 67)]
        return u' * ' + u'\n   '.join(lines) + u'\n'

    def _line_break(self):
        """Handle <br> and <br/>: flush whatever is pending, end the line."""
        if self.inparagraph:
            self.text = self.text + self._wrapped_paragraph() + u'\n'
            self.currentparagraph = u''
        elif self.inblockquote:
            self.text = self.text + self._quoted_lines()
            self.blockquote = u''
        else:
            self.text = self.text + u'\n'

    def _end_heading(self, underline):
        """Emit the collected heading text followed by an underline row."""
        # Everything stays unicode here: the underline is sized by
        # characters (not encoded bytes) and non-ASCII headings do not
        # trigger an implicit ASCII decode.
        self.text = self.text \
            + u'\n\n' \
            + self.headingtext \
            + u'\n' \
            + underline * len(self.headingtext.strip())
        self.headingtext = u''

    def _append_inline(self, piece):
        """Append piece, unmodified, to the buffer currently being filled.

        Uses the same precedence as handle_data so that entities end up
        next to the text that surrounds them.
        """
        if self._in_heading():
            self.headingtext = self.headingtext + piece
        elif self.inblockquote:
            self.blockquote = self.blockquote + piece
        elif self.inparagraph:
            self.currentparagraph = self.currentparagraph + piece
        elif self.inul and self.initem:
            self.item = self.item + piece
        else:
            self.text = self.text + piece

    def handle_starttag(self, tag, attrs):
        tag = tag.lower()
        if tag == "h1":
            self.inheadingone = True
            self.inparagraph = False
        elif tag == "h2":
            self.inheadingtwo = True
            self.inparagraph = False
        elif tag in ["h3", "h4", "h5", "h6"]:
            self.inotherheading = True
            self.inparagraph = False
        elif tag == "a":
            self.inlink = True
        elif tag == "br":
            self._line_break()
        elif tag == "blockquote":
            self.inblockquote = True
            self.text = self.text + u'\n'
        elif tag == "p":
            if self.text != "":
                self.text = self.text + u'\n\n'
            # A <p> without a preceding </p> implicitly ends the
            # paragraph that was being collected.
            if self.inparagraph:
                self.text = self.text + self._wrapped_paragraph()
            self.currentparagraph = u''
            self.inparagraph = True
        elif tag == "pre":
            self.text = self.text + u'\n'
            self.inpre = True
            self.inparagraph = False
            self.inblockquote = False
        elif tag == "ul":
            self.item = u''
            self.inul = True
            self.text = self.text + u'\n'
        elif tag == "li" and self.inul:
            if not self.initem:
                self.initem = True
                self.item = u''
            else:
                # <li> without a closing </li>: flush the previous item.
                self.text = self.text + self._item_lines()
                self.item = u''

    def handle_startendtag(self, tag, attrs):
        # Only <br/> is meaningful as a self-closing tag here.
        if tag.lower() == "br":
            self._line_break()

    def handle_endtag(self, tag):
        tag = tag.lower()
        if tag == "h1":
            self.inheadingone = False
            self._end_heading(u'=')
        elif tag == "h2":
            self.inheadingtwo = False
            self._end_heading(u'-')
        elif tag in ["h3", "h4", "h5", "h6"]:
            self.inotherheading = False
            self._end_heading(u'~')
        elif tag == "p":
            self.text = self.text + self._wrapped_paragraph()
            self.inparagraph = False
            self.currentparagraph = u''
        elif tag == "blockquote":
            self.text = self.text + self._quoted_lines()
            self.inblockquote = False
            self.blockquote = u''
        elif tag == "pre":
            self.inpre = False
        elif tag == "li":
            self.initem = False
            if self.item != "":
                self.text = self.text + self._item_lines()
            self.item = u''
        elif tag == "ul":
            self.inul = False

    def handle_data(self, data):
        data = self._to_unicode(data)
        if self._in_heading():
            self.headingtext = self.headingtext + data.strip() + u' '
        elif self.inblockquote:
            self.blockquote = self.blockquote + data.strip() + u' '
        elif self.inparagraph:
            self.currentparagraph = self.currentparagraph \
                + data.strip() \
                + u' '
        elif self.inul and self.initem:
            self.item = self.item + data
        elif self.inpre:
            # Preformatted text is copied through untouched.
            self.text = self.text + data
        else:
            self.text = self.text + data.strip() + u' '

    def handle_entityref(self, name):
        # Known names are translated; unknown ones are passed through as
        # they appeared in the markup.
        key = name.lower()
        if key in entities:
            entity = entities[key]
        else:
            entity = "&" + name + ";"
        self._append_inline(self._to_unicode(entity))

    def handle_charref(self, name):
        # name is the reference without "&#" and ";": decimal digits, or
        # "x"/"X" followed by hex digits.
        try:
            if name[:1] in ("x", "X"):
                code = int(name[1:], 16)
            else:
                code = int(name)
            char = unichr(code)
        except (ValueError, OverflowError):
            # Malformed or out-of-range reference: keep it verbatim.
            char = u'&#' + self._to_unicode(name) + u';'
        self._append_inline(char)

    def gettext(self):
        """Return the text rendered so far, including an open paragraph."""
        data = self.text
        if self.inparagraph:
            data = data + u'\n'.join(
                textwrap.wrap(self.currentparagraph, 70))
        return data
266
def parse_and_deliver(maildir, url, statedir):
    """Fetch the feed at url and deliver each new item into maildir.

    Every item becomes one multipart/alternative message (text/plain
    rendering first, original HTML second) written to maildir/tmp and
    then linked into maildir/new.

    A dbm database named "seen" inside statedir records, per
    "<url>|<item link>" key, the md5 of the item content; an item whose
    stored md5 matches its current content is skipped.

    maildir  -- path of a Maildir that already has tmp/ and new/
    url      -- feed URL; also used in the To header and the unix From
                line, and as the author when the item has none
    statedir -- directory holding the "seen" database
    """
    # Function-scope imports: nothing else in this file needs them.
    import calendar
    import time
    from email.Utils import formatdate

    fp = feedparser.parse(url)
    db = dbm.open(os.path.join(statedir, "seen"), "c")
    # try/finally so the database is closed even if an item fails.
    try:
        for item in fp["items"]:
            # have we seen it before?
            # need to work out what the content is first...

            if item.has_key("content"):
                content = item["content"][0]["value"]
            else:
                content = item["summary"]

            md5sum = md5.md5(content.encode("utf-8")).hexdigest()

            key = url + "|" + item["link"]
            if db.has_key(key):
                seen = cgi.parse_qs(db[key])
                if seen["contentmd5"][0] == md5sum:
                    continue

            try:
                author = item["author"]
            except KeyError:
                author = url

            # create a basic email message
            msg = MIMEMultipart("alternative")
            messageid = "<" \
                + datetime.datetime.now().strftime("%Y%m%d%H%M") \
                + "." \
                + "".join( \
                    [random.choice( \
                        string.ascii_letters + string.digits \
                        ) for a in range(0,6) \
                    ]) + "@" + socket.gethostname() + ">"
            msg.add_header("Message-ID", messageid)
            msg.set_unixfrom("\"%s\" <rss2maildir@localhost>" %(url))
            msg.add_header("From", "\"%s\" <rss2maildir@localhost>" %(author))
            msg.add_header("To", "\"%s\" <rss2maildir@localhost>" %(url))

            # formatdate() emits an RFC 2822 date with English day and
            # month names and a "-0000" zone regardless of locale, and
            # avoids the glibc-only %e and %T strftime directives.
            updated = item.get("updated_parsed")
            if updated:
                createddate = formatdate(calendar.timegm(updated))
            else:
                # The feed gave no usable timestamp; use the current time.
                createddate = formatdate()
            msg.add_header("Date", createddate)
            msg.add_header("Subject", item["title"])
            msg.set_default_type("text/plain")

            htmlpart = MIMEText(content.encode("utf-8"), "html", "utf-8")
            textparser = HTML2Text()
            textparser.feed(content.encode("utf-8"))
            textcontent = textparser.gettext()
            textpart = MIMEText(
                textcontent.encode("utf-8"), "plain", "utf-8")
            msg.attach(textpart)
            msg.attach(htmlpart)

            # start by working out the filename we should be writing to:
            # pid, host name, a random token and the current epoch time
            fname = str(os.getpid()) \
                + "." + socket.gethostname() \
                + "." + "".join( \
                    [random.choice( \
                        string.ascii_letters + string.digits \
                        ) for a in range(0,10) \
                    ]) + "." \
                + str(int(time.time()))
            fn = os.path.join(maildir, "tmp", fname)
            fh = open(fn, "w")
            try:
                fh.write(msg.as_string())
            finally:
                fh.close()
            # now move it in to the new directory
            newfn = os.path.join(maildir, "new", fname)
            os.link(fn, newfn)
            os.unlink(fn)

            # now add to the database about the item
            data = urllib.urlencode((
                ("message-id", messageid),
                ("created", createddate),
                ("contentmd5", md5sum),
                ))
            db[key] = data
    finally:
        db.close()
348
def _die(message, code):
    """Write message to stderr and exit with the given status code."""
    sys.stderr.write(message)
    sys.exit(code)


def _ensure_directory(path, not_a_dir_message, create_failed_message):
    """Make sure path is a directory, creating it when it is missing.

    Exits with status 1 (after writing the matching message) when path
    exists but is not a directory, or when it cannot be created.
    """
    try:
        mode = os.stat(path)[stat.ST_MODE]
    except OSError:
        # Nothing there (or not reachable): try to create it.
        try:
            os.mkdir(path)
        except OSError:
            _die(create_failed_message, 1)
    else:
        if not stat.S_ISDIR(mode):
            _die(not_a_dir_message, 1)


def _check_maildir_subdir(maildir, name):
    """Create maildir/name if missing; warn if it exists as a non-dir."""
    path = os.path.join(maildir, name)
    try:
        mode = os.stat(path)[stat.ST_MODE]
    except OSError:
        os.mkdir(path)
        return
    if not stat.S_ISDIR(mode):
        sys.stderr.write("Broken maildir: %s\n" %(maildir))


# first off, parse the command line arguments

oparser = OptionParser()
oparser.add_option(
    "-c", "--conf", dest="conf",
    help="location of config file"
    )
oparser.add_option(
    "-s", "--statedir", dest="statedir",
    help="location of directory to store state in"
    )

(options, args) = oparser.parse_args()

# check for the configfile

configfile = None

if options.conf is not None:
    # does the file exist?
    try:
        os.stat(options.conf)
    except OSError:
        # exit here as the specified file doesn't exist
        _die("Config file %s does not exist. Exiting.\n" %(options.conf,), 2)
    configfile = options.conf
else:
    # check through the default locations: per-user first, then system
    candidates = []
    if "HOME" in os.environ:
        candidates.append("%s/.rss2maildir.conf" %(os.environ["HOME"],))
    candidates.append("/etc/rss2maildir.conf")
    for candidate in candidates:
        try:
            os.stat(candidate)
        except OSError:
            continue
        configfile = candidate
        break
    if configfile is None:
        _die("No config file found. Exiting.\n", 2)

# Right - if we've got this far, we've got a config file, now for the hard
# bits...

scp = SafeConfigParser()
scp.read(configfile)

maildir_root = "RSSMaildir"
state_dir = "state"

# The state directory comes from the command line, else from the
# [general] section of the config file, else the built-in default.
if options.statedir is not None:
    state_dir = options.statedir
    _ensure_directory(
        state_dir,
        "State directory (%s) is not a directory\n" %(state_dir),
        "Couldn't create statedir %s\n" %(state_dir))
elif scp.has_option("general", "state_dir"):
    # Check (and use) the configured directory itself, not the default.
    state_dir = scp.get("general", "state_dir")
    _ensure_directory(
        state_dir,
        "State directory (%s) is not a directory\n" %(state_dir),
        "Couldn't create state directory %s\n" %(state_dir))
else:
    _ensure_directory(
        state_dir,
        "State directory %s is not a directory\n" %(state_dir),
        "State directory %s could not be created\n" %(state_dir))

if scp.has_option("general", "maildir_root"):
    maildir_root = scp.get("general", "maildir_root")

_ensure_directory(
    maildir_root,
    "Maildir Root %s is not a directory\n" %(maildir_root),
    "Couldn't create Maildir Root %s\n" %(maildir_root))

# Every section other than [general] describes one feed; the section
# name is the feed URL.
feeds = scp.sections()
if "general" in feeds:
    feeds.remove("general")

for section in feeds:
    # work out the maildir name: the "maildir" option if present,
    # otherwise the section name itself
    if scp.has_option(section, "maildir"):
        maildir = scp.get(section, "maildir")
    else:
        maildir = section

    # URL-quote the name so it is usable as a single path component
    maildir = urllib.urlencode(((section, maildir),)).split("=")[1]
    maildir = os.path.join(maildir_root, maildir)

    try:
        mode = os.stat(maildir)[stat.ST_MODE]
    except OSError:
        # no maildir yet: create it along with its three subdirectories
        try:
            os.mkdir(maildir)
        except OSError:
            _die("Couldn't create root maildir %s\n" %(maildir), 1)
        try:
            os.mkdir(os.path.join(maildir, "new"))
            os.mkdir(os.path.join(maildir, "cur"))
            os.mkdir(os.path.join(maildir, "tmp"))
        except OSError:
            _die(
                "Couldn't create required maildir directories for %s\n" \
                %(section,), 1)
    else:
        if stat.S_ISDIR(mode):
            # check that there's a cur, tmp and new directory
            for subdir in ("cur", "tmp", "new"):
                _check_maildir_subdir(maildir, subdir)
        else:
            # Delivery cannot work when the maildir path is not a
            # directory; report it and move on to the next feed.
            sys.stderr.write("Broken maildir: %s\n" %(maildir))
            continue

    # right - we've got the directories, we've got the section, we know the
    # url... lets play!

    parse_and_deliver(maildir, section, state_dir)