* tidy code to be mostly < 80 chars per line
[rss2maildir.git] / rss2maildir.py
1 #!/usr/bin/python
2 # coding=utf-8
3
4 # rss2maildir.py - RSS feeds to Maildir 1 email per item
5 # Copyright (C) 2007  Brett Parker <iDunno@sommitrealweird.co.uk>
6
7 # This program is free software: you can redistribute it and/or modify
8 # it under the terms of the GNU General Public License as published by
9 # the Free Software Foundation, either version 3 of the License, or
10 # (at your option) any later version.
11
12 # This program is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 # GNU General Public License for more details.
16
17 # You should have received a copy of the GNU General Public License
18 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
19
20 import sys
21 import os
22 import stat
23 import urllib
24
25 import feedparser
26
27 from email.MIMEMultipart import MIMEMultipart
28 from email.MIMEText import MIMEText
29
30 import datetime
31 import random
32 import string
33 import textwrap
34
35 import socket
36
37 from optparse import OptionParser
38 from ConfigParser import SafeConfigParser
39
40 from base64 import b64encode
41 import md5
42
43 import cgi
44 import dbm
45
46 from HTMLParser import HTMLParser
47
# Text type of the running interpreter (``unicode`` on Python 2).
_text_type = type(u'')

try:
    _unichr = unichr
except NameError:
    # Interpreters without unichr() already return text from chr().
    _unichr = chr

# Named character references understood by HTML2Text.handle_entityref.
# The values are unicode so they can be appended to the parser's unicode
# buffers without an implicit (and failing) ASCII decode.
entities = {
    "amp": u"&",
    "lt": u"<",
    "gt": u">",
    "pound": u"\u00a3",
    "copy": u"\u00a9",
    "apos": u"'",
    "quot": u"\"",
    # "quote" is not a real entity name; kept so old lookups still work.
    "quote": u"\"",
    "nbsp": u" ",
    }

class HTML2Text(HTMLParser):
    """Render an HTML fragment as wrapped plain text.

    Feed markup with feed() and collect the result with gettext().
    Paragraphs are wrapped at 70 columns, block quotes are prefixed with
    "> ", h1/h2/h3-h6 headings are underlined with "=", "-" and "~", and
    items of unordered lists become " * " bullets.  All buffers hold
    unicode text; byte strings passed to the handlers are decoded as
    UTF-8.
    """

    def __init__(self):
        self.inheadingone = False
        self.inheadingtwo = False
        self.inotherheading = False
        # Text outside any <p> is still collected as a paragraph.
        self.inparagraph = True
        self.inblockquote = False
        self.inlink = False
        self.text = u''
        self.currentparagraph = u''
        self.headingtext = u''
        self.blockquote = u''
        self.inpre = False
        self.inul = False
        self.initem = False
        self.item = u''
        HTMLParser.__init__(self)

    def _decode(self, data):
        """Return *data* as unicode, decoding byte strings as UTF-8."""
        if isinstance(data, _text_type):
            return data
        return data.decode("utf-8")

    def _wrapped(self, text):
        """Return *text* wrapped at 70 columns."""
        return u'\n'.join(textwrap.wrap(text, 70))

    def _quoted(self, text):
        """Return *text* wrapped at 68 columns as a "> " quoted block."""
        lines = [line.strip() for line in textwrap.wrap(text, 68)]
        return u'\n> ' + u'\n> '.join(lines) + u'\n'

    def _bullet(self, text):
        """Return *text* wrapped at 67 columns as a " * " list item."""
        lines = [line.strip() for line in textwrap.wrap(text, 67)]
        return u' * ' + u'\n   '.join(lines) + u'\n'

    def _inheading(self):
        """Tell whether the parser is currently inside h1..h6."""
        return self.inheadingone or self.inheadingtwo \
            or self.inotherheading

    def _linebreak(self):
        """Flush the pending paragraph or quote because of a <br>."""
        if self.inparagraph:
            self.text = self.text \
                + self._wrapped(self.currentparagraph) \
                + u'\n'
            self.currentparagraph = u''
        elif self.inblockquote:
            self.text = self.text + self._quoted(self.blockquote)
            self.blockquote = u''
        else:
            self.text = self.text + u'\n'

    def _flush_heading(self, underline):
        """Emit the collected heading underlined with *underline*."""
        heading = self.headingtext
        # Underline by character count; the buffer is unicode, so no
        # encoding step is needed (or wanted) here.
        self.text = self.text \
            + u'\n\n' \
            + heading \
            + u'\n' \
            + underline * len(heading.strip())
        self.headingtext = u''

    def _append_chars(self, chars):
        """Append already-resolved characters to the active buffer.

        Uses the same buffer priority as handle_data() so a character
        reference lands next to the text that surrounds it.
        """
        if self._inheading():
            self.headingtext = self.headingtext + chars
        elif self.inblockquote:
            self.blockquote = self.blockquote + chars
        elif self.inparagraph:
            self.currentparagraph = self.currentparagraph + chars
        elif self.inul and self.initem:
            self.item = self.item + chars
        else:
            self.text = self.text + chars

    def handle_starttag(self, tag, attrs):
        """Update parser state (and flush buffers) for an opening tag."""
        tag = tag.lower()
        if tag == "h1":
            self.inheadingone = True
            self.inparagraph = False
        elif tag == "h2":
            self.inheadingtwo = True
            self.inparagraph = False
        elif tag in ("h3", "h4", "h5", "h6"):
            self.inotherheading = True
            self.inparagraph = False
        elif tag == "a":
            self.inlink = True
        elif tag == "br":
            self._linebreak()
        elif tag == "blockquote":
            self.inblockquote = True
            self.text = self.text + u'\n'
        elif tag == "p":
            if self.text:
                self.text = self.text + u'\n\n'
            if self.inparagraph:
                # an unclosed paragraph is ended by the next <p>
                self.text = self.text \
                    + self._wrapped(self.currentparagraph)
            self.currentparagraph = u''
            self.inparagraph = True
        elif tag == "pre":
            self.text = self.text + u'\n'
            self.inpre = True
            self.inparagraph = False
            self.inblockquote = False
        elif tag == "ul":
            self.item = u''
            self.inul = True
            self.text = self.text + u'\n'
        elif tag == "li" and self.inul:
            if not self.initem:
                self.initem = True
                self.item = u''
            else:
                # previous <li> was never closed: emit it now
                self.text = self.text + self._bullet(self.item)
                self.item = u''

    def handle_startendtag(self, tag, attrs):
        """Handle self-closing tags; only <br/> has any effect."""
        if tag.lower() == "br":
            self.handle_starttag(tag, attrs)

    def handle_endtag(self, tag):
        """Flush the buffer belonging to a closing tag into the text."""
        tag = tag.lower()
        if tag == "h1":
            self.inheadingone = False
            self._flush_heading(u'=')
        elif tag == "h2":
            self.inheadingtwo = False
            self._flush_heading(u'-')
        elif tag in ("h3", "h4", "h5", "h6"):
            self.inotherheading = False
            self._flush_heading(u'~')
        elif tag == "a":
            self.inlink = False
        elif tag == "p":
            self.text = self.text + self._wrapped(self.currentparagraph)
            self.inparagraph = False
            self.currentparagraph = u''
        elif tag == "blockquote":
            self.text = self.text + self._quoted(self.blockquote)
            self.inblockquote = False
            self.blockquote = u''
        elif tag == "pre":
            self.inpre = False
        elif tag == "li":
            self.initem = False
            if self.item != "":
                self.text = self.text + self._bullet(self.item)
            self.item = u''
        elif tag == "ul":
            self.inul = False

    def handle_data(self, data):
        """Collect character data in the buffer for the current context."""
        data = self._decode(data)
        if self._inheading():
            self.headingtext = self.headingtext + data.strip() + u' '
        elif self.inblockquote:
            self.blockquote = self.blockquote + data.strip() + u' '
        elif self.inparagraph:
            self.currentparagraph = self.currentparagraph \
                + data.strip() \
                + u' '
        elif self.inul and self.initem:
            self.item = self.item + data
        elif self.inpre:
            # preformatted text is passed through untouched
            self.text = self.text + data
        else:
            self.text = self.text + data.strip() + u' '

    def handle_entityref(self, name):
        """Resolve a named reference such as "&amp;".

        Names missing from the entities table are emitted literally as
        "&name;".  A name starting with "#" is treated as a numeric
        reference for the benefit of callers that pass one in directly.
        """
        if name.startswith("#"):
            self.handle_charref(name[1:])
            return
        key = name.lower()
        if key in entities:
            self._append_chars(entities[key])
        else:
            self._append_chars(u'&' + self._decode(name) + u';')

    def handle_charref(self, name):
        """Resolve a numeric reference ("&#169;" or "&#xa9;").

        References that do not name a valid code point are emitted
        literally instead of being dropped.
        """
        try:
            if name[:1] in ("x", "X"):
                char = _unichr(int(name[1:], 16))
            else:
                char = _unichr(int(name))
        except (ValueError, OverflowError):
            char = u'&#' + self._decode(name) + u';'
        self._append_chars(char)

    def gettext(self):
        """Return the converted text, including any unfinished paragraph."""
        data = self.text
        if self.inparagraph:
            data = data + self._wrapped(self.currentparagraph)
        return data
262
def parse_and_deliver(maildir, url, statedir):
    """Fetch the feed at *url* and deliver unseen items into *maildir*.

    Each new or changed item becomes one multipart/alternative message
    (a plain text rendering plus the original HTML) written to
    maildir/tmp and then linked into maildir/new.

    The dbm file "seen" in *statedir* maps "url|item link" to the message
    id, date and content md5 of the last delivery; an item is delivered
    again only when its content md5 differs from the stored one.

    maildir  -- existing maildir with "tmp" and "new" subdirectories
    url      -- feed URL; also used as sender name when an item has no
                author
    statedir -- directory holding the "seen" database
    """
    # local import: only needed for the maildir file name timestamp
    import time

    fp = feedparser.parse(url)
    db = dbm.open(os.path.join(statedir, "seen"), "c")
    try:
        for item in fp["items"]:
            # have we seen it before?
            # need to work out what the content is first...

            if item.has_key("content"):
                content = item["content"][0]["value"]
            else:
                content = item["summary"]

            md5sum = md5.md5(content.encode("utf-8")).hexdigest()

            # dbm keys have to be byte strings; a unicode key would be
            # encoded implicitly as ASCII and fail on non-ASCII links.
            dbkey = url + "|" + item["link"]
            if isinstance(dbkey, type(u"")):
                dbkey = dbkey.encode("utf-8")

            if db.has_key(dbkey):
                data = cgi.parse_qs(db[dbkey])
                if data["contentmd5"][0] == md5sum:
                    continue

            try:
                author = item["author"]
            except KeyError:
                author = url

            # create a basic email message
            msg = MIMEMultipart("alternative")
            messageid = "<" \
                + datetime.datetime.now().strftime("%Y%m%d%H%M") \
                + "." \
                + "".join( \
                    [random.choice( \
                        string.ascii_letters + string.digits \
                        ) for a in range(0,6) \
                    ]) + "@" + socket.gethostname() + ">"
            msg.add_header("Message-ID", messageid)
            msg.set_unixfrom("\"%s\" <rss2maildir@localhost>" %(url))
            msg.add_header( \
                "From", "\"%s\" <rss2maildir@localhost>" %(author))
            msg.add_header("To", "\"%s\" <rss2maildir@localhost>" %(url))

            # Not every item carries a parsable date; fall back to the
            # delivery time (UTC, to match the -0000 zone below).
            updated = item.get("updated_parsed")
            if updated:
                created = datetime.datetime(*updated[0:6])
            else:
                created = datetime.datetime.utcnow()
            # only portable strftime directives (no %e / %T)
            createddate = created.strftime("%a, %d %b %Y %H:%M:%S -0000")
            msg.add_header("Date", createddate)
            msg.add_header("Subject", item["title"])
            msg.set_default_type("text/plain")

            htmlpart = MIMEText(content.encode("utf-8"), "html", "utf-8")
            textparser = HTML2Text()
            textparser.feed(content.encode("utf-8"))
            textcontent = textparser.gettext()
            textpart = MIMEText( \
                textcontent.encode("utf-8"), "plain", "utf-8")
            msg.attach(textpart)
            msg.attach(htmlpart)

            # start by working out the filename we should be writing to,
            # we do this following the normal maildir style rules
            fname = str(os.getpid()) \
                + "." + socket.gethostname() \
                + "." + "".join( \
                    [random.choice( \
                        string.ascii_letters + string.digits \
                        ) for a in range(0,10) \
                    ]) + "." \
                + str(int(time.time()))
            fn = os.path.join(maildir, "tmp", fname)
            fh = open(fn, "w")
            try:
                fh.write(msg.as_string())
            finally:
                fh.close()
            # now move it in to the new directory; link + unlink (rather
            # than rename) fails instead of overwriting an existing file
            newfn = os.path.join(maildir, "new", fname)
            os.link(fn, newfn)
            os.unlink(fn)

            # now add to the database about the item
            db[dbkey] = urllib.urlencode((
                ("message-id", messageid), \
                ("created", createddate), \
                ("contentmd5", md5sum) \
                ))
    finally:
        # always release the state database, even if delivery failed
        db.close()
344
def _ensure_directory(path, not_dir_message, create_failed_message):
    """Make sure *path* is a directory, creating it when it is missing.

    Both messages are format strings taking the path.  When *path* exists
    but is not a directory, or cannot be created, the matching message is
    written to stderr and the program exits with status 1.
    """
    try:
        mode = os.stat(path)[stat.ST_MODE]
    except OSError:
        try:
            os.mkdir(path)
        except OSError:
            sys.stderr.write(create_failed_message %(path,))
            sys.exit(1)
        return
    # Checked outside the try block so the exit is not swallowed and
    # followed by a pointless mkdir attempt.
    if not stat.S_ISDIR(mode):
        sys.stderr.write(not_dir_message %(path,))
        sys.exit(1)

def _prepare_maildir(maildir, section):
    """Create *maildir* and its cur/tmp/new subdirectories as needed.

    Returns True when the maildir can be delivered to.  Returns False,
    after writing a "Broken maildir" warning to stderr, when the maildir
    or one of its subdirectories exists but is not a directory.  Exits
    with status 1 when a brand new maildir cannot be created.
    """
    try:
        mode = os.stat(maildir)[stat.ST_MODE]
    except OSError:
        # nothing there yet: build the whole structure
        try:
            os.mkdir(maildir)
        except OSError:
            sys.stderr.write( \
                "Couldn't create root maildir %s\n" %(maildir))
            sys.exit(1)
        try:
            os.mkdir(os.path.join(maildir, "new"))
            os.mkdir(os.path.join(maildir, "cur"))
            os.mkdir(os.path.join(maildir, "tmp"))
        except OSError:
            sys.stderr.write( \
                "Couldn't create required maildir directories for %s\n" \
                %(section,))
            sys.exit(1)
        return True

    if not stat.S_ISDIR(mode):
        sys.stderr.write("Broken maildir: %s\n" %(maildir))
        return False

    # check if there's a new, cur and tmp directory
    usable = True
    for name in ("cur", "tmp", "new"):
        subdir = os.path.join(maildir, name)
        try:
            mode = os.stat(subdir)[stat.ST_MODE]
        except OSError:
            os.mkdir(subdir)
            continue
        if not stat.S_ISDIR(mode):
            sys.stderr.write("Broken maildir: %s\n" %(maildir))
            usable = False
    return usable

# first off, parse the command line arguments

oparser = OptionParser()
oparser.add_option(
    "-c", "--conf", dest="conf",
    help="location of config file"
    )
oparser.add_option(
    "-s", "--statedir", dest="statedir",
    help="location of directory to store state in"
    )

(options, args) = oparser.parse_args()

# check for the configfile

configfile = None

if options.conf is not None:
    # does the file exist?
    if not os.path.exists(options.conf):
        # exit here as the specified file doesn't exist
        sys.stderr.write( \
            "Config file %s does not exist. Exiting.\n" %(options.conf,))
        sys.exit(2)
    configfile = options.conf
else:
    # check through the default locations, per-user file first
    candidates = []
    if "HOME" in os.environ:
        candidates.append( \
            "%s/.rss2maildir.conf" %(os.environ["HOME"],))
    candidates.append("/etc/rss2maildir.conf")
    for candidate in candidates:
        if os.path.exists(candidate):
            configfile = candidate
            break
    if configfile is None:
        sys.stderr.write("No config file found. Exiting.\n")
        sys.exit(2)

# Right - if we've got this far, we've got a config file, now for the hard
# bits...

scp = SafeConfigParser()
scp.read(configfile)

maildir_root = "RSSMaildir"
state_dir = "state"

# The state directory comes from the command line, then from the config
# file, then from the default above.
if options.statedir is not None:
    state_dir = options.statedir
    _ensure_directory(
        state_dir,
        "State directory (%s) is not a directory\n",
        "Couldn't create statedir %s\n"
        )
elif scp.has_option("general", "state_dir"):
    # use (and check) the configured directory itself, not the default
    state_dir = scp.get("general", "state_dir")
    _ensure_directory(
        state_dir,
        "State directory (%s) is not a directory\n",
        "Couldn't create state directory %s\n"
        )
else:
    _ensure_directory(
        state_dir,
        "State directory %s is not a directory\n",
        "State directory %s could not be created\n"
        )

if scp.has_option("general", "maildir_root"):
    maildir_root = scp.get("general", "maildir_root")

_ensure_directory(
    maildir_root,
    "Maildir Root %s is not a directory\n",
    "Couldn't create Maildir Root %s\n"
    )

# every section apart from "general" describes one feed, named by its URL
feeds = [name for name in scp.sections() if name != "general"]

for section in feeds:
    # work out which maildir this feed is delivered to
    try:
        maildir = scp.get(section, "maildir")
    except Exception:
        # no usable maildir option: name the maildir after the feed
        maildir = section

    # quote the name so it is safe to use as a single path component
    maildir = urllib.quote_plus(maildir)
    maildir = os.path.join(maildir_root, maildir)

    if not _prepare_maildir(maildir, section):
        # already reported on stderr; carry on with the other feeds
        continue

    # right - we've got the directories, we've got the section, we know the
    # url... let's play!

    parse_and_deliver(maildir, section, state_dir)