* improve handling of unicode data
[rss2maildir.git] / rss2maildir.py
1 #!/usr/bin/python
2 # coding=utf-8
3
4 # rss2maildir.py - RSS feeds to Maildir 1 email per item
5 # Copyright (C) 2007  Brett Parker <iDunno@sommitrealweird.co.uk>
6
7 # This program is free software: you can redistribute it and/or modify
8 # it under the terms of the GNU General Public License as published by
9 # the Free Software Foundation, either version 3 of the License, or
10 # (at your option) any later version.
11
12 # This program is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 # GNU General Public License for more details.
16
17 # You should have received a copy of the GNU General Public License
18 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
19
20 import sys
21 import os
22 import stat
23 import urllib
24
25 import feedparser
26
27 from email.MIMEMultipart import MIMEMultipart
28 from email.MIMEText import MIMEText
29
30 import datetime
31 import random
32 import string
33 import textwrap
34
35 import socket
36
37 from optparse import OptionParser
38 from ConfigParser import SafeConfigParser
39
40 from base64 import b64encode
41 import md5
42
43 import cgi
44 import dbm
45
46 from HTMLParser import HTMLParser
47
# unichr() only exists on Python 2; on Python 3 chr() already returns text.
try:
    _unichr = unichr
except NameError:
    _unichr = chr

# Named character references understood by HTML2Text.handle_entityref().
# Values are unicode literals so they can be appended to the unicode buffers
# without any implicit decoding.  "quote" is not a real HTML entity name but
# is kept for backward compatibility; "quot" is the standard spelling.
entities = {
    "amp": u"&",
    "lt": u"<",
    "gt": u">",
    "pound": u"\xa3",
    "copy": u"\xa9",
    "apos": u"'",
    "quot": u"\"",
    "quote": u"\"",
    "nbsp": u" ",
    }


class HTML2Text(HTMLParser):
    """Render an HTML fragment as wrapped plain text.

    Feed markup with feed() and collect the result with gettext().
    Headings are underlined ("=" for h1, "-" for h2, "~" for h3-h6),
    paragraphs are wrapped at 70 columns, blockquotes are prefixed with
    "> " and wrapped at 68 columns, and <ul> items are rendered as " * "
    bullets wrapped at 67 columns.  Text inside <pre> is copied verbatim.

    All internal buffers are unicode; byte strings handed to the handlers
    are decoded as UTF-8, text that is already unicode is used as is.
    """

    def __init__(self):
        self.inheadingone = False
        self.inheadingtwo = False
        self.inotherheading = False
        # Text seen before any block-level tag is treated as a paragraph.
        self.inparagraph = True
        self.inblockquote = False
        self.inlink = False
        self.text = u''
        self.currentparagraph = u''
        self.headingtext = u''
        self.blockquote = u''
        self.inpre = False
        self.inul = False
        self.initem = False
        self.item = u''
        HTMLParser.__init__(self)

    # -- helpers ----------------------------------------------------------

    @staticmethod
    def _to_text(data):
        """Return data as unicode, decoding byte strings as UTF-8."""
        if isinstance(data, type(u'')):
            return data
        return data.decode("utf-8")

    def _inheading(self):
        """Return True while inside any of h1-h6."""
        return self.inheadingone or self.inheadingtwo or self.inotherheading

    def _flush_paragraph(self):
        """Append the wrapped pending paragraph to the output and clear it."""
        self.text = self.text \
            + u'\n'.join(textwrap.wrap(self.currentparagraph, 70))
        self.currentparagraph = u''

    def _flush_blockquote(self):
        """Append the pending blockquote text, "> "-quoted, and clear it."""
        lines = [a.strip() for a in textwrap.wrap(self.blockquote, 68)]
        self.text = self.text + u'\n> ' + u'\n> '.join(lines) + u'\n'
        self.blockquote = u''

    def _flush_item(self):
        """Append the pending list item as a " * " bullet and clear it."""
        lines = [a.strip() for a in textwrap.wrap(self.item, 67)]
        self.text = self.text + u' * ' + u'\n   '.join(lines) + u'\n'
        self.item = u''

    def _flush_heading(self, underline):
        """Append the pending heading underlined with the given character."""
        heading = self.headingtext.strip()
        # The underline is measured in characters, not encoded bytes, so it
        # lines up for non-ASCII headings as well.
        self.text = self.text \
            + u'\n\n' \
            + heading \
            + u'\n' \
            + underline * len(heading)
        self.headingtext = u''

    def _linebreak(self):
        """Handle <br> and <br/> identically."""
        if self.inparagraph:
            self._flush_paragraph()
            self.text = self.text + u'\n'
        elif self.inblockquote:
            self._flush_blockquote()
        else:
            self.text = self.text + u'\n'

    def _append_entity(self, char):
        """Append a resolved entity to the buffer handle_data() would use."""
        if self._inheading():
            self.headingtext = self.headingtext + char
        elif self.inblockquote:
            self.blockquote = self.blockquote + char
        elif self.inparagraph:
            self.currentparagraph = self.currentparagraph + char
        elif self.inul and self.initem:
            self.item = self.item + char
        else:
            self.text = self.text + char

    # -- HTMLParser callbacks ---------------------------------------------

    def handle_starttag(self, tag, attrs):
        tag = tag.lower()
        if tag == "h1":
            self.inheadingone = True
            self.inparagraph = False
        elif tag == "h2":
            self.inheadingtwo = True
            self.inparagraph = False
        elif tag in ("h3", "h4", "h5", "h6"):
            self.inotherheading = True
            self.inparagraph = False
        elif tag == "a":
            self.inlink = True
        elif tag == "br":
            self._linebreak()
        elif tag == "blockquote":
            self.inblockquote = True
            self.text = self.text + u'\n'
        elif tag == "p":
            # Flush whatever paragraph is still open *before* adding the
            # separator, otherwise an unclosed <p> runs straight into the
            # next paragraph.
            if self.inparagraph:
                self._flush_paragraph()
            if self.text != u'':
                self.text = self.text + u'\n\n'
            self.currentparagraph = u''
            self.inparagraph = True
        elif tag == "pre":
            self.text = self.text + u'\n'
            self.inpre = True
            self.inparagraph = False
            self.inblockquote = False
        elif tag == "ul":
            self.item = u''
            self.inul = True
            self.text = self.text + u'\n'
        elif tag == "li" and self.inul:
            if not self.initem:
                self.initem = True
                self.item = u''
            else:
                # A new <li> while the previous one was never closed:
                # emit the previous item first.
                self._flush_item()

    def handle_startendtag(self, tag, attrs):
        # Only <br/> is meaningful among self-closing tags; others are
        # ignored.
        if tag.lower() == "br":
            self._linebreak()

    def handle_endtag(self, tag):
        tag = tag.lower()
        if tag == "h1":
            self.inheadingone = False
            self._flush_heading(u'=')
        elif tag == "h2":
            self.inheadingtwo = False
            self._flush_heading(u'-')
        elif tag in ("h3", "h4", "h5", "h6"):
            self.inotherheading = False
            self._flush_heading(u'~')
        elif tag == "a":
            self.inlink = False
        elif tag == "p":
            self._flush_paragraph()
            self.inparagraph = False
        elif tag == "blockquote":
            self._flush_blockquote()
            self.inblockquote = False
        elif tag == "pre":
            self.inpre = False
        elif tag == "li":
            self.initem = False
            if self.item != u'':
                self._flush_item()
            self.item = u''
        elif tag == "ul":
            self.inul = False

    def handle_data(self, data):
        data = self._to_text(data)
        if self._inheading():
            self.headingtext = self.headingtext + data.strip() + u' '
        elif self.inblockquote:
            self.blockquote = self.blockquote + data.strip() + u' '
        elif self.inparagraph:
            self.currentparagraph = self.currentparagraph \
                + data.strip() \
                + u' '
        elif self.inul and self.initem:
            self.item = self.item + data
        elif self.inpre:
            # Preformatted text keeps its whitespace untouched.
            self.text = self.text + data
        else:
            self.text = self.text + data.strip() + u' '

    def handle_entityref(self, name):
        """Resolve a named reference; unknown names are kept literally."""
        key = name.lower()
        if key in entities:
            char = entities[key]
        else:
            char = u'&' + self._to_text(name) + u';'
        self._append_entity(char)

    def handle_charref(self, name):
        """Resolve a numeric reference (decimal "233" or hex "xE9")."""
        try:
            if name[:1] in ('x', 'X'):
                code = int(name[1:], 16)
            else:
                code = int(name)
            char = _unichr(code)
        except (ValueError, OverflowError):
            # Not a number, or outside the range this interpreter can
            # represent: keep the reference as written.
            char = u'&#' + self._to_text(name) + u';'
        self._append_entity(char)

    def gettext(self):
        """Return the text produced so far, including an open paragraph."""
        data = self.text
        if self.inparagraph:
            data = data + u'\n'.join(textwrap.wrap(self.currentparagraph, 70))
        return data
264
def _random_token(length):
    """Return a random string of ASCII letters and digits."""
    alphabet = string.ascii_letters + string.digits
    return "".join([random.choice(alphabet) for _ in range(length)])


def _header_value(value):
    """Return value in a form that is safe to place in a mail header.

    Pure ASCII values are returned unchanged; anything else is turned into
    an RFC 2047 encoded-word using UTF-8.
    """
    from email.Header import Header
    try:
        value.encode("ascii")
    except UnicodeError:
        return Header(value, "utf-8").encode()
    return value


def _address(name):
    """Build the "name <rss2maildir@localhost>" address for a header."""
    encoded = _header_value(name)
    if encoded is name:
        return "\"%s\" <rss2maildir@localhost>" % (name,)
    # RFC 2047 does not allow encoded-words inside a quoted string, so the
    # quotes are dropped when the display name had to be encoded.
    return "%s <rss2maildir@localhost>" % (encoded,)


def parse_and_deliver(maildir, url, statedir):
    """Fetch the feed at url and deliver new or changed items to maildir.

    Each item becomes one multipart/alternative message (text/plain
    rendering produced by HTML2Text plus the original HTML) written to
    maildir/tmp and then linked into maildir/new.  The dbm file "seen" in
    statedir records, per feed URL and item link, the MD5 of the content
    that was last delivered; items whose stored MD5 matches are skipped.

    maildir  -- path of a maildir that already has tmp/ and new/
    url      -- feed URL, also used as the To header and database key prefix
    statedir -- directory holding the "seen" database
    """
    import calendar
    import time
    from email.Utils import formatdate

    fp = feedparser.parse(url)
    db = dbm.open(os.path.join(statedir, "seen"), "c")
    try:
        for item in fp["items"]:
            # have we seen it before?
            # need to work out what the content is first...
            if "content" in item:
                content = item["content"][0]["value"]
            else:
                content = item.get("summary", u"")

            md5sum = md5.md5(content.encode("utf-8")).hexdigest()

            # Items without a link fall back to their id or title so that
            # they still get a distinct database key instead of crashing.
            link = item.get("link") or item.get("id") \
                or item.get("title") or u""
            key = url + "|" + link
            if isinstance(key, type(u"")):
                # dbm keys are byte strings
                key = key.encode("utf-8")

            if db.has_key(key):
                seen = cgi.parse_qs(db[key])
                if seen.get("contentmd5", [None])[0] == md5sum:
                    continue

            author = item.get("author", url)

            # create a basic email message
            msg = MIMEMultipart("alternative")
            messageid = "<" \
                + datetime.datetime.now().strftime("%Y%m%d%H%M") \
                + "." + _random_token(6) \
                + "@" + socket.gethostname() + ">"
            msg.add_header("Message-ID", messageid)
            msg.set_unixfrom("\"%s\" <rss2maildir@localhost>" % (url,))
            msg.add_header("From", _address(author))
            msg.add_header("To", _address(url))

            # formatdate() is locale independent and portable, unlike the
            # %e / %T strftime extensions; with its defaults it renders the
            # time in UTC with a "-0000" zone.
            updated = item.get("updated_parsed")
            if updated is not None:
                timestamp = calendar.timegm(updated)
            else:
                timestamp = time.time()
            createddate = formatdate(timestamp)
            msg.add_header("Date", createddate)
            msg.add_header("Subject", _header_value(item.get("title", u"")))
            msg.set_default_type("text/plain")

            htmlpart = MIMEText(content.encode("utf-8"), "html", "utf-8")
            textparser = HTML2Text()
            textparser.feed(content.encode("utf-8"))
            textcontent = textparser.gettext()
            textpart = MIMEText(textcontent.encode("utf-8"), "plain", "utf-8")
            msg.attach(textpart)
            msg.attach(htmlpart)

            # start by working out the filename we should be writing to, we
            # do this following the normal maildir style rules
            fname = str(os.getpid()) \
                + "." + socket.gethostname() \
                + "." + _random_token(10) \
                + "." + str(int(time.time()))
            fn = os.path.join(maildir, "tmp", fname)
            fh = open(fn, "w")
            try:
                fh.write(msg.as_string())
            finally:
                fh.close()

            # now move it in to the new directory; the tmp entry is removed
            # even when linking fails so that no stale file is left behind
            newfn = os.path.join(maildir, "new", fname)
            try:
                os.link(fn, newfn)
            finally:
                os.unlink(fn)

            # now add to the database about the item
            data = urllib.urlencode((
                ("message-id", messageid),
                ("created", createddate),
                ("contentmd5", md5sum),
                ))
            db[key] = data
    finally:
        # release the database even when a delivery fails part way
        db.close()
346
def _die(message, code=1):
    """Write message to stderr and terminate with the given exit code."""
    sys.stderr.write(message)
    sys.exit(code)


def _ensure_directory(path, not_a_directory, cannot_create):
    """Make sure path is a directory, creating it when it is missing.

    not_a_directory and cannot_create are format strings taking the path;
    the matching one is printed before exiting with status 1.
    """
    if os.path.isdir(path):
        return
    if os.path.exists(path):
        _die(not_a_directory % (path,))
    try:
        os.mkdir(path)
    except OSError:
        _die(cannot_create % (path,))


def _prepare_maildir(maildir, section):
    """Create maildir and its new/cur/tmp subdirectories where missing.

    Returns True when the maildir is usable and False when something that
    is not a directory is in the way.  Failure to create a directory is
    fatal.
    """
    if not os.path.exists(maildir):
        try:
            os.mkdir(maildir)
        except OSError:
            _die("Couldn't create root maildir %s\n" % (maildir,))
    elif not os.path.isdir(maildir):
        sys.stderr.write("Broken maildir: %s\n" % (maildir,))
        return False

    usable = True
    for subdir in ("new", "cur", "tmp"):
        path = os.path.join(maildir, subdir)
        if os.path.isdir(path):
            continue
        if os.path.exists(path):
            sys.stderr.write("Broken maildir: %s\n" % (maildir,))
            usable = False
            continue
        try:
            os.mkdir(path)
        except OSError:
            _die("Couldn't create required maildir directories for %s\n"
                 % (section,))
    return usable


# first off, parse the command line arguments

oparser = OptionParser()
oparser.add_option(
    "-c", "--conf", dest="conf",
    help="location of config file"
    )
oparser.add_option(
    "-s", "--statedir", dest="statedir",
    help="location of directory to store state in"
    )

(options, args) = oparser.parse_args()

# check for the configfile

configfile = None

if options.conf is not None:
    # an explicitly requested config file has to exist
    if not os.path.exists(options.conf):
        _die("Config file %s does not exist. Exiting.\n" % (options.conf,), 2)
    configfile = options.conf
else:
    # check through the default locations
    candidates = []
    if "HOME" in os.environ:
        candidates.append("%s/.rss2maildir.conf" % (os.environ["HOME"],))
    candidates.append("/etc/rss2maildir.conf")
    for candidate in candidates:
        if os.path.exists(candidate):
            configfile = candidate
            break
    if configfile is None:
        _die("No config file found. Exiting.\n", 2)

# Right - if we've got this far, we've got a config file, now for the hard
# bits...

scp = SafeConfigParser()
scp.read(configfile)

maildir_root = "RSSMaildir"
state_dir = "state"

# The state directory comes from the command line, then the config file,
# then the built-in default; whichever is chosen is created if missing.
if options.statedir is not None:
    state_dir = options.statedir
    _ensure_directory(
        state_dir,
        "State directory (%s) is not a directory\n",
        "Couldn't create statedir %s\n")
elif scp.has_option("general", "state_dir"):
    state_dir = scp.get("general", "state_dir")
    _ensure_directory(
        state_dir,
        "State directory (%s) is not a directory\n",
        "Couldn't create state directory %s\n")
else:
    _ensure_directory(
        state_dir,
        "State directory %s is not a directory\n",
        "State directory %s could not be created\n")

if scp.has_option("general", "maildir_root"):
    maildir_root = scp.get("general", "maildir_root")

_ensure_directory(
    maildir_root,
    "Maildir Root %s is not a directory\n",
    "Couldn't create Maildir Root %s\n")

# every section except "general" names a feed URL
feeds = [section for section in scp.sections() if section != "general"]

for section in feeds:
    # The maildir name defaults to the section (feed URL) itself; any
    # problem reading the option falls back to that default as well.
    try:
        maildir = scp.get(section, "maildir")
    except Exception:
        maildir = section

    # quote the name so it is safe to use as a single path component
    maildir = os.path.join(maildir_root, urllib.quote_plus(maildir))

    if not _prepare_maildir(maildir, section):
        # delivering into a broken maildir would only fail later on, so
        # skip this feed and carry on with the others
        continue

    # right - we've got the directories, we've got the section, we know the
    # url... lets play!

    parse_and_deliver(maildir, section, state_dir)