rss2maildir.py

   1 #!/usr/bin/python
   2 # coding=utf-8
   3
   4 # rss2maildir.py - RSS feeds to Maildir 1 email per item
   5 # Copyright (C) 2007  Brett Parker <iDunno@sommitrealweird.co.uk>
   6 #
   7 # This program is free software: you can redistribute it and/or modify
   8 # it under the terms of the GNU General Public License as published by
   9 # the Free Software Foundation, either version 3 of the License, or
  10 # (at your option) any later version.
  11 #
  12 # This program is distributed in the hope that it will be useful,
  13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15 # GNU General Public License for more details.
  16 #
  17 # You should have received a copy of the GNU General Public License
  18 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
  19
  20 import sys
  21 import os
  22 import stat
  23 import urllib
  24
  25 import feedparser
  26
  27 from email.MIMEMultipart import MIMEMultipart
  28 from email.MIMEText import MIMEText
  29
  30 import datetime
  31 import random
  32 import string
  33 import textwrap
  34
  35 import socket
  36
  37 from optparse import OptionParser
  38 from ConfigParser import SafeConfigParser
  39
  40 from base64 import b64encode
  41 import md5
  42
  43 import cgi
  44 import dbm
  45
  46 from HTMLParser import HTMLParser
  47
  48 entities = {
  49     "amp": "&",
  50     "lt": "<",
  51     "gt": ">",
  52     "pound": "£",
  53     "copy": "©",
  54     "apos": "'",
  55     "quote": "\"",
  56     "nbsp": " ",
  57     }
  58
  59 class HTML2Text(HTMLParser):
  60
  61     def __init__(self):
  62         self.inheadingone = False
  63         self.inheadingtwo = False
  64         self.inotherheading = False
  65         self.inparagraph = True
  66         self.inblockquote = False
  67         self.inlink = False
  68         self.text = u''
  69         self.currentparagraph = u''
  70         self.headingtext = u''
  71         self.blockquote = u''
  72         self.inpre = False
  73         HTMLParser.__init__(self)
  74
  75     def handle_starttag(self, tag, attrs):
  76         if tag.lower() == "h1":
  77             self.inheadingone = True
  78             self.inparagraph = False
  79         elif tag.lower() == "h2":
  80             self.inheadingtwo = True
  81             self.inparagraph = False
  82         elif tag.lower() in ["h3", "h4", "h5", "h6"]:
  83             self.inotherheading = True
  84             self.inparagraph = False
  85         elif tag.lower() == "a":
  86             self.inlink = True
  87         elif tag.lower() == "br":
  88             if self.inparagraph:
  89                 self.text = self.text + "\n".join(textwrap.wrap(self.currentparagraph, 70)).encode('utf-8') + "\n"
  90                 self.currentparagraph = ""
  91             elif self.inblockquote:
  92                 self.text = self.text + "\n> " + "\n> ".join([a.strip() for a in textwrap.wrap(self.blockquote, 68)]).encode("utf-8") + "\n"
  93                 self.blockquote = u''
  94             else:
  95                 self.text = self.text + "\n"
  96         elif tag.lower() == "blockquote":
  97             self.inblockquote = True
  98             self.text = self.text + "\n"
  99         elif tag.lower() == "p":
 100             if self.text != "":
 101                 self.text = self.text + "\n\n"
 102             if self.inparagraph:
 103                 self.text = self.text + "\n".join(textwrap.wrap(self.currentparagraph, 70)).encode("utf-8")
 104             self.currentparagraph = u''
 105             self.inparagraph = True
 106         elif tag.lower() == "pre":
 107             self.text = self.text + "\n"
 108             self.inpre = True
 109             self.inparagraph = False
 110             self.inblockquote = False
 111
 112     def handle_startendtag(self, tag, attrs):
 113         if tag.lower() == "br":
 114             if self.inparagraph:
 115                 self.text = self.text + "\n".join(textwrap.wrap(self.currentparagraph, 70)).encode("utf-8") + "\n"
 116                 self.currentparagraph = u''
 117             elif self.inblockquote:
 118                 self.text = self.text + "\n> " + "\n> ".join([a.strip() for a in textwrap.wrap(self.blockquote, 68)]).encode("utf-8") + "\n"
 119                 self.blockquote = ""
 120             else:
 121                 self.text = self.text + "\n"
 122
 123     def handle_endtag(self, tag):
 124         if tag.lower() == "h1":
 125             self.inheadingone = False
 126             self.text = self.text + "\n\n" + self.headingtext + "\n" + "=" * len(self.headingtext.strip())
 127             self.headingtext = u''
 128         elif tag.lower() == "h2":
 129             self.inheadingtwo = False
 130             self.text = self.text + "\n\n" + self.headingtext + "\n" + "-" * len(self.headingtext.strip())
 131             self.headingtext = u''
 132         elif tag.lower() in ["h3", "h4", "h5", "h6"]:
 133             self.inotherheading = False
 134             self.text = self.text + "\n\n" + self.headingtext + "\n" + "~" * len(self.headingtext.strip())
 135             self.headingtext = u''
 136         elif tag.lower() == "p":
 137             self.text = self.text + "\n".join(textwrap.wrap(self.currentparagraph, 70))
 138             self.inparagraph = False
 139         elif tag.lower() == "blockquote":
 140             self.text = self.text + "\n> " + "\n> ".join([a.strip() for a in textwrap.wrap(self.blockquote, 68)]).encode("utf-8") + "\n"
 141             self.inblockquote = False
 142             self.blockquote = u''
 143         elif tag.lower() == "pre":
 144             self.inpre = False
 145
 146     def handle_data(self, data):
 147         if self.inheadingone or self.inheadingtwo or self.inotherheading:
 148             self.headingtext = self.headingtext + unicode(data, "utf-8").strip() + u' '
 149         elif self.inblockquote:
 150             self.blockquote = self.blockquote + unicode(data, "utf-8").strip() + u' '
 151         elif self.inparagraph:
 152             self.currentparagraph = self.currentparagraph + unicode(data, "utf-8").strip() + u' '
 153         elif self.inpre:
 154             self.text = self.text + data.encode("utf-8")
 155         else:
 156             self.text = self.text + unicode(data, "utf-8").strip() + u' '
 157
 158     def handle_entityref(self, name):
 159         entity = name
 160         if entities.has_key(name.lower()):
 161             entity = entities[name.lower()]
 162         elif name[0] == "#":
 163             entity = unichr(int(name[1:]))
 164         else:
 165             entity = "&" + name + ";"
 166
 167         if self.inparagraph:
 168             self.currentparagraph = self.currentparagraph + entity
 169         elif self.inblockquote:
 170             self.blockquote = self.blockquote + entity
 171         else:
 172             self.text = self.text + entity
 173
 174     def gettext(self):
 175         data = self.text
 176         if self.inparagraph:
 177             data = data + "\n".join(textwrap.wrap(self.currentparagraph, 70))
 178         return data
 179
 180 def parse_and_deliver(maildir, url, statedir):
 181     fp = feedparser.parse(url)
 182     db = dbm.open(os.path.join(statedir, "seen"), "c")
 183     for item in fp["items"]:
 184         # have we seen it before?
 185         # need to work out what the content is first...
 186
 187         if item.has_key("content"):
 188             content = item["content"][0]["value"]
 189         else:
 190             content = item["summary"]
 191
 192         md5sum = md5.md5(content.encode("utf-8")).hexdigest()
 193
 194         if db.has_key(url + "|" + item["link"]):
 195             data = db[url + "|" + item["link"]]
 196             data = cgi.parse_qs(data)
 197             if data["contentmd5"][0] == md5sum:
 198                 continue
 199
 200         try:
 201             author = item["author"]
 202         except:
 203             author = url
 204
 205         # create a basic email message
 206         msg = MIMEMultipart("alternative")
 207         messageid = "<" + datetime.datetime.now().strftime("%Y%m%d%H%M") + "." + "".join([random.choice(string.ascii_letters + string.digits) for a in range(0,6)]) + "@" + socket.gethostname() + ">"
 208         msg.add_header("Message-ID", messageid)
 209         msg.set_unixfrom("\"%s\" <rss2maildir@localhost>" %(url))
 210         msg.add_header("From", "\"%s\" <rss2maildir@localhost>" %(author))
 211         msg.add_header("To", "\"%s\" <rss2maildir@localhost>" %(url))
 212         createddate = datetime.datetime(*item["updated_parsed"][0:6]).strftime("%a, %e %b %Y %T -0000")
 213         msg.add_header("Date", createddate)
 214         msg.add_header("Subject", item["title"])
 215         msg.set_default_type("text/plain")
 216
 217         htmlpart = MIMEText(content.encode("utf-8"), "html", "utf-8")
 218         textparser = HTML2Text()
 219         textparser.feed(content.encode("utf-8"))
 220         textcontent = textparser.gettext()
 221         textpart = MIMEText(textcontent.encode("utf-8"), "plain", "utf-8")
 222         msg.attach(textpart)
 223         msg.attach(htmlpart)
 224
 225         # start by working out the filename we should be writting to, we do
 226         # this following the normal maildir style rules
 227         fname = str(os.getpid()) + "." + socket.gethostname() + "." + "".join([random.choice(string.ascii_letters + string.digits) for a in range(0,10)]) + "." + datetime.datetime.now().strftime('%s')
 228         fn = os.path.join(maildir, "tmp", fname)
 229         fh = open(fn, "w")
 230         fh.write(msg.as_string())
 231         fh.close()
 232         # now move it in to the new directory
 233         newfn = os.path.join(maildir, "new", fname)
 234         os.link(fn, newfn)
 235         os.unlink(fn)
 236
 237         # now add to the database about the item
 238         data = urllib.urlencode((("message-id", messageid), ("created", createddate), ("contentmd5", md5sum)))
 239         db[url + "|" + item["link"]] = data
 240
 241     db.close()
 242
 243 # first off, parse the command line arguments
 244
 245 oparser = OptionParser()
 246 oparser.add_option(
 247     "-c", "--conf", dest="conf",
 248     help="location of config file"
 249     )
 250 oparser.add_option(
 251     "-s", "--statedir", dest="statedir",
 252     help="location of directory to store state in"
 253     )
 254
 255 (options, args) = oparser.parse_args()
 256
 257 # check for the configfile
 258
 259 configfile = None
 260
 261 if options.conf != None:
 262     # does the file exist?
 263     try:
 264         os.stat(options.conf)
 265         configfile = options.conf
 266     except:
 267         # should exit here as the specified file doesn't exist
 268         sys.stderr.write("Config file %s does not exist. Exiting.\n" %(options.conf,))
 269         sys.exit(2)
 270 else:
 271     # check through the default locations
 272     try:
 273         os.stat("%s/.rss2maildir.conf" %(os.environ["HOME"],))
 274         configfile = "%s/.rss2maildir.conf" %(os.environ["HOME"],)
 275     except:
 276         try:
 277             os.stat("/etc/rss2maildir.conf")
 278             configfile = "/etc/rss2maildir.conf"
 279         except:
 280             sys.stderr.write("No config file found. Exiting.\n")
 281             sys.exit(2)
 282
 283 # Right - if we've got this far, we've got a config file, now for the hard
 284 # bits...
 285
 286 scp = SafeConfigParser()
 287 scp.read(configfile)
 288
 289 maildir_root = "RSSMaildir"
 290 state_dir = "state"
 291
 292 if options.statedir != None:
 293     state_dir = options.statedir
 294     try:
 295         mode = os.stat(state_dir)[stat.ST_MODE]
 296         if not stat.S_ISDIR(mode):
 297             sys.stderr.write("State directory (%s) is not a directory\n" %(state_dir))
 298             sys.exit(1)
 299     except:
 300         # try to make the directory
 301         try:
 302             os.mkdir(state_dir)
 303         except:
 304             sys.stderr.write("Couldn't create statedir %s" %(state_dir))
 305             sys.exit(1)
 306 elif scp.has_option("general", "state_dir"):
 307     new_state_dir = scp.get("general", "state_dir")
 308     try:
 309         mode = os.stat(state_dir)[stat.ST_MODE]
 310         if not stat.S_ISDIR(mode):
 311             sys.stderr.write("State directory (%s) is not a directory\n" %(state_dir))
 312             sys.exit(1)
 313     except:
 314         # try to create it
 315         try:
 316             os.mkdir(new_state_dir)
 317             state_dir = new_state_dir
 318         except:
 319             sys.stderr.write("Couldn't create state directory %s\n" %(new_state_dir))
 320             sys.exit(1)
 321 else:
 322     try:
 323         mode = os.stat(state_dir)[stat.ST_MODE]
 324         if not stat.S_ISDIR(mode):
 325             sys.stderr.write("State directory %s is not a directory\n" %(state_dir))
 326             sys.exit(1)
 327     except:
 328         try:
 329             os.mkdir(state_dir)
 330         except:
 331             sys.stderr.write("State directory %s could not be created\n" %(state_dir))
 332             sys.exit(1)
 333
 334 if scp.has_option("general", "maildir_root"):
 335     maildir_root = scp.get("general", "maildir_root")
 336
 337 try:
 338     mode = os.stat(maildir_root)[stat.ST_MODE]
 339     if not stat.S_ISDIR(mode):
 340         sys.stderr.write("Maildir Root %s is not a directory\n" %(maildir_root))
 341         sys.exit(1)
 342 except:
 343     try:
 344         os.mkdir(maildir_root)
 345     except:
 346         sys.stderr.write("Couldn't create Maildir Root %s\n" %(maildir_root))
 347         sys.exit(1)
 348
 349 feeds = scp.sections()
 350 try:
 351     feeds.remove("general")
 352 except:
 353     pass
 354
 355 for section in feeds:
 356     # check if the directory exists
 357     maildir = None
 358     try:
 359         maildir = scp.get(section, "maildir")
 360     except:
 361         maildir = section
 362
 363     maildir = urllib.urlencode(((section, maildir),)).split("=")[1]
 364     maildir = os.path.join(maildir_root, maildir)
 365
 366     try:
 367         exists = os.stat(maildir)
 368         if stat.S_ISDIR(exists[stat.ST_MODE]):
 369             # check if there's a new, cur and tmp directory
 370             try:
 371                 mode = os.stat(os.path.join(maildir, "cur"))[stat.ST_MODE]
 372             except:
 373                 os.mkdir(os.path.join(maildir, "cur"))
 374                 if not stat.S_ISDIR(mode):
 375                     sys.stderr.write("Broken maildir: %s\n" %(maildir))
 376             try:
 377                 mode = os.stat(os.path.join(maildir, "tmp"))[stat.ST_MODE]
 378             except:
 379                 os.mkdir(os.path.join(maildir, "tmp"))
 380                 if not stat.S_ISDIR(mode):
 381                     sys.stderr.write("Broken maildir: %s\n" %(maildir))
 382             try:
 383                 mode = os.stat(os.path.join(maildir, "new"))[stat.ST_MODE]
 384                 if not stat.S_ISDIR(mode):
 385                     sys.stderr.write("Broken maildir: %s\n" %(maildir))
 386             except:
 387                 os.mkdir(os.path.join(maildir, "new"))
 388         else:
 389             sys.stderr.write("Broken maildir: %s\n" %(maildir))
 390     except:
 391         try:
 392             os.mkdir(maildir)
 393         except:
 394             sys.stderr.write("Couldn't create root maildir %s\n" %(maildir))
 395             sys.exit(1)
 396         try:
 397             os.mkdir(os.path.join(maildir, "new"))
 398             os.mkdir(os.path.join(maildir, "cur"))
 399             os.mkdir(os.path.join(maildir, "tmp"))
 400         except:
 401             sys.stderr.write("Couldn't create required maildir directories for %s\n" %(section,))
 402             sys.exit(1)
 403
 404     # right - we've got the directories, we've got the section, we know the
 405     # url... lets play!
 406
 407     parse_and_deliver(maildir, section, state_dir)