rss2maildir.py

   1 #!/usr/bin/python
   2 # coding=utf-8
   3
   4 # rss2maildir.py - RSS feeds to Maildir 1 email per item
   5 # Copyright (C) 2007  Brett Parker <iDunno@sommitrealweird.co.uk>
   6 #
   7 # This program is free software: you can redistribute it and/or modify
   8 # it under the terms of the GNU General Public License as published by
   9 # the Free Software Foundation, either version 3 of the License, or
  10 # (at your option) any later version.
  11 #
  12 # This program is distributed in the hope that it will be useful,
  13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15 # GNU General Public License for more details.
  16 #
  17 # You should have received a copy of the GNU General Public License
  18 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
  19
  20 import mailbox
  21 import sys
  22 import os
  23 import stat
  24 import urllib
  25
  26 import feedparser
  27
  28 from email.MIMEMultipart import MIMEMultipart
  29 from email.MIMEText import MIMEText
  30
  31 import datetime
  32 import random
  33 import string
  34 import textwrap
  35
  36 import socket
  37
  38 from optparse import OptionParser
  39 from ConfigParser import SafeConfigParser
  40
  41 from base64 import b64encode
  42 import md5
  43
  44 import cgi
  45 import dbm
  46
  47 from HTMLParser import HTMLParser
  48
  49 entities = {
  50     "amp": "&",
  51     "lt": "<",
  52     "gt": ">",
  53     "pound": "£",
  54     "copy": "©",
  55     "apos": "'",
  56     "quote": "\"",
  57     "nbsp": " ",
  58     }
  59
  60 class HTML2Text(HTMLParser):
  61
  62     def __init__(self):
  63         self.inheadingone = False
  64         self.inheadingtwo = False
  65         self.inotherheading = False
  66         self.inparagraph = True
  67         self.inblockquote = False
  68         self.inlink = False
  69         self.text = u''
  70         self.currentparagraph = u''
  71         self.headingtext = u''
  72         self.blockquote = u''
  73         self.inpre = False
  74         HTMLParser.__init__(self)
  75
  76     def handle_starttag(self, tag, attrs):
  77         if tag.lower() == "h1":
  78             self.inheadingone = True
  79             self.inparagraph = False
  80         elif tag.lower() == "h2":
  81             self.inheadingtwo = True
  82             self.inparagraph = False
  83         elif tag.lower() in ["h3", "h4", "h5", "h6"]:
  84             self.inotherheading = True
  85             self.inparagraph = False
  86         elif tag.lower() == "a":
  87             self.inlink = True
  88         elif tag.lower() == "br":
  89             if self.inparagraph:
  90                 self.text = self.text + "\n".join(textwrap.wrap(self.currentparagraph, 70)).encode('utf-8') + "\n"
  91                 self.currentparagraph = ""
  92             elif self.inblockquote:
  93                 self.text = self.text + "\n> " + "\n> ".join([a.strip() for a in textwrap.wrap(self.blockquote, 68)]).encode("utf-8") + "\n"
  94                 self.blockquote = u''
  95             else:
  96                 self.text = self.text + "\n"
  97         elif tag.lower() == "blockquote":
  98             self.inblockquote = True
  99             self.text = self.text + "\n"
 100         elif tag.lower() == "p":
 101             if self.text != "":
 102                 self.text = self.text + "\n\n"
 103             if self.inparagraph:
 104                 self.text = self.text + "\n".join(textwrap.wrap(self.currentparagraph, 70)).encode("utf-8")
 105             self.currentparagraph = u''
 106             self.inparagraph = True
 107         elif tag.lower() == "pre":
 108             self.text = self.text + "\n"
 109             self.inpre = True
 110             self.inparagraph = False
 111             self.inblockquote = False
 112
 113     def handle_startendtag(self, tag, attrs):
 114         if tag.lower() == "br":
 115             if self.inparagraph:
 116                 self.text = self.text + "\n".join(textwrap.wrap(self.currentparagraph, 70)).encode("utf-8") + "\n"
 117                 self.currentparagraph = u''
 118             elif self.inblockquote:
 119                 self.text = self.text + "\n> " + "\n> ".join([a.strip() for a in textwrap.wrap(self.blockquote, 68)]).encode("utf-8") + "\n"
 120                 self.blockquote = ""
 121             else:
 122                 self.text = self.text + "\n"
 123
 124     def handle_endtag(self, tag):
 125         if tag.lower() == "h1":
 126             self.inheadingone = False
 127             self.text = self.text + "\n\n" + self.headingtext + "\n" + "=" * len(self.headingtext.strip())
 128             self.headingtext = u''
 129         elif tag.lower() == "h2":
 130             self.inheadingtwo = False
 131             self.text = self.text + "\n\n" + self.headingtext + "\n" + "-" * len(self.headingtext.strip())
 132             self.headingtext = u''
 133         elif tag.lower() in ["h3", "h4", "h5", "h6"]:
 134             self.inotherheading = False
 135             self.text = self.text + "\n\n" + self.headingtext + "\n" + "~" * len(self.headingtext.strip())
 136             self.headingtext = u''
 137         elif tag.lower() == "p":
 138             self.text = self.text + "\n".join(textwrap.wrap(self.currentparagraph, 70))
 139             self.inparagraph = False
 140         elif tag.lower() == "blockquote":
 141             self.text = self.text + "\n> " + "\n> ".join([a.strip() for a in textwrap.wrap(self.blockquote, 68)]).encode("utf-8") + "\n"
 142             self.inblockquote = False
 143             self.blockquote = u''
 144         elif tag.lower() == "pre":
 145             self.inpre = False
 146
 147     def handle_data(self, data):
 148         if self.inheadingone or self.inheadingtwo or self.inotherheading:
 149             self.headingtext = self.headingtext + unicode(data, "utf-8").strip() + u' '
 150         elif self.inblockquote:
 151             self.blockquote = self.blockquote + unicode(data, "utf-8").strip() + u' '
 152         elif self.inparagraph:
 153             self.currentparagraph = self.currentparagraph + unicode(data, "utf-8").strip() + u' '
 154         elif self.inpre:
 155             self.text = self.text + data.encode("utf-8")
 156         else:
 157             self.text = self.text + unicode(data, "utf-8").strip() + u' '
 158
 159     def handle_entityref(self, name):
 160         entity = name
 161         if entities.has_key(name.lower()):
 162             entity = entities[name.lower()]
 163         elif name[0] == "#":
 164             entity = unichr(int(name[1:]))
 165         else:
 166             entity = "&" + name + ";"
 167
 168         if self.inparagraph:
 169             self.currentparagraph = self.currentparagraph + entity
 170         elif self.inblockquote:
 171             self.blockquote = self.blockquote + entity
 172         else:
 173             self.text = self.text + entity
 174
 175     def gettext(self):
 176         data = self.text
 177         if self.inparagraph:
 178             data = data + "\n".join(textwrap.wrap(self.currentparagraph, 70))
 179         return data
 180
 181 def parse_and_deliver(maildir, url, statedir):
 182     md = mailbox.Maildir(maildir)
 183     fp = feedparser.parse(url)
 184     db = dbm.open(os.path.join(statedir, "seen"), "c")
 185     for item in fp["items"]:
 186         # have we seen it before?
 187         # need to work out what the content is first...
 188
 189         if item.has_key("content"):
 190             content = item["content"][0]["value"]
 191         else:
 192             content = item["summary"]
 193
 194         md5sum = md5.md5(content.encode("utf-8")).hexdigest()
 195
 196         if db.has_key(url + "|" + item["link"]):
 197             data = db[url + "|" + item["link"]]
 198             data = cgi.parse_qs(data)
 199             if data["contentmd5"][0] == md5sum:
 200                 continue
 201
 202         try:
 203             author = item["author"]
 204         except:
 205             author = url
 206
 207         # create a basic email message
 208         msg = MIMEMultipart("alternative")
 209         messageid = "<" + datetime.datetime.now().strftime("%Y%m%d%H%M") + "." + "".join([random.choice(string.ascii_letters + string.digits) for a in range(0,6)]) + "@" + socket.gethostname() + ">"
 210         msg.add_header("Message-ID", messageid)
 211         msg.set_unixfrom("\"%s\" <rss2maildir@localhost>" %(url))
 212         msg.add_header("From", "\"%s\" <rss2maildir@localhost>" %(author))
 213         msg.add_header("To", "\"%s\" <rss2maildir@localhost>" %(url))
 214         createddate = datetime.datetime(*item["updated_parsed"][0:6]).strftime("%a, %e %b %Y %T -0000")
 215         msg.add_header("Date", createddate)
 216         msg.add_header("Subject", item["title"])
 217         msg.set_default_type("text/plain")
 218
 219         htmlpart = MIMEText(content.encode("utf-8"), "html", "utf-8")
 220         textparser = HTML2Text()
 221         textparser.feed(content.encode("utf-8"))
 222         textcontent = textparser.gettext()
 223         textpart = MIMEText(textcontent.encode("utf-8"), "plain", "utf-8")
 224         msg.attach(textpart)
 225         msg.attach(htmlpart)
 226
 227         # start by working out the filename we should be writting to, we do
 228         # this following the normal maildir style rules
 229         fname = str(os.getpid()) + "." + socket.gethostname() + "." + "".join([random.choice(string.ascii_letters + string.digits) for a in range(0,10)]) + "." + datetime.datetime.now().strftime('%s')
 230         fn = os.path.join(maildir, "tmp", fname)
 231         fh = open(fn, "w")
 232         fh.write(msg.as_string())
 233         fh.close()
 234         # now move it in to the new directory
 235         newfn = os.path.join(maildir, "new", fname)
 236         os.link(fn, newfn)
 237         os.unlink(fn)
 238
 239         # now add to the database about the item
 240         data = urllib.urlencode((("message-id", messageid), ("created", createddate), ("contentmd5", md5sum)))
 241         db[url + "|" + item["link"]] = data
 242
 243     db.close()
 244
 245 # first off, parse the command line arguments
 246
 247 oparser = OptionParser()
 248 oparser.add_option(
 249     "-c", "--conf", dest="conf",
 250     help="location of config file"
 251     )
 252 oparser.add_option(
 253     "-s", "--statedir", dest="statedir",
 254     help="location of directory to store state in"
 255     )
 256
 257 (options, args) = oparser.parse_args()
 258
 259 # check for the configfile
 260
 261 configfile = None
 262
 263 if options.conf != None:
 264     # does the file exist?
 265     try:
 266         os.stat(options.conf)
 267         configfile = options.conf
 268     except:
 269         # should exit here as the specified file doesn't exist
 270         sys.stderr.write("Config file %s does not exist. Exiting.\n" %(options.conf,))
 271         sys.exit(2)
 272 else:
 273     # check through the default locations
 274     try:
 275         os.stat("%s/.rss2maildir.conf" %(os.environ["HOME"],))
 276         configfile = "%s/.rss2maildir.conf" %(os.environ["HOME"],)
 277     except:
 278         try:
 279             os.stat("/etc/rss2maildir.conf")
 280             configfile = "/etc/rss2maildir.conf"
 281         except:
 282             sys.stderr.write("No config file found. Exiting.\n")
 283             sys.exit(2)
 284
 285 # Right - if we've got this far, we've got a config file, now for the hard
 286 # bits...
 287
 288 scp = SafeConfigParser()
 289 scp.read(configfile)
 290
 291 maildir_root = "RSSMaildir"
 292 state_dir = "state"
 293
 294 if options.statedir != None:
 295     state_dir = options.statedir
 296     try:
 297         mode = os.stat(state_dir)[stat.ST_MODE]
 298         if not stat.S_ISDIR(mode):
 299             sys.stderr.write("State directory (%s) is not a directory\n" %(state_dir))
 300             sys.exit(1)
 301     except:
 302         # try to make the directory
 303         try:
 304             os.mkdir(state_dir)
 305         except:
 306             sys.stderr.write("Couldn't create statedir %s" %(state_dir))
 307             sys.exit(1)
 308 elif scp.has_option("general", "state_dir"):
 309     new_state_dir = scp.get("general", "state_dir")
 310     try:
 311         mode = os.stat(state_dir)[stat.ST_MODE]
 312         if not stat.S_ISDIR(mode):
 313             sys.stderr.write("State directory (%s) is not a directory\n" %(state_dir))
 314             sys.exit(1)
 315     except:
 316         # try to create it
 317         try:
 318             os.mkdir(new_state_dir)
 319             state_dir = new_state_dir
 320         except:
 321             sys.stderr.write("Couldn't create state directory %s\n" %(new_state_dir))
 322             sys.exit(1)
 323 else:
 324     try:
 325         mode = os.stat(state_dir)[stat.ST_MODE]
 326         if not stat.S_ISDIR(mode):
 327             sys.stderr.write("State directory %s is not a directory\n" %(state_dir))
 328             sys.exit(1)
 329     except:
 330         try:
 331             os.mkdir(state_dir)
 332         except:
 333             sys.stderr.write("State directory %s could not be created\n" %(state_dir))
 334             sys.exit(1)
 335
 336 if scp.has_option("general", "maildir_root"):
 337     maildir_root = scp.get("general", "maildir_root")
 338
 339 try:
 340     mode = os.stat(maildir_root)[stat.ST_MODE]
 341     if not stat.S_ISDIR(mode):
 342         sys.stderr.write("Maildir Root %s is not a directory\n" %(maildir_root))
 343         sys.exit(1)
 344 except:
 345     try:
 346         os.mkdir(maildir_root)
 347     except:
 348         sys.stderr.write("Couldn't create Maildir Root %s\n" %(maildir_root))
 349         sys.exit(1)
 350
 351 feeds = scp.sections()
 352 try:
 353     feeds.remove("general")
 354 except:
 355     pass
 356
 357 for section in feeds:
 358     # check if the directory exists
 359     maildir = None
 360     try:
 361         maildir = scp.get(section, "maildir")
 362     except:
 363         maildir = section
 364
 365     maildir = urllib.urlencode(((section, maildir),)).split("=")[1]
 366     maildir = os.path.join(maildir_root, maildir)
 367
 368     try:
 369         exists = os.stat(maildir)
 370         if stat.S_ISDIR(exists[stat.ST_MODE]):
 371             # check if there's a new, cur and tmp directory
 372             try:
 373                 mode = os.stat(os.path.join(maildir, "cur"))[stat.ST_MODE]
 374             except:
 375                 os.mkdir(os.path.join(maildir, "cur"))
 376                 if not stat.S_ISDIR(mode):
 377                     sys.stderr.write("Broken maildir: %s\n" %(maildir))
 378             try:
 379                 mode = os.stat(os.path.join(maildir, "tmp"))[stat.ST_MODE]
 380             except:
 381                 os.mkdir(os.path.join(maildir, "tmp"))
 382                 if not stat.S_ISDIR(mode):
 383                     sys.stderr.write("Broken maildir: %s\n" %(maildir))
 384             try:
 385                 mode = os.stat(os.path.join(maildir, "new"))[stat.ST_MODE]
 386                 if not stat.S_ISDIR(mode):
 387                     sys.stderr.write("Broken maildir: %s\n" %(maildir))
 388             except:
 389                 os.mkdir(os.path.join(maildir, "new"))
 390         else:
 391             sys.stderr.write("Broken maildir: %s\n" %(maildir))
 392     except:
 393         try:
 394             os.mkdir(maildir)
 395         except:
 396             sys.stderr.write("Couldn't create root maildir %s\n" %(maildir))
 397             sys.exit(1)
 398         try:
 399             os.mkdir(os.path.join(maildir, "new"))
 400             os.mkdir(os.path.join(maildir, "cur"))
 401             os.mkdir(os.path.join(maildir, "tmp"))
 402         except:
 403             sys.stderr.write("Couldn't create required maildir directories for %s\n" %(section,))
 404             sys.exit(1)
 405
 406     # right - we've got the directories, we've got the section, we know the
 407     # url... lets play!
 408
 409     parse_and_deliver(maildir, section, state_dir)