rss2maildir.py

   1 #!/usr/bin/python
   2 # coding=utf-8
   3
   4 # rss2maildir.py - RSS feeds to Maildir 1 email per item
   5 # Copyright (C) 2007  Brett Parker <iDunno@sommitrealweird.co.uk>
   6 #
   7 # This program is free software: you can redistribute it and/or modify
   8 # it under the terms of the GNU General Public License as published by
   9 # the Free Software Foundation, either version 3 of the License, or
  10 # (at your option) any later version.
  11 #
  12 # This program is distributed in the hope that it will be useful,
  13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15 # GNU General Public License for more details.
  16 #
  17 # You should have received a copy of the GNU General Public License
  18 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
  19
  20 import sys
  21 import os
  22 import stat
  23 import urllib
  24
  25 import feedparser
  26
  27 from email.MIMEMultipart import MIMEMultipart
  28 from email.MIMEText import MIMEText
  29
  30 import datetime
  31 import random
  32 import string
  33 import textwrap
  34
  35 import socket
  36
  37 from optparse import OptionParser
  38 from ConfigParser import SafeConfigParser
  39
  40 from base64 import b64encode
  41 import md5
  42
  43 import cgi
  44 import dbm
  45
  46 from HTMLParser import HTMLParser
  47
  48 entities = {
  49     "amp": "&",
  50     "lt": "<",
  51     "gt": ">",
  52     "pound": "£",
  53     "copy": "©",
  54     "apos": "'",
  55     "quote": "\"",
  56     "nbsp": " ",
  57     }
  58
  59 class HTML2Text(HTMLParser):
  60
  61     def __init__(self):
  62         self.inheadingone = False
  63         self.inheadingtwo = False
  64         self.inotherheading = False
  65         self.inparagraph = True
  66         self.inblockquote = False
  67         self.inlink = False
  68         self.text = u''
  69         self.currentparagraph = u''
  70         self.headingtext = u''
  71         self.blockquote = u''
  72         self.inpre = False
  73         self.inul = False
  74         self.initem = False
  75         self.item = u''
  76         HTMLParser.__init__(self)
  77
  78     def handle_starttag(self, tag, attrs):
  79         if tag.lower() == "h1":
  80             self.inheadingone = True
  81             self.inparagraph = False
  82         elif tag.lower() == "h2":
  83             self.inheadingtwo = True
  84             self.inparagraph = False
  85         elif tag.lower() in ["h3", "h4", "h5", "h6"]:
  86             self.inotherheading = True
  87             self.inparagraph = False
  88         elif tag.lower() == "a":
  89             self.inlink = True
  90         elif tag.lower() == "br":
  91             self.handle_br()
  92         elif tag.lower() == "blockquote":
  93             self.inblockquote = True
  94             self.text = self.text + u'\n'
  95         elif tag.lower() == "p":
  96             if self.text != "":
  97                 self.text = self.text + u'\n\n'
  98             if self.inparagraph:
  99                 self.text = self.text \
 100                     + u'\n'.join(textwrap.wrap(self.currentparagraph, 70))
 101             self.currentparagraph = u''
 102             self.inparagraph = True
 103         elif tag.lower() == "pre":
 104             self.text = self.text + "\n"
 105             self.inpre = True
 106             self.inparagraph = False
 107             self.inblockquote = False
 108         elif tag.lower() == "ul":
 109             self.item = u''
 110             self.inul = True
 111             self.text = self.text + "\n"
 112         elif tag.lower() == "li" and self.inul:
 113             if not self.initem:
 114                 self.initem = True
 115                 self.item = u''
 116             else:
 117                 self.text = self.text \
 118                     + u' * ' \
 119                     + u'\n   '.join([a.strip() for a in \
 120                         textwrap.wrap(self.item, 67)]) \
 121                     + u'\n'
 122                 self.item = u''
 123
 124     def handle_startendtag(self, tag, attrs):
 125         if tag.lower() == "br":
 126             self.handle_br()
 127
 128     def handle_br(self):
 129             if self.inparagraph:
 130                 self.text = self.text \
 131                 + u'\n'.join( \
 132                     [a \
 133                         for a in textwrap.wrap( \
 134                             self.currentparagraph, 70) \
 135                     ] \
 136                 ) \
 137                 + u'\n'
 138                 self.currentparagraph = u''
 139             elif self.inblockquote:
 140                 self.text = self.text \
 141                     + u'\n> ' \
 142                     + u'\n> '.join( \
 143                         [a \
 144                             for a in textwrap.wrap( \
 145                                 self.blockquote.encode("utf-8") \
 146                                 , 68) \
 147                         ] \
 148                     ) \
 149                     + u'\n'
 150                 self.blockquote = u''
 151             else:
 152                 self.text = self.text + "\n"
 153
 154     def handle_endtag(self, tag):
 155         if tag.lower() == "h1":
 156             self.inheadingone = False
 157             self.text = self.text \
 158                 + u'\n\n' \
 159                 + self.headingtext.encode("utf-8") \
 160                 + u'\n' \
 161                 + u'=' * len(self.headingtext.encode("utf-8").strip())
 162             self.headingtext = u''
 163         elif tag.lower() == "h2":
 164             self.inheadingtwo = False
 165             self.text = self.text \
 166                 + u'\n\n' \
 167                 + self.headingtext.encode("utf-8") \
 168                 + u'\n' \
 169                 + u'-' * len(self.headingtext.encode("utf-8").strip())
 170             self.headingtext = u''
 171         elif tag.lower() in ["h3", "h4", "h5", "h6"]:
 172             self.inotherheading = False
 173             self.text = self.text \
 174                 + u'\n\n' \
 175                 + self.headingtext.encode("utf-8") \
 176                 + u'\n' \
 177                 + u'~' * len(self.headingtext.encode("utf-8").strip())
 178             self.headingtext = u''
 179         elif tag.lower() == "p":
 180             self.text = self.text \
 181                 + u'\n'.join(textwrap.wrap( \
 182                     self.currentparagraph, 70) \
 183                 )
 184             self.inparagraph = False
 185             self.currentparagraph = u''
 186         elif tag.lower() == "blockquote":
 187             self.text = self.text \
 188                 + u'\n> ' \
 189                 + u'\n> '.join( \
 190                     [a.strip() \
 191                         for a in textwrap.wrap( \
 192                             self.blockquote, 68)] \
 193                     ) \
 194                 + u'\n'
 195             self.inblockquote = False
 196             self.blockquote = u''
 197         elif tag.lower() == "pre":
 198             self.inpre = False
 199         elif tag.lower() == "li":
 200             self.initem = False
 201             if self.item != "":
 202                 self.text = self.text \
 203                     + u' * ' \
 204                     + u'\n   '.join( \
 205                         [a.strip() for a in textwrap.wrap(self.item, 67)]) \
 206                     + u'\n'
 207             self.item = u''
 208         elif tag.lower() == "ul":
 209             self.inul = False
 210
 211     def handle_data(self, data):
 212         if self.inheadingone or self.inheadingtwo or self.inotherheading:
 213             self.headingtext = self.headingtext \
 214                 + unicode(data, "utf-8").strip() \
 215                 + u' '
 216         elif self.inblockquote:
 217             self.blockquote = self.blockquote \
 218                 + unicode(data, "utf-8").strip() \
 219                 + u' '
 220         elif self.inparagraph:
 221             self.currentparagraph = self.currentparagraph \
 222                 + unicode(data, "utf-8").strip() \
 223                 + u' '
 224         elif self.inul and self.initem:
 225             self.item = self.item + unicode(data, "utf-8")
 226         elif self.inpre:
 227             self.text = self.text + unicode(data, "utf-8")
 228         else:
 229             self.text = self.text + unicode(data, "utf-8").strip() + u' '
 230
 231     def handle_entityref(self, name):
 232         entity = name
 233         if entities.has_key(name.lower()):
 234             entity = entities[name.lower()]
 235         elif name[0] == "#":
 236             entity = unichr(int(name[1:]))
 237         else:
 238             entity = "&" + name + ";"
 239
 240         if self.inparagraph:
 241             self.currentparagraph = self.currentparagraph \
 242                 + unicode(entity, "utf-8")
 243         elif self.inblockquote:
 244             self.blockquote = self.blockquote + unicode(entity, "utf-8")
 245         else:
 246             self.text = self.text + unicode(entity, "utf-8")
 247
 248     def gettext(self):
 249         data = self.text
 250         if self.inparagraph:
 251             data = data + "\n".join(textwrap.wrap(self.currentparagraph, 70))
 252         return data
 253
 254 def parse_and_deliver(maildir, url, statedir):
 255     fp = feedparser.parse(url)
 256     db = dbm.open(os.path.join(statedir, "seen"), "c")
 257     for item in fp["items"]:
 258         # have we seen it before?
 259         # need to work out what the content is first...
 260
 261         if item.has_key("content"):
 262             content = item["content"][0]["value"]
 263         else:
 264             content = item["summary"]
 265
 266         md5sum = md5.md5(content.encode("utf-8")).hexdigest()
 267
 268         prevmessageid = None
 269
 270         if db.has_key(url + "|" + item["link"]):
 271             data = db[url + "|" + item["link"]]
 272             data = cgi.parse_qs(data)
 273             if data.has_key("message-id"):
 274                 prevmessageid = data["message-id"][0]
 275             if data["contentmd5"][0] == md5sum:
 276                 continue
 277
 278         try:
 279             author = item["author"]
 280         except:
 281             author = url
 282
 283         # create a basic email message
 284         msg = MIMEMultipart("alternative")
 285         messageid = "<" \
 286             + datetime.datetime.now().strftime("%Y%m%d%H%M") \
 287             + "." \
 288             + "".join( \
 289                 [random.choice( \
 290                     string.ascii_letters + string.digits \
 291                     ) for a in range(0,6) \
 292                 ]) + "@" + socket.gethostname() + ">"
 293         msg.add_header("Message-ID", messageid)
 294         msg.set_unixfrom("\"%s\" <rss2maildir@localhost>" %(url))
 295         msg.add_header("From", "\"%s\" <rss2maildir@localhost>" %(author))
 296         msg.add_header("To", "\"%s\" <rss2maildir@localhost>" %(url))
 297         if prevmessageid:
 298             msg.add_header("References", prevmessageid)
 299         createddate = datetime.datetime(*item["updated_parsed"][0:6]) \
 300             .strftime("%a, %e %b %Y %T -0000")
 301         msg.add_header("Date", createddate)
 302         msg.add_header("Subject", item["title"])
 303         msg.set_default_type("text/plain")
 304
 305         htmlpart = MIMEText(content.encode("utf-8"), "html", "utf-8")
 306         textparser = HTML2Text()
 307         textparser.feed(content.encode("utf-8"))
 308         textcontent = textparser.gettext()
 309         textpart = MIMEText(textcontent.encode("utf-8"), "plain", "utf-8")
 310         msg.attach(textpart)
 311         msg.attach(htmlpart)
 312
 313         # start by working out the filename we should be writting to, we do
 314         # this following the normal maildir style rules
 315         fname = str(os.getpid()) \
 316             + "." + socket.gethostname() \
 317             + "." + "".join( \
 318                 [random.choice( \
 319                     string.ascii_letters + string.digits \
 320                     ) for a in range(0,10) \
 321                 ]) + "." \
 322             + datetime.datetime.now().strftime('%s')
 323         fn = os.path.join(maildir, "tmp", fname)
 324         fh = open(fn, "w")
 325         fh.write(msg.as_string())
 326         fh.close()
 327         # now move it in to the new directory
 328         newfn = os.path.join(maildir, "new", fname)
 329         os.link(fn, newfn)
 330         os.unlink(fn)
 331
 332         # now add to the database about the item
 333         if prevmessageid:
 334             messageid = prevmessageid + " " + messageid
 335         data = urllib.urlencode((
 336             ("message-id", messageid), \
 337             ("created", createddate), \
 338             ("contentmd5", md5sum) \
 339             ))
 340         db[url + "|" + item["link"]] = data
 341
 342     db.close()
 343
 344 # first off, parse the command line arguments
 345
 346 oparser = OptionParser()
 347 oparser.add_option(
 348     "-c", "--conf", dest="conf",
 349     help="location of config file"
 350     )
 351 oparser.add_option(
 352     "-s", "--statedir", dest="statedir",
 353     help="location of directory to store state in"
 354     )
 355
 356 (options, args) = oparser.parse_args()
 357
 358 # check for the configfile
 359
 360 configfile = None
 361
 362 if options.conf != None:
 363     # does the file exist?
 364     try:
 365         os.stat(options.conf)
 366         configfile = options.conf
 367     except:
 368         # should exit here as the specified file doesn't exist
 369         sys.stderr.write( \
 370             "Config file %s does not exist. Exiting.\n" %(options.conf,))
 371         sys.exit(2)
 372 else:
 373     # check through the default locations
 374     try:
 375         os.stat("%s/.rss2maildir.conf" %(os.environ["HOME"],))
 376         configfile = "%s/.rss2maildir.conf" %(os.environ["HOME"],)
 377     except:
 378         try:
 379             os.stat("/etc/rss2maildir.conf")
 380             configfile = "/etc/rss2maildir.conf"
 381         except:
 382             sys.stderr.write("No config file found. Exiting.\n")
 383             sys.exit(2)
 384
 385 # Right - if we've got this far, we've got a config file, now for the hard
 386 # bits...
 387
 388 scp = SafeConfigParser()
 389 scp.read(configfile)
 390
 391 maildir_root = "RSSMaildir"
 392 state_dir = "state"
 393
 394 if options.statedir != None:
 395     state_dir = options.statedir
 396     try:
 397         mode = os.stat(state_dir)[stat.ST_MODE]
 398         if not stat.S_ISDIR(mode):
 399             sys.stderr.write( \
 400                 "State directory (%s) is not a directory\n" %(state_dir))
 401             sys.exit(1)
 402     except:
 403         # try to make the directory
 404         try:
 405             os.mkdir(state_dir)
 406         except:
 407             sys.stderr.write("Couldn't create statedir %s" %(state_dir))
 408             sys.exit(1)
 409 elif scp.has_option("general", "state_dir"):
 410     new_state_dir = scp.get("general", "state_dir")
 411     try:
 412         mode = os.stat(state_dir)[stat.ST_MODE]
 413         if not stat.S_ISDIR(mode):
 414             sys.stderr.write( \
 415                 "State directory (%s) is not a directory\n" %(state_dir))
 416             sys.exit(1)
 417     except:
 418         # try to create it
 419         try:
 420             os.mkdir(new_state_dir)
 421             state_dir = new_state_dir
 422         except:
 423             sys.stderr.write( \
 424                 "Couldn't create state directory %s\n" %(new_state_dir))
 425             sys.exit(1)
 426 else:
 427     try:
 428         mode = os.stat(state_dir)[stat.ST_MODE]
 429         if not stat.S_ISDIR(mode):
 430             sys.stderr.write( \
 431                 "State directory %s is not a directory\n" %(state_dir))
 432             sys.exit(1)
 433     except:
 434         try:
 435             os.mkdir(state_dir)
 436         except:
 437             sys.stderr.write( \
 438                 "State directory %s could not be created\n" %(state_dir))
 439             sys.exit(1)
 440
 441 if scp.has_option("general", "maildir_root"):
 442     maildir_root = scp.get("general", "maildir_root")
 443
 444 try:
 445     mode = os.stat(maildir_root)[stat.ST_MODE]
 446     if not stat.S_ISDIR(mode):
 447         sys.stderr.write( \
 448             "Maildir Root %s is not a directory\n" \
 449             %(maildir_root))
 450         sys.exit(1)
 451 except:
 452     try:
 453         os.mkdir(maildir_root)
 454     except:
 455         sys.stderr.write("Couldn't create Maildir Root %s\n" %(maildir_root))
 456         sys.exit(1)
 457
 458 feeds = scp.sections()
 459 try:
 460     feeds.remove("general")
 461 except:
 462     pass
 463
 464 for section in feeds:
 465     # check if the directory exists
 466     maildir = None
 467     try:
 468         maildir = scp.get(section, "maildir")
 469     except:
 470         maildir = section
 471
 472     maildir = urllib.urlencode(((section, maildir),)).split("=")[1]
 473     maildir = os.path.join(maildir_root, maildir)
 474
 475     try:
 476         exists = os.stat(maildir)
 477         if stat.S_ISDIR(exists[stat.ST_MODE]):
 478             # check if there's a new, cur and tmp directory
 479             try:
 480                 mode = os.stat(os.path.join(maildir, "cur"))[stat.ST_MODE]
 481             except:
 482                 os.mkdir(os.path.join(maildir, "cur"))
 483                 if not stat.S_ISDIR(mode):
 484                     sys.stderr.write("Broken maildir: %s\n" %(maildir))
 485             try:
 486                 mode = os.stat(os.path.join(maildir, "tmp"))[stat.ST_MODE]
 487             except:
 488                 os.mkdir(os.path.join(maildir, "tmp"))
 489                 if not stat.S_ISDIR(mode):
 490                     sys.stderr.write("Broken maildir: %s\n" %(maildir))
 491             try:
 492                 mode = os.stat(os.path.join(maildir, "new"))[stat.ST_MODE]
 493                 if not stat.S_ISDIR(mode):
 494                     sys.stderr.write("Broken maildir: %s\n" %(maildir))
 495             except:
 496                 os.mkdir(os.path.join(maildir, "new"))
 497         else:
 498             sys.stderr.write("Broken maildir: %s\n" %(maildir))
 499     except:
 500         try:
 501             os.mkdir(maildir)
 502         except:
 503             sys.stderr.write("Couldn't create root maildir %s\n" %(maildir))
 504             sys.exit(1)
 505         try:
 506             os.mkdir(os.path.join(maildir, "new"))
 507             os.mkdir(os.path.join(maildir, "cur"))
 508             os.mkdir(os.path.join(maildir, "tmp"))
 509         except:
 510             sys.stderr.write( \
 511                 "Couldn't create required maildir directories for %s\n" \
 512                 %(section,))
 513             sys.exit(1)
 514
 515     # right - we've got the directories, we've got the section, we know the
 516     # url... lets play!
 517
 518     parse_and_deliver(maildir, section, state_dir)