rss2maildir.py

   1 #!/usr/bin/python
   2 # coding=utf-8
   3
   4 # rss2maildir.py - RSS feeds to Maildir 1 email per item
   5 # Copyright (C) 2007  Brett Parker <iDunno@sommitrealweird.co.uk>
   6 #
   7 # This program is free software: you can redistribute it and/or modify
   8 # it under the terms of the GNU General Public License as published by
   9 # the Free Software Foundation, either version 3 of the License, or
  10 # (at your option) any later version.
  11 #
  12 # This program is distributed in the hope that it will be useful,
  13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15 # GNU General Public License for more details.
  16 #
  17 # You should have received a copy of the GNU General Public License
  18 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
  19
  20 import sys
  21 import os
  22 import stat
  23 import httplib
  24 import urllib
  25
  26 import feedparser
  27
  28 from email.MIMEMultipart import MIMEMultipart
  29 from email.MIMEText import MIMEText
  30
  31 import datetime
  32 import random
  33 import string
  34 import textwrap
  35
  36 import socket
  37
  38 from optparse import OptionParser
  39 from ConfigParser import SafeConfigParser
  40
  41 from base64 import b64encode
  42 import md5
  43
  44 import cgi
  45 import dbm
  46
  47 from HTMLParser import HTMLParser
  48
  49 entities = {
  50     "amp": "&",
  51     "lt": "<",
  52     "gt": ">",
  53     "pound": "£",
  54     "copy": "©",
  55     "apos": "'",
  56     "quote": "\"",
  57     "nbsp": " ",
  58     }
  59
  60 class HTML2Text(HTMLParser):
  61
  62     def __init__(self):
  63         self.inheadingone = False
  64         self.inheadingtwo = False
  65         self.inotherheading = False
  66         self.inparagraph = True
  67         self.inblockquote = False
  68         self.inlink = False
  69         self.text = u''
  70         self.currentparagraph = u''
  71         self.headingtext = u''
  72         self.blockquote = u''
  73         self.inpre = False
  74         self.inul = False
  75         self.initem = False
  76         self.item = u''
  77         HTMLParser.__init__(self)
  78
  79     def handle_starttag(self, tag, attrs):
  80         if tag.lower() == "h1":
  81             self.inheadingone = True
  82             self.inparagraph = False
  83         elif tag.lower() == "h2":
  84             self.inheadingtwo = True
  85             self.inparagraph = False
  86         elif tag.lower() in ["h3", "h4", "h5", "h6"]:
  87             self.inotherheading = True
  88             self.inparagraph = False
  89         elif tag.lower() == "a":
  90             self.inlink = True
  91         elif tag.lower() == "br":
  92             self.handle_br()
  93         elif tag.lower() == "blockquote":
  94             self.inblockquote = True
  95             self.text = self.text + u'\n'
  96         elif tag.lower() == "p":
  97             if self.text != "":
  98                 self.text = self.text + u'\n\n'
  99             if self.inparagraph:
 100                 self.text = self.text \
 101                     + u'\n'.join(textwrap.wrap(self.currentparagraph, 70))
 102             self.currentparagraph = u''
 103             self.inparagraph = True
 104         elif tag.lower() == "pre":
 105             self.text = self.text + "\n"
 106             self.inpre = True
 107             self.inparagraph = False
 108             self.inblockquote = False
 109         elif tag.lower() == "ul":
 110             self.item = u''
 111             self.inul = True
 112             self.text = self.text + "\n"
 113         elif tag.lower() == "li" and self.inul:
 114             if not self.initem:
 115                 self.initem = True
 116                 self.item = u''
 117             else:
 118                 self.text = self.text \
 119                     + u' * ' \
 120                     + u'\n   '.join([a.strip() for a in \
 121                         textwrap.wrap(self.item, 67)]) \
 122                     + u'\n'
 123                 self.item = u''
 124
 125     def handle_startendtag(self, tag, attrs):
 126         if tag.lower() == "br":
 127             self.handle_br()
 128
 129     def handle_br(self):
 130             if self.inparagraph:
 131                 self.text = self.text \
 132                 + u'\n'.join( \
 133                     [a \
 134                         for a in textwrap.wrap( \
 135                             self.currentparagraph, 70) \
 136                     ] \
 137                 ) \
 138                 + u'\n'
 139                 self.currentparagraph = u''
 140             elif self.inblockquote:
 141                 self.text = self.text \
 142                     + u'\n> ' \
 143                     + u'\n> '.join( \
 144                         [a \
 145                             for a in textwrap.wrap( \
 146                                 self.blockquote.encode("utf-8") \
 147                                 , 68) \
 148                         ] \
 149                     ) \
 150                     + u'\n'
 151                 self.blockquote = u''
 152             else:
 153                 self.text = self.text + "\n"
 154
 155     def handle_endtag(self, tag):
 156         if tag.lower() == "h1":
 157             self.inheadingone = False
 158             self.text = self.text \
 159                 + u'\n\n' \
 160                 + self.headingtext.encode("utf-8") \
 161                 + u'\n' \
 162                 + u'=' * len(self.headingtext.encode("utf-8").strip())
 163             self.headingtext = u''
 164         elif tag.lower() == "h2":
 165             self.inheadingtwo = False
 166             self.text = self.text \
 167                 + u'\n\n' \
 168                 + self.headingtext.encode("utf-8") \
 169                 + u'\n' \
 170                 + u'-' * len(self.headingtext.encode("utf-8").strip())
 171             self.headingtext = u''
 172         elif tag.lower() in ["h3", "h4", "h5", "h6"]:
 173             self.inotherheading = False
 174             self.text = self.text \
 175                 + u'\n\n' \
 176                 + self.headingtext.encode("utf-8") \
 177                 + u'\n' \
 178                 + u'~' * len(self.headingtext.encode("utf-8").strip())
 179             self.headingtext = u''
 180         elif tag.lower() == "p":
 181             self.text = self.text \
 182                 + u'\n'.join(textwrap.wrap( \
 183                     self.currentparagraph, 70) \
 184                 )
 185             self.inparagraph = False
 186             self.currentparagraph = u''
 187         elif tag.lower() == "blockquote":
 188             self.text = self.text \
 189                 + u'\n> ' \
 190                 + u'\n> '.join( \
 191                     [a.strip() \
 192                         for a in textwrap.wrap( \
 193                             self.blockquote, 68)] \
 194                     ) \
 195                 + u'\n'
 196             self.inblockquote = False
 197             self.blockquote = u''
 198         elif tag.lower() == "pre":
 199             self.inpre = False
 200         elif tag.lower() == "li":
 201             self.initem = False
 202             if self.item != "":
 203                 self.text = self.text \
 204                     + u' * ' \
 205                     + u'\n   '.join( \
 206                         [a.strip() for a in textwrap.wrap(self.item, 67)]) \
 207                     + u'\n'
 208             self.item = u''
 209         elif tag.lower() == "ul":
 210             self.inul = False
 211
 212     def handle_data(self, data):
 213         if self.inheadingone or self.inheadingtwo or self.inotherheading:
 214             self.headingtext = self.headingtext \
 215                 + unicode(data, "utf-8").strip() \
 216                 + u' '
 217         elif self.inblockquote:
 218             self.blockquote = self.blockquote \
 219                 + unicode(data, "utf-8").strip() \
 220                 + u' '
 221         elif self.inparagraph:
 222             self.currentparagraph = self.currentparagraph \
 223                 + unicode(data, "utf-8").strip() \
 224                 + u' '
 225         elif self.inul and self.initem:
 226             self.item = self.item + unicode(data, "utf-8")
 227         elif self.inpre:
 228             self.text = self.text + unicode(data, "utf-8")
 229         else:
 230             self.text = self.text + unicode(data, "utf-8").strip() + u' '
 231
 232     def handle_entityref(self, name):
 233         entity = name
 234         if entities.has_key(name.lower()):
 235             entity = entities[name.lower()]
 236         elif name[0] == "#":
 237             entity = unichr(int(name[1:]))
 238         else:
 239             entity = "&" + name + ";"
 240
 241         if self.inparagraph:
 242             self.currentparagraph = self.currentparagraph \
 243                 + unicode(entity, "utf-8")
 244         elif self.inblockquote:
 245             self.blockquote = self.blockquote + unicode(entity, "utf-8")
 246         else:
 247             self.text = self.text + unicode(entity, "utf-8")
 248
 249     def gettext(self):
 250         data = self.text
 251         if self.inparagraph:
 252             data = data + "\n".join(textwrap.wrap(self.currentparagraph, 70))
 253         return data
 254
 255 def parse_and_deliver(maildir, url, statedir):
 256     feedhandle = None
 257     headers = None
 258     # first check if we know about this feed already
 259     feeddb = dbm.open(os.path.join(statedir, "feeds"), "c")
 260     # we need all the parts of the url
 261     (type, rest) = urllib.splittype(url)
 262     (host, path) = urllib.splithost(rest)
 263     (host, port) = urllib.splitport(host)
 264     if port == None:
 265         port = 80
 266     if feeddb.has_key(url):
 267         data = feeddb[url]
 268         data = cgi.parse_qs(data)
 269         # now do a head on the feed to see if it's been updated
 270         conn = httplib.HTTPConnection("%s:%s" %(host, port))
 271         conn.request("HEAD", path)
 272         response = conn.getresponse()
 273         headers = response.getheaders()
 274         ischanged = False
 275         try:
 276             for header in headers:
 277                 if header[0] == "content-length":
 278                     if header[1] != data["content-length"][0]:
 279                         ischanged = True
 280                 elif header[0] == "etag":
 281                     if header[1] != data["etag"][0]:
 282                         ischanged = True
 283                 elif header[0] == "last-modified":
 284                     if header[1] != data["last-modified"][0]:
 285                         ischanged = True
 286                 elif header[0] == "content-md5":
 287                     if header[1] != data["content-md5"][0]:
 288                         ischanged = True
 289         except:
 290             ischanged = True
 291         if ischanged:
 292             conn = httplib.HTTPConnection("%s:%s" %(host, port))
 293             conn.request("GET", path)
 294             response = conn.getresponse()
 295             headers = response.getheaders()
 296             feedhandle = response
 297         else:
 298             return # don't need to do anything, nothings changed.
 299     else:
 300         conn = httplib.HTTPConnection("%s:%s" %(host, port))
 301         conn.request("GET", path)
 302         response = conn.getresponse()
 303         headers = response.getheaders()
 304         feedhandle = response
 305
 306     fp = feedparser.parse(feedhandle)
 307     db = dbm.open(os.path.join(statedir, "seen"), "c")
 308     for item in fp["items"]:
 309         # have we seen it before?
 310         # need to work out what the content is first...
 311
 312         if item.has_key("content"):
 313             content = item["content"][0]["value"]
 314         else:
 315             content = item["summary"]
 316
 317         md5sum = md5.md5(content.encode("utf-8")).hexdigest()
 318
 319         prevmessageid = None
 320
 321         if db.has_key(url + "|" + item["link"]):
 322             data = db[url + "|" + item["link"]]
 323             data = cgi.parse_qs(data)
 324             if data.has_key("message-id"):
 325                 prevmessageid = data["message-id"][0]
 326             if data["contentmd5"][0] == md5sum:
 327                 continue
 328
 329         try:
 330             author = item["author"]
 331         except:
 332             author = url
 333
 334         # create a basic email message
 335         msg = MIMEMultipart("alternative")
 336         messageid = "<" \
 337             + datetime.datetime.now().strftime("%Y%m%d%H%M") \
 338             + "." \
 339             + "".join( \
 340                 [random.choice( \
 341                     string.ascii_letters + string.digits \
 342                     ) for a in range(0,6) \
 343                 ]) + "@" + socket.gethostname() + ">"
 344         msg.add_header("Message-ID", messageid)
 345         msg.set_unixfrom("\"%s\" <rss2maildir@localhost>" %(url))
 346         msg.add_header("From", "\"%s\" <rss2maildir@localhost>" %(author))
 347         msg.add_header("To", "\"%s\" <rss2maildir@localhost>" %(url))
 348         if prevmessageid:
 349             msg.add_header("References", prevmessageid)
 350         createddate = datetime.datetime(*item["updated_parsed"][0:6]) \
 351             .strftime("%a, %e %b %Y %T -0000")
 352         msg.add_header("Date", createddate)
 353         msg.add_header("Subject", item["title"])
 354         msg.set_default_type("text/plain")
 355
 356         htmlpart = MIMEText(content.encode("utf-8"), "html", "utf-8")
 357         textparser = HTML2Text()
 358         textparser.feed(content.encode("utf-8"))
 359         textcontent = textparser.gettext()
 360         textpart = MIMEText(textcontent.encode("utf-8"), "plain", "utf-8")
 361         msg.attach(textpart)
 362         msg.attach(htmlpart)
 363
 364         # start by working out the filename we should be writting to, we do
 365         # this following the normal maildir style rules
 366         fname = str(os.getpid()) \
 367             + "." + socket.gethostname() \
 368             + "." + "".join( \
 369                 [random.choice( \
 370                     string.ascii_letters + string.digits \
 371                     ) for a in range(0,10) \
 372                 ]) + "." \
 373             + datetime.datetime.now().strftime('%s')
 374         fn = os.path.join(maildir, "tmp", fname)
 375         fh = open(fn, "w")
 376         fh.write(msg.as_string())
 377         fh.close()
 378         # now move it in to the new directory
 379         newfn = os.path.join(maildir, "new", fname)
 380         os.link(fn, newfn)
 381         os.unlink(fn)
 382
 383         # now add to the database about the item
 384         if prevmessageid:
 385             messageid = prevmessageid + " " + messageid
 386         data = urllib.urlencode((
 387             ("message-id", messageid), \
 388             ("created", createddate), \
 389             ("contentmd5", md5sum) \
 390             ))
 391         db[url + "|" + item["link"]] = data
 392
 393     if headers:
 394         data = []
 395         for header in headers:
 396             if header[0] in ["content-md5", "etag", "last-modified", "content-length"]:
 397                 data.append((header[0], header[1]))
 398         if len(data) > 0:
 399             data = urllib.urlencode(data)
 400             feeddb[url] = data
 401
 402     db.close()
 403     feeddb.close()
 404
 405 # first off, parse the command line arguments
 406
 407 oparser = OptionParser()
 408 oparser.add_option(
 409     "-c", "--conf", dest="conf",
 410     help="location of config file"
 411     )
 412 oparser.add_option(
 413     "-s", "--statedir", dest="statedir",
 414     help="location of directory to store state in"
 415     )
 416
 417 (options, args) = oparser.parse_args()
 418
 419 # check for the configfile
 420
 421 configfile = None
 422
 423 if options.conf != None:
 424     # does the file exist?
 425     try:
 426         os.stat(options.conf)
 427         configfile = options.conf
 428     except:
 429         # should exit here as the specified file doesn't exist
 430         sys.stderr.write( \
 431             "Config file %s does not exist. Exiting.\n" %(options.conf,))
 432         sys.exit(2)
 433 else:
 434     # check through the default locations
 435     try:
 436         os.stat("%s/.rss2maildir.conf" %(os.environ["HOME"],))
 437         configfile = "%s/.rss2maildir.conf" %(os.environ["HOME"],)
 438     except:
 439         try:
 440             os.stat("/etc/rss2maildir.conf")
 441             configfile = "/etc/rss2maildir.conf"
 442         except:
 443             sys.stderr.write("No config file found. Exiting.\n")
 444             sys.exit(2)
 445
 446 # Right - if we've got this far, we've got a config file, now for the hard
 447 # bits...
 448
 449 scp = SafeConfigParser()
 450 scp.read(configfile)
 451
 452 maildir_root = "RSSMaildir"
 453 state_dir = "state"
 454
 455 if options.statedir != None:
 456     state_dir = options.statedir
 457     try:
 458         mode = os.stat(state_dir)[stat.ST_MODE]
 459         if not stat.S_ISDIR(mode):
 460             sys.stderr.write( \
 461                 "State directory (%s) is not a directory\n" %(state_dir))
 462             sys.exit(1)
 463     except:
 464         # try to make the directory
 465         try:
 466             os.mkdir(state_dir)
 467         except:
 468             sys.stderr.write("Couldn't create statedir %s" %(state_dir))
 469             sys.exit(1)
 470 elif scp.has_option("general", "state_dir"):
 471     new_state_dir = scp.get("general", "state_dir")
 472     try:
 473         mode = os.stat(state_dir)[stat.ST_MODE]
 474         if not stat.S_ISDIR(mode):
 475             sys.stderr.write( \
 476                 "State directory (%s) is not a directory\n" %(state_dir))
 477             sys.exit(1)
 478     except:
 479         # try to create it
 480         try:
 481             os.mkdir(new_state_dir)
 482             state_dir = new_state_dir
 483         except:
 484             sys.stderr.write( \
 485                 "Couldn't create state directory %s\n" %(new_state_dir))
 486             sys.exit(1)
 487 else:
 488     try:
 489         mode = os.stat(state_dir)[stat.ST_MODE]
 490         if not stat.S_ISDIR(mode):
 491             sys.stderr.write( \
 492                 "State directory %s is not a directory\n" %(state_dir))
 493             sys.exit(1)
 494     except:
 495         try:
 496             os.mkdir(state_dir)
 497         except:
 498             sys.stderr.write( \
 499                 "State directory %s could not be created\n" %(state_dir))
 500             sys.exit(1)
 501
 502 if scp.has_option("general", "maildir_root"):
 503     maildir_root = scp.get("general", "maildir_root")
 504
 505 try:
 506     mode = os.stat(maildir_root)[stat.ST_MODE]
 507     if not stat.S_ISDIR(mode):
 508         sys.stderr.write( \
 509             "Maildir Root %s is not a directory\n" \
 510             %(maildir_root))
 511         sys.exit(1)
 512 except:
 513     try:
 514         os.mkdir(maildir_root)
 515     except:
 516         sys.stderr.write("Couldn't create Maildir Root %s\n" %(maildir_root))
 517         sys.exit(1)
 518
 519 feeds = scp.sections()
 520 try:
 521     feeds.remove("general")
 522 except:
 523     pass
 524
 525 for section in feeds:
 526     # check if the directory exists
 527     maildir = None
 528     try:
 529         maildir = scp.get(section, "maildir")
 530     except:
 531         maildir = section
 532
 533     maildir = urllib.urlencode(((section, maildir),)).split("=")[1]
 534     maildir = os.path.join(maildir_root, maildir)
 535
 536     try:
 537         exists = os.stat(maildir)
 538         if stat.S_ISDIR(exists[stat.ST_MODE]):
 539             # check if there's a new, cur and tmp directory
 540             try:
 541                 mode = os.stat(os.path.join(maildir, "cur"))[stat.ST_MODE]
 542             except:
 543                 os.mkdir(os.path.join(maildir, "cur"))
 544                 if not stat.S_ISDIR(mode):
 545                     sys.stderr.write("Broken maildir: %s\n" %(maildir))
 546             try:
 547                 mode = os.stat(os.path.join(maildir, "tmp"))[stat.ST_MODE]
 548             except:
 549                 os.mkdir(os.path.join(maildir, "tmp"))
 550                 if not stat.S_ISDIR(mode):
 551                     sys.stderr.write("Broken maildir: %s\n" %(maildir))
 552             try:
 553                 mode = os.stat(os.path.join(maildir, "new"))[stat.ST_MODE]
 554                 if not stat.S_ISDIR(mode):
 555                     sys.stderr.write("Broken maildir: %s\n" %(maildir))
 556             except:
 557                 os.mkdir(os.path.join(maildir, "new"))
 558         else:
 559             sys.stderr.write("Broken maildir: %s\n" %(maildir))
 560     except:
 561         try:
 562             os.mkdir(maildir)
 563         except:
 564             sys.stderr.write("Couldn't create root maildir %s\n" %(maildir))
 565             sys.exit(1)
 566         try:
 567             os.mkdir(os.path.join(maildir, "new"))
 568             os.mkdir(os.path.join(maildir, "cur"))
 569             os.mkdir(os.path.join(maildir, "tmp"))
 570         except:
 571             sys.stderr.write( \
 572                 "Couldn't create required maildir directories for %s\n" \
 573                 %(section,))
 574             sys.exit(1)
 575
 576     # right - we've got the directories, we've got the section, we know the
 577     # url... lets play!
 578
 579     parse_and_deliver(maildir, section, state_dir)