rss2maildir.py

   1 #!/usr/bin/python
   2 # coding=utf-8
   3
   4 # rss2maildir.py - RSS feeds to Maildir 1 email per item
   5 # Copyright (C) 2007  Brett Parker <iDunno@sommitrealweird.co.uk>
   6 #
   7 # This program is free software: you can redistribute it and/or modify
   8 # it under the terms of the GNU General Public License as published by
   9 # the Free Software Foundation, either version 3 of the License, or
  10 # (at your option) any later version.
  11 #
  12 # This program is distributed in the hope that it will be useful,
  13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15 # GNU General Public License for more details.
  16 #
  17 # You should have received a copy of the GNU General Public License
  18 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
  19
  20 import sys
  21 import os
  22 import stat
  23 import httplib
  24 import urllib
  25
  26 import feedparser
  27
  28 from email.MIMEMultipart import MIMEMultipart
  29 from email.MIMEText import MIMEText
  30
  31 import datetime
  32 import random
  33 import string
  34 import textwrap
  35
  36 import socket
  37
  38 from optparse import OptionParser
  39 from ConfigParser import SafeConfigParser
  40
  41 from base64 import b64encode
  42 import md5
  43
  44 import cgi
  45 import dbm
  46
  47 from HTMLParser import HTMLParser
  48
  49 entities = {
  50     "amp": "&",
  51     "lt": "<",
  52     "gt": ">",
  53     "pound": "£",
  54     "copy": "©",
  55     "apos": "'",
  56     "quot": "\"",
  57     "nbsp": " ",
  58     }
  59
  60 class HTML2Text(HTMLParser):
  61
  62     def __init__(self):
  63         self.inheadingone = False
  64         self.inheadingtwo = False
  65         self.inotherheading = False
  66         self.inparagraph = True
  67         self.inblockquote = False
  68         self.inlink = False
  69         self.text = u''
  70         self.currentparagraph = u''
  71         self.headingtext = u''
  72         self.blockquote = u''
  73         self.inpre = False
  74         self.inul = False
  75         self.initem = False
  76         self.item = u''
  77         HTMLParser.__init__(self)
  78
  79     def handle_starttag(self, tag, attrs):
  80         if tag.lower() == "h1":
  81             self.inheadingone = True
  82             self.inparagraph = False
  83         elif tag.lower() == "h2":
  84             self.inheadingtwo = True
  85             self.inparagraph = False
  86         elif tag.lower() in ["h3", "h4", "h5", "h6"]:
  87             self.inotherheading = True
  88             self.inparagraph = False
  89         elif tag.lower() == "a":
  90             self.inlink = True
  91         elif tag.lower() == "br":
  92             self.handle_br()
  93         elif tag.lower() == "blockquote":
  94             self.inblockquote = True
  95             self.text = self.text + u'\n'
  96         elif tag.lower() == "p":
  97             if self.text != "":
  98                 self.text = self.text + u'\n\n'
  99             if self.inparagraph:
 100                 self.text = self.text \
 101                     + u'\n'.join(textwrap.wrap(self.currentparagraph, 70))
 102             self.currentparagraph = u''
 103             self.inparagraph = True
 104         elif tag.lower() == "pre":
 105             self.text = self.text + "\n"
 106             self.inpre = True
 107             self.inparagraph = False
 108             self.inblockquote = False
 109         elif tag.lower() == "ul":
 110             self.item = u''
 111             self.inul = True
 112             self.text = self.text + "\n"
 113         elif tag.lower() == "li" and self.inul:
 114             if not self.initem:
 115                 self.initem = True
 116                 self.item = u''
 117             else:
 118                 self.text = self.text \
 119                     + u' * ' \
 120                     + u'\n   '.join([a.strip() for a in \
 121                         textwrap.wrap(self.item, 67)]) \
 122                     + u'\n'
 123                 self.item = u''
 124
 125     def handle_startendtag(self, tag, attrs):
 126         if tag.lower() == "br":
 127             self.handle_br()
 128
 129     def handle_br(self):
 130             if self.inparagraph:
 131                 self.text = self.text \
 132                 + u'\n'.join( \
 133                     [a \
 134                         for a in textwrap.wrap( \
 135                             self.currentparagraph, 70) \
 136                     ] \
 137                 ) \
 138                 + u'\n'
 139                 self.currentparagraph = u''
 140             elif self.inblockquote:
 141                 self.text = self.text \
 142                     + u'\n> ' \
 143                     + u'\n> '.join( \
 144                         [a \
 145                             for a in textwrap.wrap( \
 146                                 self.blockquote.encode("utf-8") \
 147                                 , 68) \
 148                         ] \
 149                     ) \
 150                     + u'\n'
 151                 self.blockquote = u''
 152             else:
 153                 self.text = self.text + "\n"
 154
 155     def handle_endtag(self, tag):
 156         if tag.lower() == "h1":
 157             self.inheadingone = False
 158             self.text = self.text \
 159                 + u'\n\n' \
 160                 + self.headingtext.encode("utf-8") \
 161                 + u'\n' \
 162                 + u'=' * len(self.headingtext.encode("utf-8").strip())
 163             self.headingtext = u''
 164         elif tag.lower() == "h2":
 165             self.inheadingtwo = False
 166             self.text = self.text \
 167                 + u'\n\n' \
 168                 + self.headingtext.encode("utf-8") \
 169                 + u'\n' \
 170                 + u'-' * len(self.headingtext.encode("utf-8").strip())
 171             self.headingtext = u''
 172         elif tag.lower() in ["h3", "h4", "h5", "h6"]:
 173             self.inotherheading = False
 174             self.text = self.text \
 175                 + u'\n\n' \
 176                 + self.headingtext.encode("utf-8") \
 177                 + u'\n' \
 178                 + u'~' * len(self.headingtext.encode("utf-8").strip())
 179             self.headingtext = u''
 180         elif tag.lower() == "p":
 181             self.text = self.text \
 182                 + u'\n'.join(textwrap.wrap( \
 183                     self.currentparagraph, 70) \
 184                 )
 185             self.inparagraph = False
 186             self.currentparagraph = u''
 187         elif tag.lower() == "blockquote":
 188             self.text = self.text \
 189                 + u'\n> ' \
 190                 + u'\n> '.join( \
 191                     [a.strip() \
 192                         for a in textwrap.wrap( \
 193                             self.blockquote, 68)] \
 194                     ) \
 195                 + u'\n'
 196             self.inblockquote = False
 197             self.blockquote = u''
 198         elif tag.lower() == "pre":
 199             self.inpre = False
 200         elif tag.lower() == "li":
 201             self.initem = False
 202             if self.item != "":
 203                 self.text = self.text \
 204                     + u' * ' \
 205                     + u'\n   '.join( \
 206                         [a.strip() for a in textwrap.wrap(self.item, 67)]) \
 207                     + u'\n'
 208             self.item = u''
 209         elif tag.lower() == "ul":
 210             self.inul = False
 211
 212     def handle_data(self, data):
 213         if self.inheadingone or self.inheadingtwo or self.inotherheading:
 214             self.headingtext = self.headingtext \
 215                 + unicode(data, "utf-8").strip() \
 216                 + u' '
 217         elif self.inblockquote:
 218             self.blockquote = self.blockquote \
 219                 + unicode(data, "utf-8").strip() \
 220                 + u' '
 221         elif self.inparagraph:
 222             self.currentparagraph = self.currentparagraph \
 223                 + unicode(data, "utf-8").strip() \
 224                 + u' '
 225         elif self.inul and self.initem:
 226             self.item = self.item + unicode(data, "utf-8")
 227         elif self.inpre:
 228             self.text = self.text + unicode(data, "utf-8")
 229         else:
 230             self.text = self.text + unicode(data, "utf-8").strip() + u' '
 231
 232     def handle_entityref(self, name):
 233         entity = name
 234         if entities.has_key(name.lower()):
 235             entity = entities[name.lower()]
 236         elif name[0] == "#":
 237             entity = unichr(int(name[1:]))
 238         else:
 239             entity = "&" + name + ";"
 240
 241         if self.inparagraph:
 242             self.currentparagraph = self.currentparagraph \
 243                 + unicode(entity, "utf-8")
 244         elif self.inblockquote:
 245             self.blockquote = self.blockquote + unicode(entity, "utf-8")
 246         else:
 247             self.text = self.text + unicode(entity, "utf-8")
 248
 249     def gettext(self):
 250         data = self.text
 251         if self.inparagraph:
 252             data = data + "\n".join(textwrap.wrap(self.currentparagraph, 70))
 253         return data
 254
 255 def parse_and_deliver(maildir, url, statedir):
 256     feedhandle = None
 257     headers = None
 258     # first check if we know about this feed already
 259     feeddb = dbm.open(os.path.join(statedir, "feeds"), "c")
 260     # we need all the parts of the url
 261     (type, rest) = urllib.splittype(url)
 262     (host, path) = urllib.splithost(rest)
 263     (host, port) = urllib.splitport(host)
 264     if port == None:
 265         port = 80
 266     if feeddb.has_key(url):
 267         data = feeddb[url]
 268         data = cgi.parse_qs(data)
 269         # now do a head on the feed to see if it's been updated
 270         conn = httplib.HTTPConnection("%s:%s" %(host, port))
 271         conn.request("HEAD", path)
 272         response = conn.getresponse()
 273         headers = response.getheaders()
 274         ischanged = False
 275         try:
 276             for header in headers:
 277                 if header[0] == "content-length":
 278                     if header[1] != data["content-length"][0]:
 279                         ischanged = True
 280                 elif header[0] == "etag":
 281                     if header[1] != data["etag"][0]:
 282                         ischanged = True
 283                 elif header[0] == "last-modified":
 284                     if header[1] != data["last-modified"][0]:
 285                         ischanged = True
 286                 elif header[0] == "content-md5":
 287                     if header[1] != data["content-md5"][0]:
 288                         ischanged = True
 289         except:
 290             ischanged = True
 291         if ischanged:
 292             conn = httplib.HTTPConnection("%s:%s" %(host, port))
 293             conn.request("GET", path)
 294             response = conn.getresponse()
 295             headers = response.getheaders()
 296             feedhandle = response
 297         else:
 298             return # don't need to do anything, nothings changed.
 299     else:
 300         conn = httplib.HTTPConnection("%s:%s" %(host, port))
 301         conn.request("GET", path)
 302         response = None
 303         try:
 304             response = conn.getresponse()
 305         except:
 306             print "Failed to fetch feed: %s" %(url)
 307             return
 308         headers = response.getheaders()
 309         feedhandle = response
 310
 311     fp = feedparser.parse(feedhandle)
 312     db = dbm.open(os.path.join(statedir, "seen"), "c")
 313     for item in fp["items"]:
 314         # have we seen it before?
 315         # need to work out what the content is first...
 316
 317         if item.has_key("content"):
 318             content = item["content"][0]["value"]
 319         else:
 320             content = item["summary"]
 321
 322         md5sum = md5.md5(content.encode("utf-8")).hexdigest()
 323
 324         prevmessageid = None
 325
 326         if db.has_key(url + "|" + item["link"]):
 327             data = db[url + "|" + item["link"]]
 328             data = cgi.parse_qs(data)
 329             if data.has_key("message-id"):
 330                 prevmessageid = data["message-id"][0]
 331             if data["contentmd5"][0] == md5sum:
 332                 continue
 333
 334         try:
 335             author = item["author"]
 336         except:
 337             author = url
 338
 339         # create a basic email message
 340         msg = MIMEMultipart("alternative")
 341         messageid = "<" \
 342             + datetime.datetime.now().strftime("%Y%m%d%H%M") \
 343             + "." \
 344             + "".join( \
 345                 [random.choice( \
 346                     string.ascii_letters + string.digits \
 347                     ) for a in range(0,6) \
 348                 ]) + "@" + socket.gethostname() + ">"
 349         msg.add_header("Message-ID", messageid)
 350         msg.set_unixfrom("\"%s\" <rss2maildir@localhost>" %(url))
 351         msg.add_header("From", "\"%s\" <rss2maildir@localhost>" %(author))
 352         msg.add_header("To", "\"%s\" <rss2maildir@localhost>" %(url))
 353         if prevmessageid:
 354             msg.add_header("References", prevmessageid)
 355         createddate = datetime.datetime.now() \
 356             .strftime("%a, %e %b %Y %T -0000")
 357         try:
 358             createddate = datetime.datetime(*item["updated_parsed"][0:6]) \
 359                 .strftime("%a, %e %b %Y %T -0000")
 360         except:
 361             pass
 362         msg.add_header("Date", createddate)
 363         msg.add_header("Subject", item["title"])
 364         msg.set_default_type("text/plain")
 365
 366         htmlpart = MIMEText(content.encode("utf-8"), "html", "utf-8")
 367         textparser = HTML2Text()
 368         textparser.feed(content.encode("utf-8"))
 369         textcontent = textparser.gettext()
 370         textpart = MIMEText(textcontent.encode("utf-8"), "plain", "utf-8")
 371         msg.attach(textpart)
 372         msg.attach(htmlpart)
 373
 374         # start by working out the filename we should be writting to, we do
 375         # this following the normal maildir style rules
 376         fname = str(os.getpid()) \
 377             + "." + socket.gethostname() \
 378             + "." + "".join( \
 379                 [random.choice( \
 380                     string.ascii_letters + string.digits \
 381                     ) for a in range(0,10) \
 382                 ]) + "." \
 383             + datetime.datetime.now().strftime('%s')
 384         fn = os.path.join(maildir, "tmp", fname)
 385         fh = open(fn, "w")
 386         fh.write(msg.as_string())
 387         fh.close()
 388         # now move it in to the new directory
 389         newfn = os.path.join(maildir, "new", fname)
 390         os.link(fn, newfn)
 391         os.unlink(fn)
 392
 393         # now add to the database about the item
 394         if prevmessageid:
 395             messageid = prevmessageid + " " + messageid
 396         data = urllib.urlencode((
 397             ("message-id", messageid), \
 398             ("created", createddate), \
 399             ("contentmd5", md5sum) \
 400             ))
 401         db[url + "|" + item["link"]] = data
 402
 403     if headers:
 404         data = []
 405         for header in headers:
 406             if header[0] in ["content-md5", "etag", "last-modified", "content-length"]:
 407                 data.append((header[0], header[1]))
 408         if len(data) > 0:
 409             data = urllib.urlencode(data)
 410             feeddb[url] = data
 411
 412     db.close()
 413     feeddb.close()
 414
 415 # first off, parse the command line arguments
 416
 417 oparser = OptionParser()
 418 oparser.add_option(
 419     "-c", "--conf", dest="conf",
 420     help="location of config file"
 421     )
 422 oparser.add_option(
 423     "-s", "--statedir", dest="statedir",
 424     help="location of directory to store state in"
 425     )
 426
 427 (options, args) = oparser.parse_args()
 428
 429 # check for the configfile
 430
 431 configfile = None
 432
 433 if options.conf != None:
 434     # does the file exist?
 435     try:
 436         os.stat(options.conf)
 437         configfile = options.conf
 438     except:
 439         # should exit here as the specified file doesn't exist
 440         sys.stderr.write( \
 441             "Config file %s does not exist. Exiting.\n" %(options.conf,))
 442         sys.exit(2)
 443 else:
 444     # check through the default locations
 445     try:
 446         os.stat("%s/.rss2maildir.conf" %(os.environ["HOME"],))
 447         configfile = "%s/.rss2maildir.conf" %(os.environ["HOME"],)
 448     except:
 449         try:
 450             os.stat("/etc/rss2maildir.conf")
 451             configfile = "/etc/rss2maildir.conf"
 452         except:
 453             sys.stderr.write("No config file found. Exiting.\n")
 454             sys.exit(2)
 455
 456 # Right - if we've got this far, we've got a config file, now for the hard
 457 # bits...
 458
 459 scp = SafeConfigParser()
 460 scp.read(configfile)
 461
 462 maildir_root = "RSSMaildir"
 463 state_dir = "state"
 464
 465 if options.statedir != None:
 466     state_dir = options.statedir
 467     try:
 468         mode = os.stat(state_dir)[stat.ST_MODE]
 469         if not stat.S_ISDIR(mode):
 470             sys.stderr.write( \
 471                 "State directory (%s) is not a directory\n" %(state_dir))
 472             sys.exit(1)
 473     except:
 474         # try to make the directory
 475         try:
 476             os.mkdir(state_dir)
 477         except:
 478             sys.stderr.write("Couldn't create statedir %s" %(state_dir))
 479             sys.exit(1)
 480 elif scp.has_option("general", "state_dir"):
 481     new_state_dir = scp.get("general", "state_dir")
 482     try:
 483         mode = os.stat(state_dir)[stat.ST_MODE]
 484         if not stat.S_ISDIR(mode):
 485             sys.stderr.write( \
 486                 "State directory (%s) is not a directory\n" %(state_dir))
 487             sys.exit(1)
 488     except:
 489         # try to create it
 490         try:
 491             os.mkdir(new_state_dir)
 492             state_dir = new_state_dir
 493         except:
 494             sys.stderr.write( \
 495                 "Couldn't create state directory %s\n" %(new_state_dir))
 496             sys.exit(1)
 497 else:
 498     try:
 499         mode = os.stat(state_dir)[stat.ST_MODE]
 500         if not stat.S_ISDIR(mode):
 501             sys.stderr.write( \
 502                 "State directory %s is not a directory\n" %(state_dir))
 503             sys.exit(1)
 504     except:
 505         try:
 506             os.mkdir(state_dir)
 507         except:
 508             sys.stderr.write( \
 509                 "State directory %s could not be created\n" %(state_dir))
 510             sys.exit(1)
 511
 512 if scp.has_option("general", "maildir_root"):
 513     maildir_root = scp.get("general", "maildir_root")
 514
 515 try:
 516     mode = os.stat(maildir_root)[stat.ST_MODE]
 517     if not stat.S_ISDIR(mode):
 518         sys.stderr.write( \
 519             "Maildir Root %s is not a directory\n" \
 520             %(maildir_root))
 521         sys.exit(1)
 522 except:
 523     try:
 524         os.mkdir(maildir_root)
 525     except:
 526         sys.stderr.write("Couldn't create Maildir Root %s\n" %(maildir_root))
 527         sys.exit(1)
 528
 529 feeds = scp.sections()
 530 try:
 531     feeds.remove("general")
 532 except:
 533     pass
 534
 535 for section in feeds:
 536     # check if the directory exists
 537     maildir = None
 538     try:
 539         maildir = scp.get(section, "maildir")
 540     except:
 541         maildir = section
 542
 543     maildir = urllib.urlencode(((section, maildir),)).split("=")[1]
 544     maildir = os.path.join(maildir_root, maildir)
 545
 546     try:
 547         exists = os.stat(maildir)
 548         if stat.S_ISDIR(exists[stat.ST_MODE]):
 549             # check if there's a new, cur and tmp directory
 550             try:
 551                 mode = os.stat(os.path.join(maildir, "cur"))[stat.ST_MODE]
 552             except:
 553                 os.mkdir(os.path.join(maildir, "cur"))
 554                 if not stat.S_ISDIR(mode):
 555                     sys.stderr.write("Broken maildir: %s\n" %(maildir))
 556             try:
 557                 mode = os.stat(os.path.join(maildir, "tmp"))[stat.ST_MODE]
 558             except:
 559                 os.mkdir(os.path.join(maildir, "tmp"))
 560                 if not stat.S_ISDIR(mode):
 561                     sys.stderr.write("Broken maildir: %s\n" %(maildir))
 562             try:
 563                 mode = os.stat(os.path.join(maildir, "new"))[stat.ST_MODE]
 564                 if not stat.S_ISDIR(mode):
 565                     sys.stderr.write("Broken maildir: %s\n" %(maildir))
 566             except:
 567                 os.mkdir(os.path.join(maildir, "new"))
 568         else:
 569             sys.stderr.write("Broken maildir: %s\n" %(maildir))
 570     except:
 571         try:
 572             os.mkdir(maildir)
 573         except:
 574             sys.stderr.write("Couldn't create root maildir %s\n" %(maildir))
 575             sys.exit(1)
 576         try:
 577             os.mkdir(os.path.join(maildir, "new"))
 578             os.mkdir(os.path.join(maildir, "cur"))
 579             os.mkdir(os.path.join(maildir, "tmp"))
 580         except:
 581             sys.stderr.write( \
 582                 "Couldn't create required maildir directories for %s\n" \
 583                 %(section,))
 584             sys.exit(1)
 585
 586     # right - we've got the directories, we've got the section, we know the
 587     # url... lets play!
 588
 589     parse_and_deliver(maildir, section, state_dir)