rss2maildir.py

   1 #!/usr/bin/python
   2 # coding=utf-8
   3
   4 # rss2maildir.py - RSS feeds to Maildir 1 email per item
   5 # Copyright (C) 2007  Brett Parker <iDunno@sommitrealweird.co.uk>
   6 #
   7 # This program is free software: you can redistribute it and/or modify
   8 # it under the terms of the GNU General Public License as published by
   9 # the Free Software Foundation, either version 3 of the License, or
  10 # (at your option) any later version.
  11 #
  12 # This program is distributed in the hope that it will be useful,
  13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15 # GNU General Public License for more details.
  16 #
  17 # You should have received a copy of the GNU General Public License
  18 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
  19
  20 import sys
  21 import os
  22 import stat
  23 import httplib
  24 import urllib
  25
  26 import feedparser
  27
  28 from email.MIMEMultipart import MIMEMultipart
  29 from email.MIMEText import MIMEText
  30
  31 import datetime
  32 import random
  33 import string
  34 import textwrap
  35
  36 import socket
  37
  38 from optparse import OptionParser
  39 from ConfigParser import SafeConfigParser
  40
  41 from base64 import b64encode
  42 import md5
  43
  44 import cgi
  45 import dbm
  46
  47 from HTMLParser import HTMLParser
  48
  49 entities = {
  50     "amp": "&",
  51     "lt": "<",
  52     "gt": ">",
  53     "pound": "£",
  54     "copy": "©",
  55     "apos": "'",
  56     "quot": "\"",
  57     "nbsp": " ",
  58     }
  59
  60 class HTML2Text(HTMLParser):
  61
  62     def __init__(self):
  63         self.inheadingone = False
  64         self.inheadingtwo = False
  65         self.inotherheading = False
  66         self.inparagraph = True
  67         self.inblockquote = False
  68         self.inlink = False
  69         self.text = u''
  70         self.currentparagraph = u''
  71         self.headingtext = u''
  72         self.blockquote = u''
  73         self.inpre = False
  74         self.inul = False
  75         self.initem = False
  76         self.item = u''
  77         HTMLParser.__init__(self)
  78
  79     def handle_starttag(self, tag, attrs):
  80         if tag.lower() == "h1":
  81             self.inheadingone = True
  82             self.inparagraph = False
  83         elif tag.lower() == "h2":
  84             self.inheadingtwo = True
  85             self.inparagraph = False
  86         elif tag.lower() in ["h3", "h4", "h5", "h6"]:
  87             self.inotherheading = True
  88             self.inparagraph = False
  89         elif tag.lower() == "a":
  90             self.inlink = True
  91         elif tag.lower() == "br":
  92             self.handle_br()
  93         elif tag.lower() == "blockquote":
  94             self.inblockquote = True
  95             self.text = self.text + u'\n'
  96         elif tag.lower() == "p":
  97             if self.text != "":
  98                 self.text = self.text + u'\n\n'
  99             if self.inparagraph:
 100                 self.text = self.text \
 101                     + u'\n'.join(textwrap.wrap(self.currentparagraph, 70))
 102             self.currentparagraph = u''
 103             self.inparagraph = True
 104         elif tag.lower() == "pre":
 105             self.text = self.text + "\n"
 106             self.inpre = True
 107             self.inparagraph = False
 108             self.inblockquote = False
 109         elif tag.lower() == "ul":
 110             self.item = u''
 111             self.inul = True
 112             self.text = self.text + "\n"
 113         elif tag.lower() == "li" and self.inul:
 114             if not self.initem:
 115                 self.initem = True
 116                 self.item = u''
 117             else:
 118                 self.text = self.text \
 119                     + u' * ' \
 120                     + u'\n   '.join([a.strip() for a in \
 121                         textwrap.wrap(self.item, 67)]) \
 122                     + u'\n'
 123                 self.item = u''
 124
 125     def handle_startendtag(self, tag, attrs):
 126         if tag.lower() == "br":
 127             self.handle_br()
 128
 129     def handle_br(self):
 130             if self.inparagraph:
 131                 self.text = self.text \
 132                 + u'\n'.join( \
 133                     [a \
 134                         for a in textwrap.wrap( \
 135                             self.currentparagraph, 70) \
 136                     ] \
 137                 ) \
 138                 + u'\n'
 139                 self.currentparagraph = u''
 140             elif self.inblockquote:
 141                 self.text = self.text \
 142                     + u'\n> ' \
 143                     + u'\n> '.join( \
 144                         [a \
 145                             for a in textwrap.wrap( \
 146                                 self.blockquote.encode("utf-8") \
 147                                 , 68) \
 148                         ] \
 149                     ) \
 150                     + u'\n'
 151                 self.blockquote = u''
 152             else:
 153                 self.text = self.text + "\n"
 154
 155     def handle_endtag(self, tag):
 156         if tag.lower() == "h1":
 157             self.inheadingone = False
 158             self.text = self.text \
 159                 + u'\n\n' \
 160                 + self.headingtext.encode("utf-8") \
 161                 + u'\n' \
 162                 + u'=' * len(self.headingtext.encode("utf-8").strip())
 163             self.headingtext = u''
 164         elif tag.lower() == "h2":
 165             self.inheadingtwo = False
 166             self.text = self.text \
 167                 + u'\n\n' \
 168                 + self.headingtext.encode("utf-8") \
 169                 + u'\n' \
 170                 + u'-' * len(self.headingtext.encode("utf-8").strip())
 171             self.headingtext = u''
 172         elif tag.lower() in ["h3", "h4", "h5", "h6"]:
 173             self.inotherheading = False
 174             self.text = self.text \
 175                 + u'\n\n' \
 176                 + self.headingtext.encode("utf-8") \
 177                 + u'\n' \
 178                 + u'~' * len(self.headingtext.encode("utf-8").strip())
 179             self.headingtext = u''
 180         elif tag.lower() == "p":
 181             self.text = self.text \
 182                 + u'\n'.join(textwrap.wrap( \
 183                     self.currentparagraph, 70) \
 184                 )
 185             self.inparagraph = False
 186             self.currentparagraph = u''
 187         elif tag.lower() == "blockquote":
 188             self.text = self.text \
 189                 + u'\n> ' \
 190                 + u'\n> '.join( \
 191                     [a.strip() \
 192                         for a in textwrap.wrap( \
 193                             self.blockquote, 68)] \
 194                     ) \
 195                 + u'\n'
 196             self.inblockquote = False
 197             self.blockquote = u''
 198         elif tag.lower() == "pre":
 199             self.inpre = False
 200         elif tag.lower() == "li":
 201             self.initem = False
 202             if self.item != "":
 203                 self.text = self.text \
 204                     + u' * ' \
 205                     + u'\n   '.join( \
 206                         [a.strip() for a in textwrap.wrap(self.item, 67)]) \
 207                     + u'\n'
 208             self.item = u''
 209         elif tag.lower() == "ul":
 210             self.inul = False
 211
 212     def handle_data(self, data):
 213         if self.inheadingone or self.inheadingtwo or self.inotherheading:
 214             self.headingtext = self.headingtext \
 215                 + unicode(data, "utf-8").strip() \
 216                 + u' '
 217         elif self.inblockquote:
 218             self.blockquote = self.blockquote \
 219                 + unicode(data, "utf-8").strip() \
 220                 + u' '
 221         elif self.inparagraph:
 222             self.currentparagraph = self.currentparagraph \
 223                 + unicode(data, "utf-8").strip() \
 224                 + u' '
 225         elif self.inul and self.initem:
 226             self.item = self.item + unicode(data, "utf-8")
 227         elif self.inpre:
 228             self.text = self.text + unicode(data, "utf-8")
 229         else:
 230             self.text = self.text + unicode(data, "utf-8").strip() + u' '
 231
 232     def handle_entityref(self, name):
 233         entity = name
 234         if entities.has_key(name.lower()):
 235             entity = entities[name.lower()]
 236         elif name[0] == "#":
 237             entity = unichr(int(name[1:]))
 238         else:
 239             entity = "&" + name + ";"
 240
 241         if self.inparagraph:
 242             self.currentparagraph = self.currentparagraph \
 243                 + unicode(entity, "utf-8")
 244         elif self.inblockquote:
 245             self.blockquote = self.blockquote + unicode(entity, "utf-8")
 246         else:
 247             self.text = self.text + unicode(entity, "utf-8")
 248
 249     def gettext(self):
 250         data = self.text
 251         if self.inparagraph:
 252             data = data + "\n".join(textwrap.wrap(self.currentparagraph, 70))
 253         return data
 254
 255 def open_url(method, url):
 256     redirectcount = 0
 257     while redirectcount < 3:
 258         (type, rest) = urllib.splittype(url)
 259         (host, path) = urllib.splithost(rest)
 260         (host, port) = urllib.splitport(host)
 261         if port == None:
 262             port = 80
 263         try:
 264             conn = httplib.HTTPConnection("%s:%s" %(host, port))
 265             conn.request(method, path)
 266             response = conn.getresponse()
 267             if response.status in [301, 302, 303, 307]:
 268                 headers = response.getheaders()
 269                 for header in headers:
 270                     if header[0] == "location":
 271                         url = header[1]
 272             elif response.status == 200:
 273                 return response
 274         except:
 275             pass
 276         redirectcount = redirectcount + 1
 277     return None
 278
 279 def parse_and_deliver(maildir, url, statedir):
 280     feedhandle = None
 281     headers = None
 282     # first check if we know about this feed already
 283     feeddb = dbm.open(os.path.join(statedir, "feeds"), "c")
 284     if feeddb.has_key(url):
 285         data = feeddb[url]
 286         data = cgi.parse_qs(data)
 287         response = open_url("HEAD", url)
 288         headers = None
 289         if response:
 290             headers = response.getheaders()
 291         ischanged = False
 292         try:
 293             for header in headers:
 294                 if header[0] == "content-length":
 295                     if header[1] != data["content-length"][0]:
 296                         ischanged = True
 297                 elif header[0] == "etag":
 298                     if header[1] != data["etag"][0]:
 299                         ischanged = True
 300                 elif header[0] == "last-modified":
 301                     if header[1] != data["last-modified"][0]:
 302                         ischanged = True
 303                 elif header[0] == "content-md5":
 304                     if header[1] != data["content-md5"][0]:
 305                         ischanged = True
 306         except:
 307             ischanged = True
 308         if ischanged:
 309             response = open_url("GET", url)
 310             if response != None:
 311                 headers = response.getheaders()
 312                 feedhandle = response
 313             else:
 314                 sys.stderr.write("Failed to fetch feed: %s\n" %(url))
 315                 return
 316         else:
 317             return # don't need to do anything, nothings changed.
 318     else:
 319         response = open_url("GET", url)
 320         if response != None:
 321             headers = response.getheaders()
 322             feedhandle = response
 323         else:
 324             sys.stderr.write("Failed to fetch feed: %s\n" %(url))
 325             return
 326
 327     fp = feedparser.parse(feedhandle)
 328     db = dbm.open(os.path.join(statedir, "seen"), "c")
 329     for item in fp["items"]:
 330         # have we seen it before?
 331         # need to work out what the content is first...
 332
 333         if item.has_key("content"):
 334             content = item["content"][0]["value"]
 335         else:
 336             content = item["summary"]
 337
 338         md5sum = md5.md5(content.encode("utf-8")).hexdigest()
 339
 340         prevmessageid = None
 341
 342         # check if there's a guid too - if that exists and we match the md5,
 343         # return
 344         if item.has_key("guid"):
 345             if db.has_key(url + "|" + item["guid"]):
 346                 data = db[url + "|" + item["guid"]]
 347                 data = cgi.parse_qs(data)
 348                 if data["contentmd5"][0] == md5sum:
 349                     continue
 350
 351         if db.has_key(url + "|" + item["link"]):
 352             data = db[url + "|" + item["link"]]
 353             data = cgi.parse_qs(data)
 354             if data.has_key("message-id"):
 355                 prevmessageid = data["message-id"][0]
 356             if data["contentmd5"][0] == md5sum:
 357                 continue
 358
 359         try:
 360             author = item["author"]
 361         except:
 362             author = url
 363
 364         # create a basic email message
 365         msg = MIMEMultipart("alternative")
 366         messageid = "<" \
 367             + datetime.datetime.now().strftime("%Y%m%d%H%M") \
 368             + "." \
 369             + "".join( \
 370                 [random.choice( \
 371                     string.ascii_letters + string.digits \
 372                     ) for a in range(0,6) \
 373                 ]) + "@" + socket.gethostname() + ">"
 374         msg.add_header("Message-ID", messageid)
 375         msg.set_unixfrom("\"%s\" <rss2maildir@localhost>" %(url))
 376         msg.add_header("From", "\"%s\" <rss2maildir@localhost>" %(author))
 377         msg.add_header("To", "\"%s\" <rss2maildir@localhost>" %(url))
 378         if prevmessageid:
 379             msg.add_header("References", prevmessageid)
 380         createddate = datetime.datetime.now() \
 381             .strftime("%a, %e %b %Y %T -0000")
 382         try:
 383             createddate = datetime.datetime(*item["updated_parsed"][0:6]) \
 384                 .strftime("%a, %e %b %Y %T -0000")
 385         except:
 386             pass
 387         msg.add_header("Date", createddate)
 388         msg.add_header("Subject", item["title"])
 389         msg.set_default_type("text/plain")
 390
 391         htmlcontent = content.encode("utf-8")
 392         htmlcontent = "%s\n\n<p>Item URL: <a href='%s'>%s</a></p>" %( \
 393             content, \
 394             item["link"], \
 395             item["link"] )
 396         htmlpart = MIMEText(htmlcontent.encode("utf-8"), "html", "utf-8")
 397         textparser = HTML2Text()
 398         textparser.feed(content.encode("utf-8"))
 399         textcontent = textparser.gettext()
 400         textcontent = "%s\n\nItem URL: %s" %( \
 401             textcontent, \
 402             item["link"] )
 403         textpart = MIMEText(textcontent.encode("utf-8"), "plain", "utf-8")
 404         msg.attach(textpart)
 405         msg.attach(htmlpart)
 406
 407         # start by working out the filename we should be writting to, we do
 408         # this following the normal maildir style rules
 409         fname = str(os.getpid()) \
 410             + "." + socket.gethostname() \
 411             + "." + "".join( \
 412                 [random.choice( \
 413                     string.ascii_letters + string.digits \
 414                     ) for a in range(0,10) \
 415                 ]) + "." \
 416             + datetime.datetime.now().strftime('%s')
 417         fn = os.path.join(maildir, "tmp", fname)
 418         fh = open(fn, "w")
 419         fh.write(msg.as_string())
 420         fh.close()
 421         # now move it in to the new directory
 422         newfn = os.path.join(maildir, "new", fname)
 423         os.link(fn, newfn)
 424         os.unlink(fn)
 425
 426         # now add to the database about the item
 427         if prevmessageid:
 428             messageid = prevmessageid + " " + messageid
 429         if item.has_key("guid") and item["guid"] != item["link"]:
 430             data = urllib.urlencode(( \
 431                 ("message-id", messageid), \
 432                 ("created", createddate), \
 433                 ("contentmd5", md5sum) \
 434                 ))
 435             db[url + "|" + item["guid"]] = data
 436             try:
 437                 data = db[url + "|" + item["link"]]
 438                 data = cgi.parse_qs(data)
 439                 newdata = urllib.urlencode(( \
 440                     ("message-id", messageid), \
 441                     ("created", data["created"][0]), \
 442                     ("contentmd5", data["contentmd5"][0]) \
 443                     ))
 444                 db[url + "|" + item["link"]] = newdata
 445             except:
 446                 db[url + "|" + item["link"]] = data
 447         else:
 448             data = urllib.urlencode(( \
 449                 ("message-id", messageid), \
 450                 ("created", createddate), \
 451                 ("contentmd5", md5sum) \
 452                 ))
 453             db[url + "|" + item["link"]] = data
 454
 455     if headers:
 456         data = []
 457         for header in headers:
 458             if header[0] in ["content-md5", "etag", "last-modified", "content-length"]:
 459                 data.append((header[0], header[1]))
 460         if len(data) > 0:
 461             data = urllib.urlencode(data)
 462             feeddb[url] = data
 463
 464     db.close()
 465     feeddb.close()
 466
 467 # first off, parse the command line arguments
 468
 469 oparser = OptionParser()
 470 oparser.add_option(
 471     "-c", "--conf", dest="conf",
 472     help="location of config file"
 473     )
 474 oparser.add_option(
 475     "-s", "--statedir", dest="statedir",
 476     help="location of directory to store state in"
 477     )
 478
 479 (options, args) = oparser.parse_args()
 480
 481 # check for the configfile
 482
 483 configfile = None
 484
 485 if options.conf != None:
 486     # does the file exist?
 487     try:
 488         os.stat(options.conf)
 489         configfile = options.conf
 490     except:
 491         # should exit here as the specified file doesn't exist
 492         sys.stderr.write( \
 493             "Config file %s does not exist. Exiting.\n" %(options.conf,))
 494         sys.exit(2)
 495 else:
 496     # check through the default locations
 497     try:
 498         os.stat("%s/.rss2maildir.conf" %(os.environ["HOME"],))
 499         configfile = "%s/.rss2maildir.conf" %(os.environ["HOME"],)
 500     except:
 501         try:
 502             os.stat("/etc/rss2maildir.conf")
 503             configfile = "/etc/rss2maildir.conf"
 504         except:
 505             sys.stderr.write("No config file found. Exiting.\n")
 506             sys.exit(2)
 507
 508 # Right - if we've got this far, we've got a config file, now for the hard
 509 # bits...
 510
 511 scp = SafeConfigParser()
 512 scp.read(configfile)
 513
 514 maildir_root = "RSSMaildir"
 515 state_dir = "state"
 516
 517 if options.statedir != None:
 518     state_dir = options.statedir
 519     try:
 520         mode = os.stat(state_dir)[stat.ST_MODE]
 521         if not stat.S_ISDIR(mode):
 522             sys.stderr.write( \
 523                 "State directory (%s) is not a directory\n" %(state_dir))
 524             sys.exit(1)
 525     except:
 526         # try to make the directory
 527         try:
 528             os.mkdir(state_dir)
 529         except:
 530             sys.stderr.write("Couldn't create statedir %s" %(state_dir))
 531             sys.exit(1)
 532 elif scp.has_option("general", "state_dir"):
 533     new_state_dir = scp.get("general", "state_dir")
 534     try:
 535         mode = os.stat(state_dir)[stat.ST_MODE]
 536         if not stat.S_ISDIR(mode):
 537             sys.stderr.write( \
 538                 "State directory (%s) is not a directory\n" %(state_dir))
 539             sys.exit(1)
 540     except:
 541         # try to create it
 542         try:
 543             os.mkdir(new_state_dir)
 544             state_dir = new_state_dir
 545         except:
 546             sys.stderr.write( \
 547                 "Couldn't create state directory %s\n" %(new_state_dir))
 548             sys.exit(1)
 549 else:
 550     try:
 551         mode = os.stat(state_dir)[stat.ST_MODE]
 552         if not stat.S_ISDIR(mode):
 553             sys.stderr.write( \
 554                 "State directory %s is not a directory\n" %(state_dir))
 555             sys.exit(1)
 556     except:
 557         try:
 558             os.mkdir(state_dir)
 559         except:
 560             sys.stderr.write( \
 561                 "State directory %s could not be created\n" %(state_dir))
 562             sys.exit(1)
 563
 564 if scp.has_option("general", "maildir_root"):
 565     maildir_root = scp.get("general", "maildir_root")
 566
 567 try:
 568     mode = os.stat(maildir_root)[stat.ST_MODE]
 569     if not stat.S_ISDIR(mode):
 570         sys.stderr.write( \
 571             "Maildir Root %s is not a directory\n" \
 572             %(maildir_root))
 573         sys.exit(1)
 574 except:
 575     try:
 576         os.mkdir(maildir_root)
 577     except:
 578         sys.stderr.write("Couldn't create Maildir Root %s\n" %(maildir_root))
 579         sys.exit(1)
 580
 581 feeds = scp.sections()
 582 try:
 583     feeds.remove("general")
 584 except:
 585     pass
 586
 587 for section in feeds:
 588     # check if the directory exists
 589     maildir = None
 590     try:
 591         maildir = scp.get(section, "maildir")
 592     except:
 593         maildir = section
 594
 595     maildir = urllib.urlencode(((section, maildir),)).split("=")[1]
 596     maildir = os.path.join(maildir_root, maildir)
 597
 598     try:
 599         exists = os.stat(maildir)
 600         if stat.S_ISDIR(exists[stat.ST_MODE]):
 601             # check if there's a new, cur and tmp directory
 602             try:
 603                 mode = os.stat(os.path.join(maildir, "cur"))[stat.ST_MODE]
 604             except:
 605                 os.mkdir(os.path.join(maildir, "cur"))
 606                 if not stat.S_ISDIR(mode):
 607                     sys.stderr.write("Broken maildir: %s\n" %(maildir))
 608             try:
 609                 mode = os.stat(os.path.join(maildir, "tmp"))[stat.ST_MODE]
 610             except:
 611                 os.mkdir(os.path.join(maildir, "tmp"))
 612                 if not stat.S_ISDIR(mode):
 613                     sys.stderr.write("Broken maildir: %s\n" %(maildir))
 614             try:
 615                 mode = os.stat(os.path.join(maildir, "new"))[stat.ST_MODE]
 616                 if not stat.S_ISDIR(mode):
 617                     sys.stderr.write("Broken maildir: %s\n" %(maildir))
 618             except:
 619                 os.mkdir(os.path.join(maildir, "new"))
 620         else:
 621             sys.stderr.write("Broken maildir: %s\n" %(maildir))
 622     except:
 623         try:
 624             os.mkdir(maildir)
 625         except:
 626             sys.stderr.write("Couldn't create root maildir %s\n" %(maildir))
 627             sys.exit(1)
 628         try:
 629             os.mkdir(os.path.join(maildir, "new"))
 630             os.mkdir(os.path.join(maildir, "cur"))
 631             os.mkdir(os.path.join(maildir, "tmp"))
 632         except:
 633             sys.stderr.write( \
 634                 "Couldn't create required maildir directories for %s\n" \
 635                 %(section,))
 636             sys.exit(1)
 637
 638     # right - we've got the directories, we've got the section, we know the
 639     # url... lets play!
 640
 641     parse_and_deliver(maildir, section, state_dir)