rss2maildir.py

   1 #!/usr/bin/python
   2 # coding=utf-8
   3
   4 # rss2maildir.py - RSS feeds to Maildir 1 email per item
   5 # Copyright (C) 2007  Brett Parker <iDunno@sommitrealweird.co.uk>
   6 #
   7 # This program is free software: you can redistribute it and/or modify
   8 # it under the terms of the GNU General Public License as published by
   9 # the Free Software Foundation, either version 3 of the License, or
  10 # (at your option) any later version.
  11 #
  12 # This program is distributed in the hope that it will be useful,
  13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15 # GNU General Public License for more details.
  16 #
  17 # You should have received a copy of the GNU General Public License
  18 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
  19
  20 import sys
  21 import os
  22 import stat
  23 import httplib
  24 import urllib
  25
  26 import feedparser
  27
  28 from email.MIMEMultipart import MIMEMultipart
  29 from email.MIMEText import MIMEText
  30
  31 import datetime
  32 import random
  33 import string
  34 import textwrap
  35
  36 import socket
  37
  38 from optparse import OptionParser
  39 from ConfigParser import SafeConfigParser
  40
  41 from base64 import b64encode
  42 import md5
  43
  44 import cgi
  45 import dbm
  46
  47 from HTMLParser import HTMLParser
  48
  49 class HTML2Text(HTMLParser):
  50     entities = {
  51         "amp": "&",
  52         "lt": "<",
  53         "gt": ">",
  54         "pound": "£",
  55         "copy": "©",
  56         "apos": "'",
  57         "quot": "\"",
  58         "nbsp": " ",
  59         }
  60
  61     def __init__(self):
  62         self.inheadingone = False
  63         self.inheadingtwo = False
  64         self.inotherheading = False
  65         self.inparagraph = True
  66         self.inblockquote = False
  67         self.inlink = False
  68         self.text = u''
  69         self.currentparagraph = u''
  70         self.headingtext = u''
  71         self.blockquote = u''
  72         self.inpre = False
  73         self.inul = False
  74         self.initem = False
  75         self.item = u''
  76         HTMLParser.__init__(self)
  77
  78     def handle_starttag(self, tag, attrs):
  79         if tag.lower() == "h1":
  80             self.inheadingone = True
  81             self.inparagraph = False
  82         elif tag.lower() == "h2":
  83             self.inheadingtwo = True
  84             self.inparagraph = False
  85         elif tag.lower() in ["h3", "h4", "h5", "h6"]:
  86             self.inotherheading = True
  87             self.inparagraph = False
  88         elif tag.lower() == "a":
  89             self.inlink = True
  90         elif tag.lower() == "br":
  91             self.handle_br()
  92         elif tag.lower() == "blockquote":
  93             self.inblockquote = True
  94             self.text = self.text + u'\n'
  95         elif tag.lower() == "p":
  96             if self.text != "":
  97                 self.text = self.text + u'\n\n'
  98             if self.inparagraph:
  99                 self.text = self.text \
 100                     + u'\n'.join(textwrap.wrap(self.currentparagraph, 70))
 101             self.currentparagraph = u''
 102             self.inparagraph = True
 103         elif tag.lower() == "pre":
 104             self.text = self.text + "\n"
 105             self.inpre = True
 106             self.inparagraph = False
 107             self.inblockquote = False
 108         elif tag.lower() == "ul":
 109             self.item = u''
 110             self.inul = True
 111             self.text = self.text + "\n"
 112         elif tag.lower() == "li" and self.inul:
 113             if not self.initem:
 114                 self.initem = True
 115                 self.item = u''
 116             else:
 117                 self.text = self.text \
 118                     + u' * ' \
 119                     + u'\n   '.join([a.strip() for a in \
 120                         textwrap.wrap(self.item, 67)]) \
 121                     + u'\n'
 122                 self.item = u''
 123
 124     def handle_startendtag(self, tag, attrs):
 125         if tag.lower() == "br":
 126             self.handle_br()
 127
 128     def handle_br(self):
 129             if self.inparagraph:
 130                 self.text = self.text \
 131                 + u'\n'.join( \
 132                     [a \
 133                         for a in textwrap.wrap( \
 134                             self.currentparagraph, 70) \
 135                     ] \
 136                 ) \
 137                 + u'\n'
 138                 self.currentparagraph = u''
 139             elif self.inblockquote:
 140                 self.text = self.text \
 141                     + u'\n> ' \
 142                     + u'\n> '.join( \
 143                         [a \
 144                             for a in textwrap.wrap( \
 145                                 self.blockquote.encode("utf-8") \
 146                                 , 68) \
 147                         ] \
 148                     ) \
 149                     + u'\n'
 150                 self.blockquote = u''
 151             else:
 152                 self.text = self.text + "\n"
 153
 154     def handle_endtag(self, tag):
 155         if tag.lower() == "h1":
 156             self.inheadingone = False
 157             self.text = self.text \
 158                 + u'\n\n' \
 159                 + self.headingtext.encode("utf-8") \
 160                 + u'\n' \
 161                 + u'=' * len(self.headingtext.encode("utf-8").strip())
 162             self.headingtext = u''
 163         elif tag.lower() == "h2":
 164             self.inheadingtwo = False
 165             self.text = self.text \
 166                 + u'\n\n' \
 167                 + self.headingtext.encode("utf-8") \
 168                 + u'\n' \
 169                 + u'-' * len(self.headingtext.encode("utf-8").strip())
 170             self.headingtext = u''
 171         elif tag.lower() in ["h3", "h4", "h5", "h6"]:
 172             self.inotherheading = False
 173             self.text = self.text \
 174                 + u'\n\n' \
 175                 + self.headingtext.encode("utf-8") \
 176                 + u'\n' \
 177                 + u'~' * len(self.headingtext.encode("utf-8").strip())
 178             self.headingtext = u''
 179         elif tag.lower() == "p":
 180             self.text = self.text \
 181                 + u'\n'.join(textwrap.wrap( \
 182                     self.currentparagraph, 70) \
 183                 )
 184             self.inparagraph = False
 185             self.currentparagraph = u''
 186         elif tag.lower() == "blockquote":
 187             self.text = self.text \
 188                 + u'\n> ' \
 189                 + u'\n> '.join( \
 190                     [a.strip() \
 191                         for a in textwrap.wrap( \
 192                             self.blockquote, 68)] \
 193                     ) \
 194                 + u'\n'
 195             self.inblockquote = False
 196             self.blockquote = u''
 197         elif tag.lower() == "pre":
 198             self.inpre = False
 199         elif tag.lower() == "li":
 200             self.initem = False
 201             if self.item != "":
 202                 self.text = self.text \
 203                     + u' * ' \
 204                     + u'\n   '.join( \
 205                         [a.strip() for a in textwrap.wrap(self.item, 67)]) \
 206                     + u'\n'
 207             self.item = u''
 208         elif tag.lower() == "ul":
 209             self.inul = False
 210
 211     def handle_data(self, data):
 212         if self.inheadingone or self.inheadingtwo or self.inotherheading:
 213             self.headingtext = self.headingtext \
 214                 + unicode(data, "utf-8").strip() \
 215                 + u' '
 216         elif self.inblockquote:
 217             self.blockquote = self.blockquote \
 218                 + unicode(data, "utf-8").strip() \
 219                 + u' '
 220         elif self.inparagraph:
 221             self.currentparagraph = self.currentparagraph \
 222                 + unicode(data, "utf-8").strip() \
 223                 + u' '
 224         elif self.inul and self.initem:
 225             self.item = self.item + unicode(data, "utf-8")
 226         elif self.inpre:
 227             self.text = self.text + unicode(data, "utf-8")
 228         else:
 229             self.text = self.text + unicode(data, "utf-8").strip() + u' '
 230
 231     def handle_entityref(self, name):
 232         entity = name
 233         if HTML2Text.entities.has_key(name.lower()):
 234             entity = HTML2Text.entities[name.lower()]
 235         elif name[0] == "#":
 236             entity = unichr(int(name[1:]))
 237         else:
 238             entity = "&" + name + ";"
 239
 240         if self.inparagraph:
 241             self.currentparagraph = self.currentparagraph \
 242                 + unicode(entity, "utf-8")
 243         elif self.inblockquote:
 244             self.blockquote = self.blockquote + unicode(entity, "utf-8")
 245         else:
 246             self.text = self.text + unicode(entity, "utf-8")
 247
 248     def gettext(self):
 249         data = self.text
 250         if self.inparagraph:
 251             data = data + "\n".join(textwrap.wrap(self.currentparagraph, 70))
 252         if data[-1] != '\n':
 253             data = data + '\n'
 254         return data
 255
 256 def open_url(method, url):
 257     redirectcount = 0
 258     while redirectcount < 3:
 259         (type, rest) = urllib.splittype(url)
 260         (host, path) = urllib.splithost(rest)
 261         (host, port) = urllib.splitport(host)
 262         if port == None:
 263             port = 80
 264         try:
 265             conn = httplib.HTTPConnection("%s:%s" %(host, port))
 266             conn.request(method, path)
 267             response = conn.getresponse()
 268             if response.status in [301, 302, 303, 307]:
 269                 headers = response.getheaders()
 270                 for header in headers:
 271                     if header[0] == "location":
 272                         url = header[1]
 273             elif response.status == 200:
 274                 return response
 275         except:
 276             pass
 277         redirectcount = redirectcount + 1
 278     return None
 279
 280 def parse_and_deliver(maildir, url, statedir):
 281     feedhandle = None
 282     headers = None
 283     # first check if we know about this feed already
 284     feeddb = dbm.open(os.path.join(statedir, "feeds"), "c")
 285     if feeddb.has_key(url):
 286         data = feeddb[url]
 287         data = cgi.parse_qs(data)
 288         response = open_url("HEAD", url)
 289         headers = None
 290         if response:
 291             headers = response.getheaders()
 292         ischanged = False
 293         try:
 294             for header in headers:
 295                 if header[0] == "content-length":
 296                     if header[1] != data["content-length"][0]:
 297                         ischanged = True
 298                 elif header[0] == "etag":
 299                     if header[1] != data["etag"][0]:
 300                         ischanged = True
 301                 elif header[0] == "last-modified":
 302                     if header[1] != data["last-modified"][0]:
 303                         ischanged = True
 304                 elif header[0] == "content-md5":
 305                     if header[1] != data["content-md5"][0]:
 306                         ischanged = True
 307         except:
 308             ischanged = True
 309         if ischanged:
 310             response = open_url("GET", url)
 311             if response != None:
 312                 headers = response.getheaders()
 313                 feedhandle = response
 314             else:
 315                 sys.stderr.write("Failed to fetch feed: %s\n" %(url))
 316                 return
 317         else:
 318             return # don't need to do anything, nothings changed.
 319     else:
 320         response = open_url("GET", url)
 321         if response != None:
 322             headers = response.getheaders()
 323             feedhandle = response
 324         else:
 325             sys.stderr.write("Failed to fetch feed: %s\n" %(url))
 326             return
 327
 328     fp = feedparser.parse(feedhandle)
 329     db = dbm.open(os.path.join(statedir, "seen"), "c")
 330     for item in fp["items"]:
 331         # have we seen it before?
 332         # need to work out what the content is first...
 333
 334         if item.has_key("content"):
 335             content = item["content"][0]["value"]
 336         else:
 337             content = item["summary"]
 338
 339         md5sum = md5.md5(content.encode("utf-8")).hexdigest()
 340
 341         prevmessageid = None
 342
 343         # check if there's a guid too - if that exists and we match the md5,
 344         # return
 345         if item.has_key("guid"):
 346             if db.has_key(url + "|" + item["guid"]):
 347                 data = db[url + "|" + item["guid"]]
 348                 data = cgi.parse_qs(data)
 349                 if data["contentmd5"][0] == md5sum:
 350                     continue
 351
 352         if db.has_key(url + "|" + item["link"]):
 353             data = db[url + "|" + item["link"]]
 354             data = cgi.parse_qs(data)
 355             if data.has_key("message-id"):
 356                 prevmessageid = data["message-id"][0]
 357             if data["contentmd5"][0] == md5sum:
 358                 continue
 359
 360         try:
 361             author = item["author"]
 362         except:
 363             author = url
 364
 365         # create a basic email message
 366         msg = MIMEMultipart("alternative")
 367         messageid = "<" \
 368             + datetime.datetime.now().strftime("%Y%m%d%H%M") \
 369             + "." \
 370             + "".join( \
 371                 [random.choice( \
 372                     string.ascii_letters + string.digits \
 373                     ) for a in range(0,6) \
 374                 ]) + "@" + socket.gethostname() + ">"
 375         msg.add_header("Message-ID", messageid)
 376         msg.set_unixfrom("\"%s\" <rss2maildir@localhost>" %(url))
 377         msg.add_header("From", "\"%s\" <rss2maildir@localhost>" %(author))
 378         msg.add_header("To", "\"%s\" <rss2maildir@localhost>" %(url))
 379         if prevmessageid:
 380             msg.add_header("References", prevmessageid)
 381         createddate = datetime.datetime.now() \
 382             .strftime("%a, %e %b %Y %T -0000")
 383         try:
 384             createddate = datetime.datetime(*item["updated_parsed"][0:6]) \
 385                 .strftime("%a, %e %b %Y %T -0000")
 386         except:
 387             pass
 388         msg.add_header("Date", createddate)
 389         msg.add_header("Subject", item["title"])
 390         msg.set_default_type("text/plain")
 391
 392         htmlcontent = content.encode("utf-8")
 393         htmlcontent = "%s\n\n<p>Item URL: <a href='%s'>%s</a></p>" %( \
 394             content, \
 395             item["link"], \
 396             item["link"] )
 397         htmlpart = MIMEText(htmlcontent.encode("utf-8"), "html", "utf-8")
 398         textparser = HTML2Text()
 399         textparser.feed(content.encode("utf-8"))
 400         textcontent = textparser.gettext()
 401         textcontent = "%s\n\nItem URL: %s" %( \
 402             textcontent, \
 403             item["link"] )
 404         textpart = MIMEText(textcontent.encode("utf-8"), "plain", "utf-8")
 405         msg.attach(textpart)
 406         msg.attach(htmlpart)
 407
 408         # start by working out the filename we should be writting to, we do
 409         # this following the normal maildir style rules
 410         fname = str(os.getpid()) \
 411             + "." + socket.gethostname() \
 412             + "." + "".join( \
 413                 [random.choice( \
 414                     string.ascii_letters + string.digits \
 415                     ) for a in range(0,10) \
 416                 ]) + "." \
 417             + datetime.datetime.now().strftime('%s')
 418         fn = os.path.join(maildir, "tmp", fname)
 419         fh = open(fn, "w")
 420         fh.write(msg.as_string())
 421         fh.close()
 422         # now move it in to the new directory
 423         newfn = os.path.join(maildir, "new", fname)
 424         os.link(fn, newfn)
 425         os.unlink(fn)
 426
 427         # now add to the database about the item
 428         if prevmessageid:
 429             messageid = prevmessageid + " " + messageid
 430         if item.has_key("guid") and item["guid"] != item["link"]:
 431             data = urllib.urlencode(( \
 432                 ("message-id", messageid), \
 433                 ("created", createddate), \
 434                 ("contentmd5", md5sum) \
 435                 ))
 436             db[url + "|" + item["guid"]] = data
 437             try:
 438                 data = db[url + "|" + item["link"]]
 439                 data = cgi.parse_qs(data)
 440                 newdata = urllib.urlencode(( \
 441                     ("message-id", messageid), \
 442                     ("created", data["created"][0]), \
 443                     ("contentmd5", data["contentmd5"][0]) \
 444                     ))
 445                 db[url + "|" + item["link"]] = newdata
 446             except:
 447                 db[url + "|" + item["link"]] = data
 448         else:
 449             data = urllib.urlencode(( \
 450                 ("message-id", messageid), \
 451                 ("created", createddate), \
 452                 ("contentmd5", md5sum) \
 453                 ))
 454             db[url + "|" + item["link"]] = data
 455
 456     if headers:
 457         data = []
 458         for header in headers:
 459             if header[0] in ["content-md5", "etag", "last-modified", "content-length"]:
 460                 data.append((header[0], header[1]))
 461         if len(data) > 0:
 462             data = urllib.urlencode(data)
 463             feeddb[url] = data
 464
 465     db.close()
 466     feeddb.close()
 467
 468 if __name__ == "__main__":
 469     # This only gets executed if we really called the program
 470     # first off, parse the command line arguments
 471
 472     oparser = OptionParser()
 473     oparser.add_option(
 474         "-c", "--conf", dest="conf",
 475         help="location of config file"
 476         )
 477     oparser.add_option(
 478         "-s", "--statedir", dest="statedir",
 479         help="location of directory to store state in"
 480         )
 481
 482     (options, args) = oparser.parse_args()
 483
 484     # check for the configfile
 485
 486     configfile = None
 487
 488     if options.conf != None:
 489         # does the file exist?
 490         try:
 491             os.stat(options.conf)
 492             configfile = options.conf
 493         except:
 494             # should exit here as the specified file doesn't exist
 495             sys.stderr.write( \
 496                 "Config file %s does not exist. Exiting.\n" %(options.conf,))
 497             sys.exit(2)
 498     else:
 499         # check through the default locations
 500         try:
 501             os.stat("%s/.rss2maildir.conf" %(os.environ["HOME"],))
 502             configfile = "%s/.rss2maildir.conf" %(os.environ["HOME"],)
 503         except:
 504             try:
 505                 os.stat("/etc/rss2maildir.conf")
 506                 configfile = "/etc/rss2maildir.conf"
 507             except:
 508                 sys.stderr.write("No config file found. Exiting.\n")
 509                 sys.exit(2)
 510
 511     # Right - if we've got this far, we've got a config file, now for the hard
 512     # bits...
 513
 514     scp = SafeConfigParser()
 515     scp.read(configfile)
 516
 517     maildir_root = "RSSMaildir"
 518     state_dir = "state"
 519
 520     if options.statedir != None:
 521         state_dir = options.statedir
 522         try:
 523             mode = os.stat(state_dir)[stat.ST_MODE]
 524             if not stat.S_ISDIR(mode):
 525                 sys.stderr.write( \
 526                     "State directory (%s) is not a directory\n" %(state_dir))
 527                 sys.exit(1)
 528         except:
 529             # try to make the directory
 530             try:
 531                 os.mkdir(state_dir)
 532             except:
 533                 sys.stderr.write("Couldn't create statedir %s" %(state_dir))
 534                 sys.exit(1)
 535     elif scp.has_option("general", "state_dir"):
 536         new_state_dir = scp.get("general", "state_dir")
 537         try:
 538             mode = os.stat(state_dir)[stat.ST_MODE]
 539             if not stat.S_ISDIR(mode):
 540                 sys.stderr.write( \
 541                     "State directory (%s) is not a directory\n" %(state_dir))
 542                 sys.exit(1)
 543         except:
 544             # try to create it
 545             try:
 546                 os.mkdir(new_state_dir)
 547                 state_dir = new_state_dir
 548             except:
 549                 sys.stderr.write( \
 550                     "Couldn't create state directory %s\n" %(new_state_dir))
 551                 sys.exit(1)
 552     else:
 553         try:
 554             mode = os.stat(state_dir)[stat.ST_MODE]
 555             if not stat.S_ISDIR(mode):
 556                 sys.stderr.write( \
 557                     "State directory %s is not a directory\n" %(state_dir))
 558                 sys.exit(1)
 559         except:
 560             try:
 561                 os.mkdir(state_dir)
 562             except:
 563                 sys.stderr.write( \
 564                     "State directory %s could not be created\n" %(state_dir))
 565                 sys.exit(1)
 566
 567     if scp.has_option("general", "maildir_root"):
 568         maildir_root = scp.get("general", "maildir_root")
 569
 570     try:
 571         mode = os.stat(maildir_root)[stat.ST_MODE]
 572         if not stat.S_ISDIR(mode):
 573             sys.stderr.write( \
 574                 "Maildir Root %s is not a directory\n" \
 575                 %(maildir_root))
 576             sys.exit(1)
 577     except:
 578         try:
 579             os.mkdir(maildir_root)
 580         except:
 581             sys.stderr.write("Couldn't create Maildir Root %s\n" \
 582                 %(maildir_root))
 583             sys.exit(1)
 584
 585     feeds = scp.sections()
 586     try:
 587         feeds.remove("general")
 588     except:
 589         pass
 590
 591     for section in feeds:
 592         # check if the directory exists
 593         maildir = None
 594         try:
 595             maildir = scp.get(section, "maildir")
 596         except:
 597             maildir = section
 598
 599         maildir = urllib.urlencode(((section, maildir),)).split("=")[1]
 600         maildir = os.path.join(maildir_root, maildir)
 601
 602         try:
 603             exists = os.stat(maildir)
 604             if stat.S_ISDIR(exists[stat.ST_MODE]):
 605                 # check if there's a new, cur and tmp directory
 606                 try:
 607                     mode = os.stat(os.path.join(maildir, "cur"))[stat.ST_MODE]
 608                 except:
 609                     os.mkdir(os.path.join(maildir, "cur"))
 610                     if not stat.S_ISDIR(mode):
 611                         sys.stderr.write("Broken maildir: %s\n" %(maildir))
 612                 try:
 613                     mode = os.stat(os.path.join(maildir, "tmp"))[stat.ST_MODE]
 614                 except:
 615                     os.mkdir(os.path.join(maildir, "tmp"))
 616                     if not stat.S_ISDIR(mode):
 617                         sys.stderr.write("Broken maildir: %s\n" %(maildir))
 618                 try:
 619                     mode = os.stat(os.path.join(maildir, "new"))[stat.ST_MODE]
 620                     if not stat.S_ISDIR(mode):
 621                         sys.stderr.write("Broken maildir: %s\n" %(maildir))
 622                 except:
 623                     os.mkdir(os.path.join(maildir, "new"))
 624             else:
 625                 sys.stderr.write("Broken maildir: %s\n" %(maildir))
 626         except:
 627             try:
 628                 os.mkdir(maildir)
 629             except:
 630                 sys.stderr.write("Couldn't create root maildir %s\n" \
 631                     %(maildir))
 632                 sys.exit(1)
 633             try:
 634                 os.mkdir(os.path.join(maildir, "new"))
 635                 os.mkdir(os.path.join(maildir, "cur"))
 636                 os.mkdir(os.path.join(maildir, "tmp"))
 637             except:
 638                 sys.stderr.write( \
 639                     "Couldn't create required maildir directories for %s\n" \
 640                     %(section,))
 641                 sys.exit(1)
 642
 643         # right - we've got the directories, we've got the section, we know the
 644         # url... lets play!
 645
 646         parse_and_deliver(maildir, section, state_dir)