rss2maildir.py

   1 #!/usr/bin/python
   2 # coding=utf-8
   3
   4 # rss2maildir.py - RSS feeds to Maildir 1 email per item
   5 # Copyright (C) 2007  Brett Parker <iDunno@sommitrealweird.co.uk>
   6 #
   7 # This program is free software: you can redistribute it and/or modify
   8 # it under the terms of the GNU General Public License as published by
   9 # the Free Software Foundation, either version 3 of the License, or
  10 # (at your option) any later version.
  11 #
  12 # This program is distributed in the hope that it will be useful,
  13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15 # GNU General Public License for more details.
  16 #
  17 # You should have received a copy of the GNU General Public License
  18 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
  19
  20 import sys
  21 import os
  22 import stat
  23 import httplib
  24 import urllib
  25
  26 import feedparser
  27
  28 from email.MIMEMultipart import MIMEMultipart
  29 from email.MIMEText import MIMEText
  30
  31 import datetime
  32 import random
  33 import string
  34 import textwrap
  35
  36 import socket
  37
  38 from optparse import OptionParser
  39 from ConfigParser import SafeConfigParser
  40
  41 from base64 import b64encode
  42 import md5
  43
  44 import cgi
  45 import dbm
  46
  47 from HTMLParser import HTMLParser
  48
  49 class HTML2Text(HTMLParser):
  50     entities = {
  51         "amp": "&",
  52         "lt": "<",
  53         "gt": ">",
  54         "pound": "£",
  55         "copy": "©",
  56         "apos": "'",
  57         "quot": "\"",
  58         "nbsp": " ",
  59         }
  60
  61     def __init__(self):
  62         self.inheadingone = False
  63         self.inheadingtwo = False
  64         self.inotherheading = False
  65         self.inparagraph = True
  66         self.inblockquote = False
  67         self.inlink = False
  68         self.text = u''
  69         self.currentparagraph = u''
  70         self.headingtext = u''
  71         self.blockquote = u''
  72         self.inpre = False
  73         self.inul = False
  74         self.initem = False
  75         self.item = u''
  76         HTMLParser.__init__(self)
  77
  78     def handle_starttag(self, tag, attrs):
  79         if tag.lower() == "h1":
  80             self.inheadingone = True
  81             self.inparagraph = False
  82         elif tag.lower() == "h2":
  83             self.inheadingtwo = True
  84             self.inparagraph = False
  85         elif tag.lower() in ["h3", "h4", "h5", "h6"]:
  86             self.inotherheading = True
  87             self.inparagraph = False
  88         elif tag.lower() == "a":
  89             self.inlink = True
  90         elif tag.lower() == "br":
  91             self.handle_br()
  92         elif tag.lower() == "blockquote":
  93             self.inblockquote = True
  94             self.text = self.text + u'\n'
  95         elif tag.lower() == "p":
  96             if self.text != "":
  97                 self.text = self.text + u'\n\n'
  98             if self.inparagraph:
  99                 self.text = self.text \
 100                     + u'\n'.join(textwrap.wrap(self.currentparagraph, 70))
 101             self.currentparagraph = u''
 102             self.inparagraph = True
 103         elif tag.lower() == "pre":
 104             self.text = self.text + "\n"
 105             self.inpre = True
 106             self.inparagraph = False
 107             self.inblockquote = False
 108         elif tag.lower() == "ul":
 109             self.item = u''
 110             self.inul = True
 111             self.text = self.text + "\n"
 112         elif tag.lower() == "li" and self.inul:
 113             if not self.initem:
 114                 self.initem = True
 115                 self.item = u''
 116             else:
 117                 self.text = self.text \
 118                     + u' * ' \
 119                     + u'\n   '.join([a.strip() for a in \
 120                         textwrap.wrap(self.item, 67)]) \
 121                     + u'\n'
 122                 self.item = u''
 123
 124     def handle_startendtag(self, tag, attrs):
 125         if tag.lower() == "br":
 126             self.handle_br()
 127
 128     def handle_br(self):
 129             if self.inparagraph:
 130                 self.text = self.text \
 131                 + u'\n'.join( \
 132                     [a \
 133                         for a in textwrap.wrap( \
 134                             self.currentparagraph, 70) \
 135                     ] \
 136                 ) \
 137                 + u'\n'
 138                 self.currentparagraph = u''
 139             elif self.inblockquote:
 140                 self.text = self.text \
 141                     + u'\n> ' \
 142                     + u'\n> '.join( \
 143                         [a \
 144                             for a in textwrap.wrap( \
 145                                 self.blockquote.encode("utf-8") \
 146                                 , 68) \
 147                         ] \
 148                     ) \
 149                     + u'\n'
 150                 self.blockquote = u''
 151             else:
 152                 self.text = self.text + "\n"
 153
 154     def handle_endtag(self, tag):
 155         if tag.lower() == "h1":
 156             self.inheadingone = False
 157             self.text = self.text \
 158                 + u'\n\n' \
 159                 + self.headingtext.encode("utf-8") \
 160                 + u'\n' \
 161                 + u'=' * len(self.headingtext.encode("utf-8").strip())
 162             self.headingtext = u''
 163         elif tag.lower() == "h2":
 164             self.inheadingtwo = False
 165             self.text = self.text \
 166                 + u'\n\n' \
 167                 + self.headingtext.encode("utf-8") \
 168                 + u'\n' \
 169                 + u'-' * len(self.headingtext.encode("utf-8").strip())
 170             self.headingtext = u''
 171         elif tag.lower() in ["h3", "h4", "h5", "h6"]:
 172             self.inotherheading = False
 173             self.text = self.text \
 174                 + u'\n\n' \
 175                 + self.headingtext.encode("utf-8") \
 176                 + u'\n' \
 177                 + u'~' * len(self.headingtext.encode("utf-8").strip())
 178             self.headingtext = u''
 179         elif tag.lower() == "p":
 180             self.text = self.text \
 181                 + u'\n'.join(textwrap.wrap( \
 182                     self.currentparagraph, 70) \
 183                 )
 184             self.inparagraph = False
 185             self.currentparagraph = u''
 186         elif tag.lower() == "blockquote":
 187             self.text = self.text \
 188                 + u'\n> ' \
 189                 + u'\n> '.join( \
 190                     [a.strip() \
 191                         for a in textwrap.wrap( \
 192                             self.blockquote, 68)] \
 193                     ) \
 194                 + u'\n'
 195             self.inblockquote = False
 196             self.blockquote = u''
 197         elif tag.lower() == "pre":
 198             self.inpre = False
 199         elif tag.lower() == "li":
 200             self.initem = False
 201             if self.item != "":
 202                 self.text = self.text \
 203                     + u' * ' \
 204                     + u'\n   '.join( \
 205                         [a.strip() for a in textwrap.wrap(self.item, 67)]) \
 206                     + u'\n'
 207             self.item = u''
 208         elif tag.lower() == "ul":
 209             self.inul = False
 210
 211     def handle_data(self, data):
 212         if self.inheadingone or self.inheadingtwo or self.inotherheading:
 213             self.headingtext = self.headingtext \
 214                 + unicode(data, "utf-8").strip() \
 215                 + u' '
 216         elif self.inblockquote:
 217             self.blockquote = self.blockquote \
 218                 + unicode(data, "utf-8").strip() \
 219                 + u' '
 220         elif self.inparagraph:
 221             self.currentparagraph = self.currentparagraph \
 222                 + unicode(data, "utf-8").strip() \
 223                 + u' '
 224         elif self.inul and self.initem:
 225             self.item = self.item + unicode(data, "utf-8")
 226         elif self.inpre:
 227             self.text = self.text + unicode(data, "utf-8")
 228         else:
 229             self.text = self.text + unicode(data, "utf-8").strip() + u' '
 230
 231     def handle_entityref(self, name):
 232         entity = name
 233         if HTML2Text.entities.has_key(name.lower()):
 234             entity = HTML2Text.entities[name.lower()]
 235         elif name[0] == "#":
 236             entity = unichr(int(name[1:]))
 237         else:
 238             entity = "&" + name + ";"
 239
 240         if self.inparagraph:
 241             self.currentparagraph = self.currentparagraph \
 242                 + unicode(entity, "utf-8")
 243         elif self.inblockquote:
 244             self.blockquote = self.blockquote + unicode(entity, "utf-8")
 245         else:
 246             self.text = self.text + unicode(entity, "utf-8")
 247
 248     def gettext(self):
 249         data = self.text
 250         if self.inparagraph:
 251             data = data + "\n".join(textwrap.wrap(self.currentparagraph, 70))
 252         return data
 253
 254 def open_url(method, url):
 255     redirectcount = 0
 256     while redirectcount < 3:
 257         (type, rest) = urllib.splittype(url)
 258         (host, path) = urllib.splithost(rest)
 259         (host, port) = urllib.splitport(host)
 260         if port == None:
 261             port = 80
 262         try:
 263             conn = httplib.HTTPConnection("%s:%s" %(host, port))
 264             conn.request(method, path)
 265             response = conn.getresponse()
 266             if response.status in [301, 302, 303, 307]:
 267                 headers = response.getheaders()
 268                 for header in headers:
 269                     if header[0] == "location":
 270                         url = header[1]
 271             elif response.status == 200:
 272                 return response
 273         except:
 274             pass
 275         redirectcount = redirectcount + 1
 276     return None
 277
 278 def parse_and_deliver(maildir, url, statedir):
 279     feedhandle = None
 280     headers = None
 281     # first check if we know about this feed already
 282     feeddb = dbm.open(os.path.join(statedir, "feeds"), "c")
 283     if feeddb.has_key(url):
 284         data = feeddb[url]
 285         data = cgi.parse_qs(data)
 286         response = open_url("HEAD", url)
 287         headers = None
 288         if response:
 289             headers = response.getheaders()
 290         ischanged = False
 291         try:
 292             for header in headers:
 293                 if header[0] == "content-length":
 294                     if header[1] != data["content-length"][0]:
 295                         ischanged = True
 296                 elif header[0] == "etag":
 297                     if header[1] != data["etag"][0]:
 298                         ischanged = True
 299                 elif header[0] == "last-modified":
 300                     if header[1] != data["last-modified"][0]:
 301                         ischanged = True
 302                 elif header[0] == "content-md5":
 303                     if header[1] != data["content-md5"][0]:
 304                         ischanged = True
 305         except:
 306             ischanged = True
 307         if ischanged:
 308             response = open_url("GET", url)
 309             if response != None:
 310                 headers = response.getheaders()
 311                 feedhandle = response
 312             else:
 313                 sys.stderr.write("Failed to fetch feed: %s\n" %(url))
 314                 return
 315         else:
 316             return # don't need to do anything, nothings changed.
 317     else:
 318         response = open_url("GET", url)
 319         if response != None:
 320             headers = response.getheaders()
 321             feedhandle = response
 322         else:
 323             sys.stderr.write("Failed to fetch feed: %s\n" %(url))
 324             return
 325
 326     fp = feedparser.parse(feedhandle)
 327     db = dbm.open(os.path.join(statedir, "seen"), "c")
 328     for item in fp["items"]:
 329         # have we seen it before?
 330         # need to work out what the content is first...
 331
 332         if item.has_key("content"):
 333             content = item["content"][0]["value"]
 334         else:
 335             content = item["summary"]
 336
 337         md5sum = md5.md5(content.encode("utf-8")).hexdigest()
 338
 339         prevmessageid = None
 340
 341         # check if there's a guid too - if that exists and we match the md5,
 342         # return
 343         if item.has_key("guid"):
 344             if db.has_key(url + "|" + item["guid"]):
 345                 data = db[url + "|" + item["guid"]]
 346                 data = cgi.parse_qs(data)
 347                 if data["contentmd5"][0] == md5sum:
 348                     continue
 349
 350         if db.has_key(url + "|" + item["link"]):
 351             data = db[url + "|" + item["link"]]
 352             data = cgi.parse_qs(data)
 353             if data.has_key("message-id"):
 354                 prevmessageid = data["message-id"][0]
 355             if data["contentmd5"][0] == md5sum:
 356                 continue
 357
 358         try:
 359             author = item["author"]
 360         except:
 361             author = url
 362
 363         # create a basic email message
 364         msg = MIMEMultipart("alternative")
 365         messageid = "<" \
 366             + datetime.datetime.now().strftime("%Y%m%d%H%M") \
 367             + "." \
 368             + "".join( \
 369                 [random.choice( \
 370                     string.ascii_letters + string.digits \
 371                     ) for a in range(0,6) \
 372                 ]) + "@" + socket.gethostname() + ">"
 373         msg.add_header("Message-ID", messageid)
 374         msg.set_unixfrom("\"%s\" <rss2maildir@localhost>" %(url))
 375         msg.add_header("From", "\"%s\" <rss2maildir@localhost>" %(author))
 376         msg.add_header("To", "\"%s\" <rss2maildir@localhost>" %(url))
 377         if prevmessageid:
 378             msg.add_header("References", prevmessageid)
 379         createddate = datetime.datetime.now() \
 380             .strftime("%a, %e %b %Y %T -0000")
 381         try:
 382             createddate = datetime.datetime(*item["updated_parsed"][0:6]) \
 383                 .strftime("%a, %e %b %Y %T -0000")
 384         except:
 385             pass
 386         msg.add_header("Date", createddate)
 387         msg.add_header("Subject", item["title"])
 388         msg.set_default_type("text/plain")
 389
 390         htmlcontent = content.encode("utf-8")
 391         htmlcontent = "%s\n\n<p>Item URL: <a href='%s'>%s</a></p>" %( \
 392             content, \
 393             item["link"], \
 394             item["link"] )
 395         htmlpart = MIMEText(htmlcontent.encode("utf-8"), "html", "utf-8")
 396         textparser = HTML2Text()
 397         textparser.feed(content.encode("utf-8"))
 398         textcontent = textparser.gettext()
 399         textcontent = "%s\n\nItem URL: %s" %( \
 400             textcontent, \
 401             item["link"] )
 402         textpart = MIMEText(textcontent.encode("utf-8"), "plain", "utf-8")
 403         msg.attach(textpart)
 404         msg.attach(htmlpart)
 405
 406         # start by working out the filename we should be writting to, we do
 407         # this following the normal maildir style rules
 408         fname = str(os.getpid()) \
 409             + "." + socket.gethostname() \
 410             + "." + "".join( \
 411                 [random.choice( \
 412                     string.ascii_letters + string.digits \
 413                     ) for a in range(0,10) \
 414                 ]) + "." \
 415             + datetime.datetime.now().strftime('%s')
 416         fn = os.path.join(maildir, "tmp", fname)
 417         fh = open(fn, "w")
 418         fh.write(msg.as_string())
 419         fh.close()
 420         # now move it in to the new directory
 421         newfn = os.path.join(maildir, "new", fname)
 422         os.link(fn, newfn)
 423         os.unlink(fn)
 424
 425         # now add to the database about the item
 426         if prevmessageid:
 427             messageid = prevmessageid + " " + messageid
 428         if item.has_key("guid") and item["guid"] != item["link"]:
 429             data = urllib.urlencode(( \
 430                 ("message-id", messageid), \
 431                 ("created", createddate), \
 432                 ("contentmd5", md5sum) \
 433                 ))
 434             db[url + "|" + item["guid"]] = data
 435             try:
 436                 data = db[url + "|" + item["link"]]
 437                 data = cgi.parse_qs(data)
 438                 newdata = urllib.urlencode(( \
 439                     ("message-id", messageid), \
 440                     ("created", data["created"][0]), \
 441                     ("contentmd5", data["contentmd5"][0]) \
 442                     ))
 443                 db[url + "|" + item["link"]] = newdata
 444             except:
 445                 db[url + "|" + item["link"]] = data
 446         else:
 447             data = urllib.urlencode(( \
 448                 ("message-id", messageid), \
 449                 ("created", createddate), \
 450                 ("contentmd5", md5sum) \
 451                 ))
 452             db[url + "|" + item["link"]] = data
 453
 454     if headers:
 455         data = []
 456         for header in headers:
 457             if header[0] in ["content-md5", "etag", "last-modified", "content-length"]:
 458                 data.append((header[0], header[1]))
 459         if len(data) > 0:
 460             data = urllib.urlencode(data)
 461             feeddb[url] = data
 462
 463     db.close()
 464     feeddb.close()
 465
 466 if __name__ == "__main__":
 467     # This only gets executed if we really called the program
 468     # first off, parse the command line arguments
 469
 470     oparser = OptionParser()
 471     oparser.add_option(
 472         "-c", "--conf", dest="conf",
 473         help="location of config file"
 474         )
 475     oparser.add_option(
 476         "-s", "--statedir", dest="statedir",
 477         help="location of directory to store state in"
 478         )
 479
 480     (options, args) = oparser.parse_args()
 481
 482     # check for the configfile
 483
 484     configfile = None
 485
 486     if options.conf != None:
 487         # does the file exist?
 488         try:
 489             os.stat(options.conf)
 490             configfile = options.conf
 491         except:
 492             # should exit here as the specified file doesn't exist
 493             sys.stderr.write( \
 494                 "Config file %s does not exist. Exiting.\n" %(options.conf,))
 495             sys.exit(2)
 496     else:
 497         # check through the default locations
 498         try:
 499             os.stat("%s/.rss2maildir.conf" %(os.environ["HOME"],))
 500             configfile = "%s/.rss2maildir.conf" %(os.environ["HOME"],)
 501         except:
 502             try:
 503                 os.stat("/etc/rss2maildir.conf")
 504                 configfile = "/etc/rss2maildir.conf"
 505             except:
 506                 sys.stderr.write("No config file found. Exiting.\n")
 507                 sys.exit(2)
 508
 509     # Right - if we've got this far, we've got a config file, now for the hard
 510     # bits...
 511
 512     scp = SafeConfigParser()
 513     scp.read(configfile)
 514
 515     maildir_root = "RSSMaildir"
 516     state_dir = "state"
 517
 518     if options.statedir != None:
 519         state_dir = options.statedir
 520         try:
 521             mode = os.stat(state_dir)[stat.ST_MODE]
 522             if not stat.S_ISDIR(mode):
 523                 sys.stderr.write( \
 524                     "State directory (%s) is not a directory\n" %(state_dir))
 525                 sys.exit(1)
 526         except:
 527             # try to make the directory
 528             try:
 529                 os.mkdir(state_dir)
 530             except:
 531                 sys.stderr.write("Couldn't create statedir %s" %(state_dir))
 532                 sys.exit(1)
 533     elif scp.has_option("general", "state_dir"):
 534         new_state_dir = scp.get("general", "state_dir")
 535         try:
 536             mode = os.stat(state_dir)[stat.ST_MODE]
 537             if not stat.S_ISDIR(mode):
 538                 sys.stderr.write( \
 539                     "State directory (%s) is not a directory\n" %(state_dir))
 540                 sys.exit(1)
 541         except:
 542             # try to create it
 543             try:
 544                 os.mkdir(new_state_dir)
 545                 state_dir = new_state_dir
 546             except:
 547                 sys.stderr.write( \
 548                     "Couldn't create state directory %s\n" %(new_state_dir))
 549                 sys.exit(1)
 550     else:
 551         try:
 552             mode = os.stat(state_dir)[stat.ST_MODE]
 553             if not stat.S_ISDIR(mode):
 554                 sys.stderr.write( \
 555                     "State directory %s is not a directory\n" %(state_dir))
 556                 sys.exit(1)
 557         except:
 558             try:
 559                 os.mkdir(state_dir)
 560             except:
 561                 sys.stderr.write( \
 562                     "State directory %s could not be created\n" %(state_dir))
 563                 sys.exit(1)
 564
 565     if scp.has_option("general", "maildir_root"):
 566         maildir_root = scp.get("general", "maildir_root")
 567
 568     try:
 569         mode = os.stat(maildir_root)[stat.ST_MODE]
 570         if not stat.S_ISDIR(mode):
 571             sys.stderr.write( \
 572                 "Maildir Root %s is not a directory\n" \
 573                 %(maildir_root))
 574             sys.exit(1)
 575     except:
 576         try:
 577             os.mkdir(maildir_root)
 578         except:
 579             sys.stderr.write("Couldn't create Maildir Root %s\n" \
 580                 %(maildir_root))
 581             sys.exit(1)
 582
 583     feeds = scp.sections()
 584     try:
 585         feeds.remove("general")
 586     except:
 587         pass
 588
 589     for section in feeds:
 590         # check if the directory exists
 591         maildir = None
 592         try:
 593             maildir = scp.get(section, "maildir")
 594         except:
 595             maildir = section
 596
 597         maildir = urllib.urlencode(((section, maildir),)).split("=")[1]
 598         maildir = os.path.join(maildir_root, maildir)
 599
 600         try:
 601             exists = os.stat(maildir)
 602             if stat.S_ISDIR(exists[stat.ST_MODE]):
 603                 # check if there's a new, cur and tmp directory
 604                 try:
 605                     mode = os.stat(os.path.join(maildir, "cur"))[stat.ST_MODE]
 606                 except:
 607                     os.mkdir(os.path.join(maildir, "cur"))
 608                     if not stat.S_ISDIR(mode):
 609                         sys.stderr.write("Broken maildir: %s\n" %(maildir))
 610                 try:
 611                     mode = os.stat(os.path.join(maildir, "tmp"))[stat.ST_MODE]
 612                 except:
 613                     os.mkdir(os.path.join(maildir, "tmp"))
 614                     if not stat.S_ISDIR(mode):
 615                         sys.stderr.write("Broken maildir: %s\n" %(maildir))
 616                 try:
 617                     mode = os.stat(os.path.join(maildir, "new"))[stat.ST_MODE]
 618                     if not stat.S_ISDIR(mode):
 619                         sys.stderr.write("Broken maildir: %s\n" %(maildir))
 620                 except:
 621                     os.mkdir(os.path.join(maildir, "new"))
 622             else:
 623                 sys.stderr.write("Broken maildir: %s\n" %(maildir))
 624         except:
 625             try:
 626                 os.mkdir(maildir)
 627             except:
 628                 sys.stderr.write("Couldn't create root maildir %s\n" \
 629                     %(maildir))
 630                 sys.exit(1)
 631             try:
 632                 os.mkdir(os.path.join(maildir, "new"))
 633                 os.mkdir(os.path.join(maildir, "cur"))
 634                 os.mkdir(os.path.join(maildir, "tmp"))
 635             except:
 636                 sys.stderr.write( \
 637                     "Couldn't create required maildir directories for %s\n" \
 638                     %(section,))
 639                 sys.exit(1)
 640
 641         # right - we've got the directories, we've got the section, we know the
 642         # url... lets play!
 643
 644         parse_and_deliver(maildir, section, state_dir)