rss2maildir.py

   1 #!/usr/bin/python
   2 # coding=utf-8
   3
   4 # rss2maildir.py - RSS feeds to Maildir 1 email per item
   5 # Copyright (C) 2007  Brett Parker <iDunno@sommitrealweird.co.uk>
   6 #
   7 # This program is free software: you can redistribute it and/or modify
   8 # it under the terms of the GNU General Public License as published by
   9 # the Free Software Foundation, either version 3 of the License, or
  10 # (at your option) any later version.
  11 #
  12 # This program is distributed in the hope that it will be useful,
  13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15 # GNU General Public License for more details.
  16 #
  17 # You should have received a copy of the GNU General Public License
  18 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
  19
  20 import sys
  21 import os
  22 import stat
  23 import httplib
  24 import urllib
  25
  26 import feedparser
  27
  28 from email.MIMEMultipart import MIMEMultipart
  29 from email.MIMEText import MIMEText
  30
  31 import datetime
  32 import random
  33 import string
  34 import textwrap
  35
  36 import socket
  37
  38 from optparse import OptionParser
  39 from ConfigParser import SafeConfigParser
  40
  41 from base64 import b64encode
  42 import md5
  43
  44 import cgi
  45 import dbm
  46
  47 from HTMLParser import HTMLParser
  48
  49 class HTML2Text(HTMLParser):
  50     entities = {
  51         "amp": "&",
  52         "lt": "<",
  53         "gt": ">",
  54         "pound": "£",
  55         "copy": "©",
  56         "apos": "'",
  57         "quot": "\"",
  58         "nbsp": " ",
  59         }
  60
  61     def __init__(self):
  62         self.inheadingone = False
  63         self.inheadingtwo = False
  64         self.inotherheading = False
  65         self.inparagraph = True
  66         self.inblockquote = False
  67         self.inlink = False
  68         self.text = u''
  69         self.currentparagraph = u''
  70         self.headingtext = u''
  71         self.blockquote = u''
  72         self.inpre = False
  73         self.inul = False
  74         self.initem = False
  75         self.item = u''
  76         HTMLParser.__init__(self)
  77
  78     def handle_starttag(self, tag, attrs):
  79         if tag.lower() == "h1":
  80             self.inheadingone = True
  81             self.inparagraph = False
  82         elif tag.lower() == "h2":
  83             self.inheadingtwo = True
  84             self.inparagraph = False
  85         elif tag.lower() in ["h3", "h4", "h5", "h6"]:
  86             self.inotherheading = True
  87             self.inparagraph = False
  88         elif tag.lower() == "a":
  89             self.inlink = True
  90         elif tag.lower() == "br":
  91             self.handle_br()
  92         elif tag.lower() == "blockquote":
  93             self.inblockquote = True
  94             self.text = self.text + u'\n'
  95         elif tag.lower() == "p":
  96             if self.text != "":
  97                 self.text = self.text + u'\n\n'
  98             if self.inparagraph:
  99                 self.text = self.text \
 100                     + u'\n'.join(textwrap.wrap(self.currentparagraph, 70))
 101             self.currentparagraph = u''
 102             self.inparagraph = True
 103         elif tag.lower() == "pre":
 104             self.text = self.text + "\n"
 105             self.inpre = True
 106             self.inparagraph = False
 107             self.inblockquote = False
 108         elif tag.lower() == "ul":
 109             self.item = u''
 110             self.inul = True
 111             self.text = self.text + "\n"
 112         elif tag.lower() == "li":
 113             if not self.initem:
 114                 self.initem = True
 115                 self.item = u''
 116             else:
 117                 self.text = self.text \
 118                     + u' * ' \
 119                     + u'\n   '.join([a.strip() for a in \
 120                         textwrap.wrap(self.item, 67)]) \
 121                     + u'\n'
 122                 self.item = u''
 123                 self.initem = True
 124
 125     def handle_startendtag(self, tag, attrs):
 126         if tag.lower() == "br":
 127             self.handle_br()
 128
 129     def handle_br(self):
 130             if self.inparagraph:
 131                 self.text = self.text \
 132                 + u'\n'.join( \
 133                     [a \
 134                         for a in textwrap.wrap( \
 135                             self.currentparagraph, 70) \
 136                     ] \
 137                 ) \
 138                 + u'\n'
 139                 self.currentparagraph = u''
 140             elif self.inblockquote:
 141                 self.text = self.text \
 142                     + u'\n> ' \
 143                     + u'\n> '.join( \
 144                         [a \
 145                             for a in textwrap.wrap( \
 146                                 self.blockquote.encode("utf-8") \
 147                                 , 68) \
 148                         ] \
 149                     ) \
 150                     + u'\n'
 151                 self.blockquote = u''
 152             else:
 153                 self.text = self.text + "\n"
 154
 155     def handle_endtag(self, tag):
 156         if tag.lower() == "h1":
 157             self.inheadingone = False
 158             self.text = self.text \
 159                 + u'\n\n' \
 160                 + self.headingtext.encode("utf-8") \
 161                 + u'\n' \
 162                 + u'=' * len(self.headingtext.encode("utf-8").strip())
 163             self.headingtext = u''
 164         elif tag.lower() == "h2":
 165             self.inheadingtwo = False
 166             self.text = self.text \
 167                 + u'\n\n' \
 168                 + self.headingtext.encode("utf-8") \
 169                 + u'\n' \
 170                 + u'-' * len(self.headingtext.encode("utf-8").strip())
 171             self.headingtext = u''
 172         elif tag.lower() in ["h3", "h4", "h5", "h6"]:
 173             self.inotherheading = False
 174             self.text = self.text \
 175                 + u'\n\n' \
 176                 + self.headingtext.encode("utf-8") \
 177                 + u'\n' \
 178                 + u'~' * len(self.headingtext.encode("utf-8").strip())
 179             self.headingtext = u''
 180         elif tag.lower() == "p":
 181             self.text = self.text \
 182                 + u'\n'.join(textwrap.wrap( \
 183                     self.currentparagraph, 70) \
 184                 )
 185             self.inparagraph = False
 186             self.currentparagraph = u''
 187         elif tag.lower() == "blockquote":
 188             self.text = self.text \
 189                 + u'\n> ' \
 190                 + u'\n> '.join( \
 191                     [a.strip() \
 192                         for a in textwrap.wrap( \
 193                             self.blockquote, 68)] \
 194                     ) \
 195                 + u'\n'
 196             self.inblockquote = False
 197             self.blockquote = u''
 198         elif tag.lower() == "pre":
 199             self.inpre = False
 200         elif tag.lower() == "li":
 201             self.initem = False
 202             if self.item != u'':
 203                 self.text = self.text \
 204                     + u' * ' \
 205                     + u'\n   '.join( \
 206                         [a.strip() for a in textwrap.wrap(self.item, 67)]) \
 207                     + u'\n'
 208             self.item = u''
 209         elif tag.lower() == "ul":
 210             self.inul = False
 211
 212     def handle_data(self, data):
 213         if self.inheadingone or self.inheadingtwo or self.inotherheading:
 214             self.headingtext = self.headingtext \
 215                 + unicode(data, "utf-8").strip() \
 216                 + u' '
 217         elif self.inblockquote:
 218             self.blockquote = self.blockquote \
 219                 + unicode(data, "utf-8").strip() \
 220                 + u' '
 221         elif self.initem:
 222             self.item = self.item + unicode(data, "utf-8")
 223         elif self.inparagraph:
 224             self.currentparagraph = self.currentparagraph \
 225                 + unicode(data, "utf-8").strip() \
 226                 + u' '
 227         elif self.inpre:
 228             self.text = self.text + unicode(data, "utf-8")
 229         else:
 230             self.text = self.text + unicode(data, "utf-8").strip() + u' '
 231
 232     def handle_entityref(self, name):
 233         entity = name
 234         if HTML2Text.entities.has_key(name.lower()):
 235             entity = HTML2Text.entities[name.lower()]
 236         elif name[0] == "#":
 237             entity = unichr(int(name[1:]))
 238         else:
 239             entity = "&" + name + ";"
 240
 241         if self.inparagraph:
 242             self.currentparagraph = self.currentparagraph \
 243                 + unicode(entity, "utf-8")
 244         elif self.inblockquote:
 245             self.blockquote = self.blockquote + unicode(entity, "utf-8")
 246         else:
 247             self.text = self.text + unicode(entity, "utf-8")
 248
 249     def gettext(self):
 250         data = self.text
 251         if self.inparagraph:
 252             data = data + "\n".join(textwrap.wrap(self.currentparagraph, 70))
 253         if data[-1] != '\n':
 254             data = data + '\n'
 255         return data
 256
 257 def open_url(method, url):
 258     redirectcount = 0
 259     while redirectcount < 3:
 260         (type, rest) = urllib.splittype(url)
 261         (host, path) = urllib.splithost(rest)
 262         (host, port) = urllib.splitport(host)
 263         if port == None:
 264             port = 80
 265         try:
 266             conn = httplib.HTTPConnection("%s:%s" %(host, port))
 267             conn.request(method, path)
 268             response = conn.getresponse()
 269             if response.status in [301, 302, 303, 307]:
 270                 headers = response.getheaders()
 271                 for header in headers:
 272                     if header[0] == "location":
 273                         url = header[1]
 274             elif response.status == 200:
 275                 return response
 276         except:
 277             pass
 278         redirectcount = redirectcount + 1
 279     return None
 280
 281 def parse_and_deliver(maildir, url, statedir):
 282     feedhandle = None
 283     headers = None
 284     # first check if we know about this feed already
 285     feeddb = dbm.open(os.path.join(statedir, "feeds"), "c")
 286     if feeddb.has_key(url):
 287         data = feeddb[url]
 288         data = cgi.parse_qs(data)
 289         response = open_url("HEAD", url)
 290         headers = None
 291         if response:
 292             headers = response.getheaders()
 293         ischanged = False
 294         try:
 295             for header in headers:
 296                 if header[0] == "content-length":
 297                     if header[1] != data["content-length"][0]:
 298                         ischanged = True
 299                 elif header[0] == "etag":
 300                     if header[1] != data["etag"][0]:
 301                         ischanged = True
 302                 elif header[0] == "last-modified":
 303                     if header[1] != data["last-modified"][0]:
 304                         ischanged = True
 305                 elif header[0] == "content-md5":
 306                     if header[1] != data["content-md5"][0]:
 307                         ischanged = True
 308         except:
 309             ischanged = True
 310         if ischanged:
 311             response = open_url("GET", url)
 312             if response != None:
 313                 headers = response.getheaders()
 314                 feedhandle = response
 315             else:
 316                 sys.stderr.write("Failed to fetch feed: %s\n" %(url))
 317                 return
 318         else:
 319             return # don't need to do anything, nothings changed.
 320     else:
 321         response = open_url("GET", url)
 322         if response != None:
 323             headers = response.getheaders()
 324             feedhandle = response
 325         else:
 326             sys.stderr.write("Failed to fetch feed: %s\n" %(url))
 327             return
 328
 329     fp = feedparser.parse(feedhandle)
 330     db = dbm.open(os.path.join(statedir, "seen"), "c")
 331     for item in fp["items"]:
 332         # have we seen it before?
 333         # need to work out what the content is first...
 334
 335         if item.has_key("content"):
 336             content = item["content"][0]["value"]
 337         else:
 338             content = item["summary"]
 339
 340         md5sum = md5.md5(content.encode("utf-8")).hexdigest()
 341
 342         prevmessageid = None
 343
 344         # check if there's a guid too - if that exists and we match the md5,
 345         # return
 346         if item.has_key("guid"):
 347             if db.has_key(url + "|" + item["guid"]):
 348                 data = db[url + "|" + item["guid"]]
 349                 data = cgi.parse_qs(data)
 350                 if data["contentmd5"][0] == md5sum:
 351                     continue
 352
 353         if db.has_key(url + "|" + item["link"]):
 354             data = db[url + "|" + item["link"]]
 355             data = cgi.parse_qs(data)
 356             if data.has_key("message-id"):
 357                 prevmessageid = data["message-id"][0]
 358             if data["contentmd5"][0] == md5sum:
 359                 continue
 360
 361         try:
 362             author = item["author"]
 363         except:
 364             author = url
 365
 366         # create a basic email message
 367         msg = MIMEMultipart("alternative")
 368         messageid = "<" \
 369             + datetime.datetime.now().strftime("%Y%m%d%H%M") \
 370             + "." \
 371             + "".join( \
 372                 [random.choice( \
 373                     string.ascii_letters + string.digits \
 374                     ) for a in range(0,6) \
 375                 ]) + "@" + socket.gethostname() + ">"
 376         msg.add_header("Message-ID", messageid)
 377         msg.set_unixfrom("\"%s\" <rss2maildir@localhost>" %(url))
 378         msg.add_header("From", "\"%s\" <rss2maildir@localhost>" %(author))
 379         msg.add_header("To", "\"%s\" <rss2maildir@localhost>" %(url))
 380         if prevmessageid:
 381             msg.add_header("References", prevmessageid)
 382         createddate = datetime.datetime.now() \
 383             .strftime("%a, %e %b %Y %T -0000")
 384         try:
 385             createddate = datetime.datetime(*item["updated_parsed"][0:6]) \
 386                 .strftime("%a, %e %b %Y %T -0000")
 387         except:
 388             pass
 389         msg.add_header("Date", createddate)
 390         msg.add_header("Subject", item["title"])
 391         msg.set_default_type("text/plain")
 392
 393         htmlcontent = content.encode("utf-8")
 394         htmlcontent = "%s\n\n<p>Item URL: <a href='%s'>%s</a></p>" %( \
 395             content, \
 396             item["link"], \
 397             item["link"] )
 398         htmlpart = MIMEText(htmlcontent.encode("utf-8"), "html", "utf-8")
 399         textparser = HTML2Text()
 400         textparser.feed(content.encode("utf-8"))
 401         textcontent = textparser.gettext()
 402         textcontent = "%s\n\nItem URL: %s" %( \
 403             textcontent, \
 404             item["link"] )
 405         textpart = MIMEText(textcontent.encode("utf-8"), "plain", "utf-8")
 406         msg.attach(textpart)
 407         msg.attach(htmlpart)
 408
 409         # start by working out the filename we should be writting to, we do
 410         # this following the normal maildir style rules
 411         fname = str(os.getpid()) \
 412             + "." + socket.gethostname() \
 413             + "." + "".join( \
 414                 [random.choice( \
 415                     string.ascii_letters + string.digits \
 416                     ) for a in range(0,10) \
 417                 ]) + "." \
 418             + datetime.datetime.now().strftime('%s')
 419         fn = os.path.join(maildir, "tmp", fname)
 420         fh = open(fn, "w")
 421         fh.write(msg.as_string())
 422         fh.close()
 423         # now move it in to the new directory
 424         newfn = os.path.join(maildir, "new", fname)
 425         os.link(fn, newfn)
 426         os.unlink(fn)
 427
 428         # now add to the database about the item
 429         if prevmessageid:
 430             messageid = prevmessageid + " " + messageid
 431         if item.has_key("guid") and item["guid"] != item["link"]:
 432             data = urllib.urlencode(( \
 433                 ("message-id", messageid), \
 434                 ("created", createddate), \
 435                 ("contentmd5", md5sum) \
 436                 ))
 437             db[url + "|" + item["guid"]] = data
 438             try:
 439                 data = db[url + "|" + item["link"]]
 440                 data = cgi.parse_qs(data)
 441                 newdata = urllib.urlencode(( \
 442                     ("message-id", messageid), \
 443                     ("created", data["created"][0]), \
 444                     ("contentmd5", data["contentmd5"][0]) \
 445                     ))
 446                 db[url + "|" + item["link"]] = newdata
 447             except:
 448                 db[url + "|" + item["link"]] = data
 449         else:
 450             data = urllib.urlencode(( \
 451                 ("message-id", messageid), \
 452                 ("created", createddate), \
 453                 ("contentmd5", md5sum) \
 454                 ))
 455             db[url + "|" + item["link"]] = data
 456
 457     if headers:
 458         data = []
 459         for header in headers:
 460             if header[0] in ["content-md5", "etag", "last-modified", "content-length"]:
 461                 data.append((header[0], header[1]))
 462         if len(data) > 0:
 463             data = urllib.urlencode(data)
 464             feeddb[url] = data
 465
 466     db.close()
 467     feeddb.close()
 468
 469 if __name__ == "__main__":
 470     # This only gets executed if we really called the program
 471     # first off, parse the command line arguments
 472
 473     oparser = OptionParser()
 474     oparser.add_option(
 475         "-c", "--conf", dest="conf",
 476         help="location of config file"
 477         )
 478     oparser.add_option(
 479         "-s", "--statedir", dest="statedir",
 480         help="location of directory to store state in"
 481         )
 482
 483     (options, args) = oparser.parse_args()
 484
 485     # check for the configfile
 486
 487     configfile = None
 488
 489     if options.conf != None:
 490         # does the file exist?
 491         try:
 492             os.stat(options.conf)
 493             configfile = options.conf
 494         except:
 495             # should exit here as the specified file doesn't exist
 496             sys.stderr.write( \
 497                 "Config file %s does not exist. Exiting.\n" %(options.conf,))
 498             sys.exit(2)
 499     else:
 500         # check through the default locations
 501         try:
 502             os.stat("%s/.rss2maildir.conf" %(os.environ["HOME"],))
 503             configfile = "%s/.rss2maildir.conf" %(os.environ["HOME"],)
 504         except:
 505             try:
 506                 os.stat("/etc/rss2maildir.conf")
 507                 configfile = "/etc/rss2maildir.conf"
 508             except:
 509                 sys.stderr.write("No config file found. Exiting.\n")
 510                 sys.exit(2)
 511
 512     # Right - if we've got this far, we've got a config file, now for the hard
 513     # bits...
 514
 515     scp = SafeConfigParser()
 516     scp.read(configfile)
 517
 518     maildir_root = "RSSMaildir"
 519     state_dir = "state"
 520
 521     if options.statedir != None:
 522         state_dir = options.statedir
 523         try:
 524             mode = os.stat(state_dir)[stat.ST_MODE]
 525             if not stat.S_ISDIR(mode):
 526                 sys.stderr.write( \
 527                     "State directory (%s) is not a directory\n" %(state_dir))
 528                 sys.exit(1)
 529         except:
 530             # try to make the directory
 531             try:
 532                 os.mkdir(state_dir)
 533             except:
 534                 sys.stderr.write("Couldn't create statedir %s" %(state_dir))
 535                 sys.exit(1)
 536     elif scp.has_option("general", "state_dir"):
 537         new_state_dir = scp.get("general", "state_dir")
 538         try:
 539             mode = os.stat(state_dir)[stat.ST_MODE]
 540             if not stat.S_ISDIR(mode):
 541                 sys.stderr.write( \
 542                     "State directory (%s) is not a directory\n" %(state_dir))
 543                 sys.exit(1)
 544         except:
 545             # try to create it
 546             try:
 547                 os.mkdir(new_state_dir)
 548                 state_dir = new_state_dir
 549             except:
 550                 sys.stderr.write( \
 551                     "Couldn't create state directory %s\n" %(new_state_dir))
 552                 sys.exit(1)
 553     else:
 554         try:
 555             mode = os.stat(state_dir)[stat.ST_MODE]
 556             if not stat.S_ISDIR(mode):
 557                 sys.stderr.write( \
 558                     "State directory %s is not a directory\n" %(state_dir))
 559                 sys.exit(1)
 560         except:
 561             try:
 562                 os.mkdir(state_dir)
 563             except:
 564                 sys.stderr.write( \
 565                     "State directory %s could not be created\n" %(state_dir))
 566                 sys.exit(1)
 567
 568     if scp.has_option("general", "maildir_root"):
 569         maildir_root = scp.get("general", "maildir_root")
 570
 571     try:
 572         mode = os.stat(maildir_root)[stat.ST_MODE]
 573         if not stat.S_ISDIR(mode):
 574             sys.stderr.write( \
 575                 "Maildir Root %s is not a directory\n" \
 576                 %(maildir_root))
 577             sys.exit(1)
 578     except:
 579         try:
 580             os.mkdir(maildir_root)
 581         except:
 582             sys.stderr.write("Couldn't create Maildir Root %s\n" \
 583                 %(maildir_root))
 584             sys.exit(1)
 585
 586     feeds = scp.sections()
 587     try:
 588         feeds.remove("general")
 589     except:
 590         pass
 591
 592     for section in feeds:
 593         # check if the directory exists
 594         maildir = None
 595         try:
 596             maildir = scp.get(section, "maildir")
 597         except:
 598             maildir = section
 599
 600         maildir = urllib.urlencode(((section, maildir),)).split("=")[1]
 601         maildir = os.path.join(maildir_root, maildir)
 602
 603         try:
 604             exists = os.stat(maildir)
 605             if stat.S_ISDIR(exists[stat.ST_MODE]):
 606                 # check if there's a new, cur and tmp directory
 607                 try:
 608                     mode = os.stat(os.path.join(maildir, "cur"))[stat.ST_MODE]
 609                 except:
 610                     os.mkdir(os.path.join(maildir, "cur"))
 611                     if not stat.S_ISDIR(mode):
 612                         sys.stderr.write("Broken maildir: %s\n" %(maildir))
 613                 try:
 614                     mode = os.stat(os.path.join(maildir, "tmp"))[stat.ST_MODE]
 615                 except:
 616                     os.mkdir(os.path.join(maildir, "tmp"))
 617                     if not stat.S_ISDIR(mode):
 618                         sys.stderr.write("Broken maildir: %s\n" %(maildir))
 619                 try:
 620                     mode = os.stat(os.path.join(maildir, "new"))[stat.ST_MODE]
 621                     if not stat.S_ISDIR(mode):
 622                         sys.stderr.write("Broken maildir: %s\n" %(maildir))
 623                 except:
 624                     os.mkdir(os.path.join(maildir, "new"))
 625             else:
 626                 sys.stderr.write("Broken maildir: %s\n" %(maildir))
 627         except:
 628             try:
 629                 os.mkdir(maildir)
 630             except:
 631                 sys.stderr.write("Couldn't create root maildir %s\n" \
 632                     %(maildir))
 633                 sys.exit(1)
 634             try:
 635                 os.mkdir(os.path.join(maildir, "new"))
 636                 os.mkdir(os.path.join(maildir, "cur"))
 637                 os.mkdir(os.path.join(maildir, "tmp"))
 638             except:
 639                 sys.stderr.write( \
 640                     "Couldn't create required maildir directories for %s\n" \
 641                     %(section,))
 642                 sys.exit(1)
 643
 644         # right - we've got the directories, we've got the section, we know the
 645         # url... lets play!
 646
 647         parse_and_deliver(maildir, section, state_dir)