rss2maildir.py

   1 #!/usr/bin/python
   2 # coding=utf-8
   3
   4 # rss2maildir.py - RSS feeds to Maildir 1 email per item
   5 # Copyright (C) 2007  Brett Parker <iDunno@sommitrealweird.co.uk>
   6 #
   7 # This program is free software: you can redistribute it and/or modify
   8 # it under the terms of the GNU General Public License as published by
   9 # the Free Software Foundation, either version 3 of the License, or
  10 # (at your option) any later version.
  11 #
  12 # This program is distributed in the hope that it will be useful,
  13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15 # GNU General Public License for more details.
  16 #
  17 # You should have received a copy of the GNU General Public License
  18 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
  19
  20 import sys
  21 import os
  22 import stat
  23 import httplib
  24 import urllib
  25
  26 import feedparser
  27
  28 from email.MIMEMultipart import MIMEMultipart
  29 from email.MIMEText import MIMEText
  30
  31 import datetime
  32 import random
  33 import string
  34 import textwrap
  35
  36 import socket
  37
  38 from optparse import OptionParser
  39 from ConfigParser import SafeConfigParser
  40
  41 from base64 import b64encode
  42 import md5
  43
  44 import cgi
  45 import dbm
  46
  47 from HTMLParser import HTMLParser
  48
  49 class HTML2Text(HTMLParser):
  50     entities = {
  51         "amp": "&",
  52         "lt": "<",
  53         "gt": ">",
  54         "pound": "£",
  55         "copy": "©",
  56         "apos": "'",
  57         "quot": "\"",
  58         "nbsp": " ",
  59         }
  60
  61     def __init__(self):
  62         self.inheadingone = False
  63         self.inheadingtwo = False
  64         self.inotherheading = False
  65         self.inparagraph = True
  66         self.inblockquote = False
  67         self.inlink = False
  68         self.text = u''
  69         self.currentparagraph = u''
  70         self.headingtext = u''
  71         self.blockquote = u''
  72         self.inpre = False
  73         self.inul = False
  74         self.initem = False
  75         self.item = u''
  76         HTMLParser.__init__(self)
  77
  78     def handle_starttag(self, tag, attrs):
  79         if tag.lower() == "h1":
  80             self.inheadingone = True
  81             self.inparagraph = False
  82         elif tag.lower() == "h2":
  83             self.inheadingtwo = True
  84             self.inparagraph = False
  85         elif tag.lower() in ["h3", "h4", "h5", "h6"]:
  86             self.inotherheading = True
  87             self.inparagraph = False
  88         elif tag.lower() == "a":
  89             self.inlink = True
  90         elif tag.lower() == "br":
  91             self.handle_br()
  92         elif tag.lower() == "blockquote":
  93             self.inblockquote = True
  94             self.text = self.text + u'\n'
  95         elif tag.lower() == "p":
  96             if self.text != "":
  97                 self.text = self.text + u'\n\n'
  98             if self.inparagraph:
  99                 self.text = self.text \
 100                     + u'\n'.join(textwrap.wrap(self.currentparagraph, 70))
 101             self.currentparagraph = u''
 102             self.inparagraph = True
 103         elif tag.lower() == "pre":
 104             self.text = self.text + "\n"
 105             self.inpre = True
 106             self.inparagraph = False
 107             self.inblockquote = False
 108         elif tag.lower() == "ul":
 109             self.item = u''
 110             self.inul = True
 111             self.text = self.text + "\n"
 112         elif tag.lower() == "li":
 113             if not self.initem:
 114                 self.initem = True
 115                 self.item = u''
 116             else:
 117                 self.text = self.text \
 118                     + u' * ' \
 119                     + u'\n   '.join([a.strip() for a in \
 120                         textwrap.wrap(self.item, 67)]) \
 121                     + u'\n'
 122                 self.item = u''
 123                 self.initem = True
 124
 125     def handle_startendtag(self, tag, attrs):
 126         if tag.lower() == "br":
 127             self.handle_br()
 128
 129     def handle_br(self):
 130             if self.inparagraph:
 131                 self.text = self.text \
 132                 + u'\n'.join( \
 133                     [a \
 134                         for a in textwrap.wrap( \
 135                             self.currentparagraph, 70) \
 136                     ] \
 137                 ) \
 138                 + u'\n'
 139                 self.currentparagraph = u''
 140             elif self.inblockquote:
 141                 self.text = self.text \
 142                     + u'\n> ' \
 143                     + u'\n> '.join( \
 144                         [a \
 145                             for a in textwrap.wrap( \
 146                                 self.blockquote.encode("utf-8") \
 147                                 , 68) \
 148                         ] \
 149                     ) \
 150                     + u'\n'
 151                 self.blockquote = u''
 152             else:
 153                 self.text = self.text + "\n"
 154
 155     def handle_endtag(self, tag):
 156         if tag.lower() == "h1":
 157             self.inheadingone = False
 158             self.text = self.text \
 159                 + u'\n\n' \
 160                 + self.headingtext.encode("utf-8") \
 161                 + u'\n' \
 162                 + u'=' * len(self.headingtext.encode("utf-8").strip())
 163             self.headingtext = u''
 164         elif tag.lower() == "h2":
 165             self.inheadingtwo = False
 166             self.text = self.text \
 167                 + u'\n\n' \
 168                 + self.headingtext.encode("utf-8") \
 169                 + u'\n' \
 170                 + u'-' * len(self.headingtext.encode("utf-8").strip())
 171             self.headingtext = u''
 172         elif tag.lower() in ["h3", "h4", "h5", "h6"]:
 173             self.inotherheading = False
 174             self.text = self.text \
 175                 + u'\n\n' \
 176                 + self.headingtext.encode("utf-8") \
 177                 + u'\n' \
 178                 + u'~' * len(self.headingtext.encode("utf-8").strip())
 179             self.headingtext = u''
 180         elif tag.lower() == "p":
 181             self.text = self.text \
 182                 + u'\n'.join(textwrap.wrap( \
 183                     self.currentparagraph, 70) \
 184                 )
 185             self.inparagraph = False
 186             self.currentparagraph = u''
 187         elif tag.lower() == "blockquote":
 188             self.text = self.text \
 189                 + u'\n> ' \
 190                 + u'\n> '.join( \
 191                     [a.strip() \
 192                         for a in textwrap.wrap( \
 193                             self.blockquote, 68)] \
 194                     ) \
 195                 + u'\n'
 196             self.inblockquote = False
 197             self.blockquote = u''
 198         elif tag.lower() == "pre":
 199             self.inpre = False
 200         elif tag.lower() == "li":
 201             self.initem = False
 202             if self.item != u'':
 203                 self.text = self.text \
 204                     + u' * ' \
 205                     + u'\n   '.join( \
 206                         [a.strip() for a in textwrap.wrap(self.item, 67)]) \
 207                     + u'\n'
 208             self.item = u''
 209         elif tag.lower() == "ul":
 210             self.inul = False
 211
 212     def handle_data(self, data):
 213         if self.inheadingone or self.inheadingtwo or self.inotherheading:
 214             self.headingtext = self.headingtext \
 215                 + unicode(data, "utf-8").strip() \
 216                 + u' '
 217         elif self.inblockquote:
 218             self.blockquote = self.blockquote \
 219                 + unicode(data, "utf-8").strip() \
 220                 + u' '
 221         elif self.initem:
 222             self.item = self.item + unicode(data, "utf-8")
 223         elif self.inparagraph:
 224             self.currentparagraph = self.currentparagraph \
 225                 + unicode(data, "utf-8").strip() \
 226                 + u' '
 227         elif self.inpre:
 228             self.text = self.text + unicode(data, "utf-8")
 229         else:
 230             isallwhitespace = data.strip()
 231             if isallwhitespace != "" and self.text[-1] == "\n":
 232                 self.text = self.text + unicode(data, "utf-8").strip() + u' '
 233
 234     def handle_entityref(self, name):
 235         entity = name
 236         if HTML2Text.entities.has_key(name.lower()):
 237             entity = HTML2Text.entities[name.lower()]
 238         elif name[0] == "#":
 239             entity = unichr(int(name[1:]))
 240         else:
 241             entity = "&" + name + ";"
 242
 243         if self.inparagraph:
 244             self.currentparagraph = self.currentparagraph \
 245                 + unicode(entity, "utf-8")
 246         elif self.inblockquote:
 247             self.blockquote = self.blockquote + unicode(entity, "utf-8")
 248         else:
 249             self.text = self.text + unicode(entity, "utf-8")
 250
 251     def gettext(self):
 252         data = self.text
 253         if self.inparagraph:
 254             data = data + "\n".join(textwrap.wrap(self.currentparagraph, 70))
 255         if data[-1] != '\n':
 256             data = data + '\n'
 257         return data
 258
 259 def open_url(method, url):
 260     redirectcount = 0
 261     while redirectcount < 3:
 262         (type, rest) = urllib.splittype(url)
 263         (host, path) = urllib.splithost(rest)
 264         (host, port) = urllib.splitport(host)
 265         if port == None:
 266             port = 80
 267         try:
 268             conn = httplib.HTTPConnection("%s:%s" %(host, port))
 269             conn.request(method, path)
 270             response = conn.getresponse()
 271             if response.status in [301, 302, 303, 307]:
 272                 headers = response.getheaders()
 273                 for header in headers:
 274                     if header[0] == "location":
 275                         url = header[1]
 276             elif response.status == 200:
 277                 return response
 278         except:
 279             pass
 280         redirectcount = redirectcount + 1
 281     return None
 282
 283 def parse_and_deliver(maildir, url, statedir):
 284     feedhandle = None
 285     headers = None
 286     # first check if we know about this feed already
 287     feeddb = dbm.open(os.path.join(statedir, "feeds"), "c")
 288     if feeddb.has_key(url):
 289         data = feeddb[url]
 290         data = cgi.parse_qs(data)
 291         response = open_url("HEAD", url)
 292         headers = None
 293         if response:
 294             headers = response.getheaders()
 295         ischanged = False
 296         try:
 297             for header in headers:
 298                 if header[0] == "content-length":
 299                     if header[1] != data["content-length"][0]:
 300                         ischanged = True
 301                 elif header[0] == "etag":
 302                     if header[1] != data["etag"][0]:
 303                         ischanged = True
 304                 elif header[0] == "last-modified":
 305                     if header[1] != data["last-modified"][0]:
 306                         ischanged = True
 307                 elif header[0] == "content-md5":
 308                     if header[1] != data["content-md5"][0]:
 309                         ischanged = True
 310         except:
 311             ischanged = True
 312         if ischanged:
 313             response = open_url("GET", url)
 314             if response != None:
 315                 headers = response.getheaders()
 316                 feedhandle = response
 317             else:
 318                 sys.stderr.write("Failed to fetch feed: %s\n" %(url))
 319                 return
 320         else:
 321             return # don't need to do anything, nothings changed.
 322     else:
 323         response = open_url("GET", url)
 324         if response != None:
 325             headers = response.getheaders()
 326             feedhandle = response
 327         else:
 328             sys.stderr.write("Failed to fetch feed: %s\n" %(url))
 329             return
 330
 331     fp = feedparser.parse(feedhandle)
 332     db = dbm.open(os.path.join(statedir, "seen"), "c")
 333     for item in fp["items"]:
 334         # have we seen it before?
 335         # need to work out what the content is first...
 336
 337         if item.has_key("content"):
 338             content = item["content"][0]["value"]
 339         else:
 340             content = item["summary"]
 341
 342         md5sum = md5.md5(content.encode("utf-8")).hexdigest()
 343
 344         prevmessageid = None
 345
 346         # check if there's a guid too - if that exists and we match the md5,
 347         # return
 348         if item.has_key("guid"):
 349             if db.has_key(url + "|" + item["guid"]):
 350                 data = db[url + "|" + item["guid"]]
 351                 data = cgi.parse_qs(data)
 352                 if data["contentmd5"][0] == md5sum:
 353                     continue
 354
 355         if db.has_key(url + "|" + item["link"]):
 356             data = db[url + "|" + item["link"]]
 357             data = cgi.parse_qs(data)
 358             if data.has_key("message-id"):
 359                 prevmessageid = data["message-id"][0]
 360             if data["contentmd5"][0] == md5sum:
 361                 continue
 362
 363         try:
 364             author = item["author"]
 365         except:
 366             author = url
 367
 368         # create a basic email message
 369         msg = MIMEMultipart("alternative")
 370         messageid = "<" \
 371             + datetime.datetime.now().strftime("%Y%m%d%H%M") \
 372             + "." \
 373             + "".join( \
 374                 [random.choice( \
 375                     string.ascii_letters + string.digits \
 376                     ) for a in range(0,6) \
 377                 ]) + "@" + socket.gethostname() + ">"
 378         msg.add_header("Message-ID", messageid)
 379         msg.set_unixfrom("\"%s\" <rss2maildir@localhost>" %(url))
 380         msg.add_header("From", "\"%s\" <rss2maildir@localhost>" %(author))
 381         msg.add_header("To", "\"%s\" <rss2maildir@localhost>" %(url))
 382         if prevmessageid:
 383             msg.add_header("References", prevmessageid)
 384         createddate = datetime.datetime.now() \
 385             .strftime("%a, %e %b %Y %T -0000")
 386         try:
 387             createddate = datetime.datetime(*item["updated_parsed"][0:6]) \
 388                 .strftime("%a, %e %b %Y %T -0000")
 389         except:
 390             pass
 391         msg.add_header("Date", createddate)
 392         msg.add_header("Subject", item["title"])
 393         msg.set_default_type("text/plain")
 394
 395         htmlcontent = content.encode("utf-8")
 396         htmlcontent = "%s\n\n<p>Item URL: <a href='%s'>%s</a></p>" %( \
 397             content, \
 398             item["link"], \
 399             item["link"] )
 400         htmlpart = MIMEText(htmlcontent.encode("utf-8"), "html", "utf-8")
 401         textparser = HTML2Text()
 402         textparser.feed(content.encode("utf-8"))
 403         textcontent = textparser.gettext()
 404         textcontent = "%s\n\nItem URL: %s" %( \
 405             textcontent, \
 406             item["link"] )
 407         textpart = MIMEText(textcontent.encode("utf-8"), "plain", "utf-8")
 408         msg.attach(textpart)
 409         msg.attach(htmlpart)
 410
 411         # start by working out the filename we should be writting to, we do
 412         # this following the normal maildir style rules
 413         fname = str(os.getpid()) \
 414             + "." + socket.gethostname() \
 415             + "." + "".join( \
 416                 [random.choice( \
 417                     string.ascii_letters + string.digits \
 418                     ) for a in range(0,10) \
 419                 ]) + "." \
 420             + datetime.datetime.now().strftime('%s')
 421         fn = os.path.join(maildir, "tmp", fname)
 422         fh = open(fn, "w")
 423         fh.write(msg.as_string())
 424         fh.close()
 425         # now move it in to the new directory
 426         newfn = os.path.join(maildir, "new", fname)
 427         os.link(fn, newfn)
 428         os.unlink(fn)
 429
 430         # now add to the database about the item
 431         if prevmessageid:
 432             messageid = prevmessageid + " " + messageid
 433         if item.has_key("guid") and item["guid"] != item["link"]:
 434             data = urllib.urlencode(( \
 435                 ("message-id", messageid), \
 436                 ("created", createddate), \
 437                 ("contentmd5", md5sum) \
 438                 ))
 439             db[url + "|" + item["guid"]] = data
 440             try:
 441                 data = db[url + "|" + item["link"]]
 442                 data = cgi.parse_qs(data)
 443                 newdata = urllib.urlencode(( \
 444                     ("message-id", messageid), \
 445                     ("created", data["created"][0]), \
 446                     ("contentmd5", data["contentmd5"][0]) \
 447                     ))
 448                 db[url + "|" + item["link"]] = newdata
 449             except:
 450                 db[url + "|" + item["link"]] = data
 451         else:
 452             data = urllib.urlencode(( \
 453                 ("message-id", messageid), \
 454                 ("created", createddate), \
 455                 ("contentmd5", md5sum) \
 456                 ))
 457             db[url + "|" + item["link"]] = data
 458
 459     if headers:
 460         data = []
 461         for header in headers:
 462             if header[0] in ["content-md5", "etag", "last-modified", "content-length"]:
 463                 data.append((header[0], header[1]))
 464         if len(data) > 0:
 465             data = urllib.urlencode(data)
 466             feeddb[url] = data
 467
 468     db.close()
 469     feeddb.close()
 470
 471 if __name__ == "__main__":
 472     # This only gets executed if we really called the program
 473     # first off, parse the command line arguments
 474
 475     oparser = OptionParser()
 476     oparser.add_option(
 477         "-c", "--conf", dest="conf",
 478         help="location of config file"
 479         )
 480     oparser.add_option(
 481         "-s", "--statedir", dest="statedir",
 482         help="location of directory to store state in"
 483         )
 484
 485     (options, args) = oparser.parse_args()
 486
 487     # check for the configfile
 488
 489     configfile = None
 490
 491     if options.conf != None:
 492         # does the file exist?
 493         try:
 494             os.stat(options.conf)
 495             configfile = options.conf
 496         except:
 497             # should exit here as the specified file doesn't exist
 498             sys.stderr.write( \
 499                 "Config file %s does not exist. Exiting.\n" %(options.conf,))
 500             sys.exit(2)
 501     else:
 502         # check through the default locations
 503         try:
 504             os.stat("%s/.rss2maildir.conf" %(os.environ["HOME"],))
 505             configfile = "%s/.rss2maildir.conf" %(os.environ["HOME"],)
 506         except:
 507             try:
 508                 os.stat("/etc/rss2maildir.conf")
 509                 configfile = "/etc/rss2maildir.conf"
 510             except:
 511                 sys.stderr.write("No config file found. Exiting.\n")
 512                 sys.exit(2)
 513
 514     # Right - if we've got this far, we've got a config file, now for the hard
 515     # bits...
 516
 517     scp = SafeConfigParser()
 518     scp.read(configfile)
 519
 520     maildir_root = "RSSMaildir"
 521     state_dir = "state"
 522
 523     if options.statedir != None:
 524         state_dir = options.statedir
 525         try:
 526             mode = os.stat(state_dir)[stat.ST_MODE]
 527             if not stat.S_ISDIR(mode):
 528                 sys.stderr.write( \
 529                     "State directory (%s) is not a directory\n" %(state_dir))
 530                 sys.exit(1)
 531         except:
 532             # try to make the directory
 533             try:
 534                 os.mkdir(state_dir)
 535             except:
 536                 sys.stderr.write("Couldn't create statedir %s" %(state_dir))
 537                 sys.exit(1)
 538     elif scp.has_option("general", "state_dir"):
 539         new_state_dir = scp.get("general", "state_dir")
 540         try:
 541             mode = os.stat(state_dir)[stat.ST_MODE]
 542             if not stat.S_ISDIR(mode):
 543                 sys.stderr.write( \
 544                     "State directory (%s) is not a directory\n" %(state_dir))
 545                 sys.exit(1)
 546         except:
 547             # try to create it
 548             try:
 549                 os.mkdir(new_state_dir)
 550                 state_dir = new_state_dir
 551             except:
 552                 sys.stderr.write( \
 553                     "Couldn't create state directory %s\n" %(new_state_dir))
 554                 sys.exit(1)
 555     else:
 556         try:
 557             mode = os.stat(state_dir)[stat.ST_MODE]
 558             if not stat.S_ISDIR(mode):
 559                 sys.stderr.write( \
 560                     "State directory %s is not a directory\n" %(state_dir))
 561                 sys.exit(1)
 562         except:
 563             try:
 564                 os.mkdir(state_dir)
 565             except:
 566                 sys.stderr.write( \
 567                     "State directory %s could not be created\n" %(state_dir))
 568                 sys.exit(1)
 569
 570     if scp.has_option("general", "maildir_root"):
 571         maildir_root = scp.get("general", "maildir_root")
 572
 573     try:
 574         mode = os.stat(maildir_root)[stat.ST_MODE]
 575         if not stat.S_ISDIR(mode):
 576             sys.stderr.write( \
 577                 "Maildir Root %s is not a directory\n" \
 578                 %(maildir_root))
 579             sys.exit(1)
 580     except:
 581         try:
 582             os.mkdir(maildir_root)
 583         except:
 584             sys.stderr.write("Couldn't create Maildir Root %s\n" \
 585                 %(maildir_root))
 586             sys.exit(1)
 587
 588     feeds = scp.sections()
 589     try:
 590         feeds.remove("general")
 591     except:
 592         pass
 593
 594     for section in feeds:
 595         # check if the directory exists
 596         maildir = None
 597         try:
 598             maildir = scp.get(section, "maildir")
 599         except:
 600             maildir = section
 601
 602         maildir = urllib.urlencode(((section, maildir),)).split("=")[1]
 603         maildir = os.path.join(maildir_root, maildir)
 604
 605         try:
 606             exists = os.stat(maildir)
 607             if stat.S_ISDIR(exists[stat.ST_MODE]):
 608                 # check if there's a new, cur and tmp directory
 609                 try:
 610                     mode = os.stat(os.path.join(maildir, "cur"))[stat.ST_MODE]
 611                 except:
 612                     os.mkdir(os.path.join(maildir, "cur"))
 613                     if not stat.S_ISDIR(mode):
 614                         sys.stderr.write("Broken maildir: %s\n" %(maildir))
 615                 try:
 616                     mode = os.stat(os.path.join(maildir, "tmp"))[stat.ST_MODE]
 617                 except:
 618                     os.mkdir(os.path.join(maildir, "tmp"))
 619                     if not stat.S_ISDIR(mode):
 620                         sys.stderr.write("Broken maildir: %s\n" %(maildir))
 621                 try:
 622                     mode = os.stat(os.path.join(maildir, "new"))[stat.ST_MODE]
 623                     if not stat.S_ISDIR(mode):
 624                         sys.stderr.write("Broken maildir: %s\n" %(maildir))
 625                 except:
 626                     os.mkdir(os.path.join(maildir, "new"))
 627             else:
 628                 sys.stderr.write("Broken maildir: %s\n" %(maildir))
 629         except:
 630             try:
 631                 os.mkdir(maildir)
 632             except:
 633                 sys.stderr.write("Couldn't create root maildir %s\n" \
 634                     %(maildir))
 635                 sys.exit(1)
 636             try:
 637                 os.mkdir(os.path.join(maildir, "new"))
 638                 os.mkdir(os.path.join(maildir, "cur"))
 639                 os.mkdir(os.path.join(maildir, "tmp"))
 640             except:
 641                 sys.stderr.write( \
 642                     "Couldn't create required maildir directories for %s\n" \
 643                     %(section,))
 644                 sys.exit(1)
 645
 646         # right - we've got the directories, we've got the section, we know the
 647         # url... lets play!
 648
 649         parse_and_deliver(maildir, section, state_dir)