rss2maildir.py

   1 #!/usr/bin/python
   2 # coding=utf-8
   3
   4 # rss2maildir.py - RSS feeds to Maildir 1 email per item
   5 # Copyright (C) 2007  Brett Parker <iDunno@sommitrealweird.co.uk>
   6 #
   7 # This program is free software: you can redistribute it and/or modify
   8 # it under the terms of the GNU General Public License as published by
   9 # the Free Software Foundation, either version 3 of the License, or
  10 # (at your option) any later version.
  11 #
  12 # This program is distributed in the hope that it will be useful,
  13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15 # GNU General Public License for more details.
  16 #
  17 # You should have received a copy of the GNU General Public License
  18 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
  19
  20 import sys
  21 import os
  22 import stat
  23 import httplib
  24 import urllib
  25
  26 import feedparser
  27
  28 from email.MIMEMultipart import MIMEMultipart
  29 from email.MIMEText import MIMEText
  30
  31 import datetime
  32 import random
  33 import string
  34 import textwrap
  35
  36 import socket
  37
  38 from optparse import OptionParser
  39 from ConfigParser import SafeConfigParser
  40
  41 from base64 import b64encode
  42 import md5
  43
  44 import cgi
  45 import dbm
  46
  47 from HTMLParser import HTMLParser
  48
  49 class HTML2Text(HTMLParser):
  50     entities = {
  51         "amp": "&",
  52         "lt": "<",
  53         "gt": ">",
  54         "pound": "£",
  55         "copy": "©",
  56         "apos": "'",
  57         "quot": "\"",
  58         "nbsp": " ",
  59         }
  60
  61     def __init__(self,textwidth=70):
  62         self.inheadingone = False
  63         self.inheadingtwo = False
  64         self.inotherheading = False
  65         self.inparagraph = True
  66         self.inblockquote = False
  67         self.inlink = False
  68         self.text = u''
  69         self.currentparagraph = u''
  70         self.headingtext = u''
  71         self.blockquote = u''
  72         self.inpre = False
  73         self.inul = False
  74         self.initem = False
  75         self.item = u''
  76         self.textwidth = textwidth
  77         HTMLParser.__init__(self)
  78
  79     def handle_starttag(self, tag, attrs):
  80         if tag.lower() == "h1":
  81             self.inheadingone = True
  82             self.inparagraph = False
  83         elif tag.lower() == "h2":
  84             self.inheadingtwo = True
  85             self.inparagraph = False
  86         elif tag.lower() in ["h3", "h4", "h5", "h6"]:
  87             self.inotherheading = True
  88             self.inparagraph = False
  89         elif tag.lower() == "a":
  90             self.inlink = True
  91         elif tag.lower() == "br":
  92             self.handle_br()
  93         elif tag.lower() == "blockquote":
  94             self.inblockquote = True
  95             self.text = self.text + u'\n'
  96         elif tag.lower() == "p":
  97             if self.text != "":
  98                 self.text = self.text + u'\n\n'
  99             if self.inparagraph:
 100                 self.text = self.text \
 101                     + u'\n'.join(textwrap.wrap(self.currentparagraph, self.textwidth))
 102             self.currentparagraph = u''
 103             self.inparagraph = True
 104         elif tag.lower() == "pre":
 105             self.text = self.text + "\n"
 106             self.inpre = True
 107             self.inparagraph = False
 108             self.inblockquote = False
 109         elif tag.lower() == "ul":
 110             self.item = u''
 111             self.inul = True
 112             self.text = self.text + "\n"
 113         elif tag.lower() == "li":
 114             if not self.initem:
 115                 self.initem = True
 116                 self.item = u''
 117             else:
 118                 self.text = self.text \
 119                     + u' * ' \
 120                     + u'\n   '.join([a.strip() for a in \
 121                         textwrap.wrap(self.item, self.textwidth - 3)]) \
 122                     + u'\n'
 123                 self.item = u''
 124                 self.initem = True
 125
 126     def handle_startendtag(self, tag, attrs):
 127         if tag.lower() == "br":
 128             self.handle_br()
 129
 130     def handle_br(self):
 131             if self.inparagraph:
 132                 self.text = self.text \
 133                 + u'\n'.join( \
 134                     [a \
 135                         for a in textwrap.wrap( \
 136                             self.currentparagraph, self.textwidth) \
 137                     ] \
 138                 ) \
 139                 + u'\n'
 140                 self.currentparagraph = u''
 141             elif self.inblockquote:
 142                 self.text = self.text \
 143                     + u'\n> ' \
 144                     + u'\n> '.join( \
 145                         [a \
 146                             for a in textwrap.wrap( \
 147                                 self.blockquote.encode("utf-8") \
 148                                 , 68) \
 149                         ] \
 150                     ) \
 151                     + u'\n'
 152                 self.blockquote = u''
 153             else:
 154                 self.text = self.text + "\n"
 155
 156     def handle_endtag(self, tag):
 157         if tag.lower() == "h1":
 158             self.inheadingone = False
 159             self.text = self.text \
 160                 + u'\n\n' \
 161                 + self.headingtext.encode("utf-8") \
 162                 + u'\n' \
 163                 + u'=' * len(self.headingtext.encode("utf-8").strip())
 164             self.headingtext = u''
 165         elif tag.lower() == "h2":
 166             self.inheadingtwo = False
 167             self.text = self.text \
 168                 + u'\n\n' \
 169                 + self.headingtext.encode("utf-8") \
 170                 + u'\n' \
 171                 + u'-' * len(self.headingtext.encode("utf-8").strip())
 172             self.headingtext = u''
 173         elif tag.lower() in ["h3", "h4", "h5", "h6"]:
 174             self.inotherheading = False
 175             self.text = self.text \
 176                 + u'\n\n' \
 177                 + self.headingtext.encode("utf-8") \
 178                 + u'\n' \
 179                 + u'~' * len(self.headingtext.encode("utf-8").strip())
 180             self.headingtext = u''
 181         elif tag.lower() == "p":
 182             self.text = self.text \
 183                 + u'\n'.join(textwrap.wrap( \
 184                     self.currentparagraph, self.textwidth) \
 185                 )
 186             self.inparagraph = False
 187             self.currentparagraph = u''
 188         elif tag.lower() == "blockquote":
 189             self.text = self.text \
 190                 + u'\n> ' \
 191                 + u'\n> '.join( \
 192                     [a.strip() \
 193                         for a in textwrap.wrap( \
 194                             self.blockquote, self.textwidth - 2)] \
 195                     ) \
 196                 + u'\n'
 197             self.inblockquote = False
 198             self.blockquote = u''
 199         elif tag.lower() == "pre":
 200             self.inpre = False
 201         elif tag.lower() == "li":
 202             self.initem = False
 203             if self.item != u'':
 204                 self.text = self.text \
 205                     + u' * ' \
 206                     + u'\n   '.join( \
 207                         [a.strip() for a in textwrap.wrap(self.item, self.textwidth - 3)]) \
 208                     + u'\n'
 209             self.item = u''
 210         elif tag.lower() == "ul":
 211             self.inul = False
 212
 213     def handle_data(self, data):
 214         if self.inheadingone or self.inheadingtwo or self.inotherheading:
 215             self.headingtext = self.headingtext \
 216                 + unicode(data, "utf-8").strip() \
 217                 + u' '
 218         elif self.inblockquote:
 219             self.blockquote = self.blockquote \
 220                 + unicode(data, "utf-8").strip() \
 221                 + u' '
 222         elif self.initem:
 223             self.item = self.item + unicode(data, "utf-8")
 224         elif self.inparagraph:
 225             self.currentparagraph = self.currentparagraph \
 226                 + unicode(data, "utf-8").strip() \
 227                 + u' '
 228         elif self.inpre:
 229             self.text = self.text + unicode(data, "utf-8")
 230         else:
 231             isallwhitespace = data.strip() == ""
 232             if not isallwhitespace:
 233                 self.text = self.text + unicode(data, "utf-8").strip() + u' '
 234
 235     def handle_entityref(self, name):
 236         entity = name
 237         if HTML2Text.entities.has_key(name.lower()):
 238             entity = HTML2Text.entities[name.lower()]
 239         elif name[0] == "#":
 240             entity = unichr(int(name[1:]))
 241         else:
 242             entity = "&" + name + ";"
 243
 244         if self.inparagraph:
 245             self.currentparagraph = self.currentparagraph \
 246                 + unicode(entity, "utf-8")
 247         elif self.inblockquote:
 248             self.blockquote = self.blockquote + unicode(entity, "utf-8")
 249         else:
 250             self.text = self.text + unicode(entity, "utf-8")
 251
 252     def gettext(self):
 253         data = self.text
 254         if self.inparagraph:
 255             data = data + "\n".join(textwrap.wrap(self.currentparagraph, self.textwidth))
 256         if data[-1] != '\n':
 257             data = data + '\n'
 258         return data
 259
 260 def open_url(method, url):
 261     redirectcount = 0
 262     while redirectcount < 3:
 263         (type, rest) = urllib.splittype(url)
 264         (host, path) = urllib.splithost(rest)
 265         (host, port) = urllib.splitport(host)
 266         if port == None:
 267             port = 80
 268         try:
 269             conn = httplib.HTTPConnection("%s:%s" %(host, port))
 270             conn.request(method, path)
 271             response = conn.getresponse()
 272             if response.status in [301, 302, 303, 307]:
 273                 headers = response.getheaders()
 274                 for header in headers:
 275                     if header[0] == "location":
 276                         url = header[1]
 277             elif response.status == 200:
 278                 return response
 279         except:
 280             pass
 281         redirectcount = redirectcount + 1
 282     return None
 283
 284 def parse_and_deliver(maildir, url, statedir):
 285     feedhandle = None
 286     headers = None
 287     # first check if we know about this feed already
 288     feeddb = dbm.open(os.path.join(statedir, "feeds"), "c")
 289     if feeddb.has_key(url):
 290         data = feeddb[url]
 291         data = cgi.parse_qs(data)
 292         response = open_url("HEAD", url)
 293         headers = None
 294         if response:
 295             headers = response.getheaders()
 296         ischanged = False
 297         try:
 298             for header in headers:
 299                 if header[0] == "content-length":
 300                     if header[1] != data["content-length"][0]:
 301                         ischanged = True
 302                 elif header[0] == "etag":
 303                     if header[1] != data["etag"][0]:
 304                         ischanged = True
 305                 elif header[0] == "last-modified":
 306                     if header[1] != data["last-modified"][0]:
 307                         ischanged = True
 308                 elif header[0] == "content-md5":
 309                     if header[1] != data["content-md5"][0]:
 310                         ischanged = True
 311         except:
 312             ischanged = True
 313         if ischanged:
 314             response = open_url("GET", url)
 315             if response != None:
 316                 headers = response.getheaders()
 317                 feedhandle = response
 318             else:
 319                 sys.stderr.write("Failed to fetch feed: %s\n" %(url))
 320                 return
 321         else:
 322             return # don't need to do anything, nothings changed.
 323     else:
 324         response = open_url("GET", url)
 325         if response != None:
 326             headers = response.getheaders()
 327             feedhandle = response
 328         else:
 329             sys.stderr.write("Failed to fetch feed: %s\n" %(url))
 330             return
 331
 332     fp = feedparser.parse(feedhandle)
 333     db = dbm.open(os.path.join(statedir, "seen"), "c")
 334     for item in fp["items"]:
 335         # have we seen it before?
 336         # need to work out what the content is first...
 337
 338         if item.has_key("content"):
 339             content = item["content"][0]["value"]
 340         else:
 341             content = item["summary"]
 342
 343         md5sum = md5.md5(content.encode("utf-8")).hexdigest()
 344
 345         prevmessageid = None
 346
 347         # check if there's a guid too - if that exists and we match the md5,
 348         # return
 349         if item.has_key("guid"):
 350             if db.has_key(url + "|" + item["guid"]):
 351                 data = db[url + "|" + item["guid"]]
 352                 data = cgi.parse_qs(data)
 353                 if data["contentmd5"][0] == md5sum:
 354                     continue
 355
 356         if db.has_key(url + "|" + item["link"]):
 357             data = db[url + "|" + item["link"]]
 358             data = cgi.parse_qs(data)
 359             if data.has_key("message-id"):
 360                 prevmessageid = data["message-id"][0]
 361             if data["contentmd5"][0] == md5sum:
 362                 continue
 363
 364         try:
 365             author = item["author"]
 366         except:
 367             author = url
 368
 369         # create a basic email message
 370         msg = MIMEMultipart("alternative")
 371         messageid = "<" \
 372             + datetime.datetime.now().strftime("%Y%m%d%H%M") \
 373             + "." \
 374             + "".join( \
 375                 [random.choice( \
 376                     string.ascii_letters + string.digits \
 377                     ) for a in range(0,6) \
 378                 ]) + "@" + socket.gethostname() + ">"
 379         msg.add_header("Message-ID", messageid)
 380         msg.set_unixfrom("\"%s\" <rss2maildir@localhost>" %(url))
 381         msg.add_header("From", "\"%s\" <rss2maildir@localhost>" %(author))
 382         msg.add_header("To", "\"%s\" <rss2maildir@localhost>" %(url))
 383         if prevmessageid:
 384             msg.add_header("References", prevmessageid)
 385         createddate = datetime.datetime.now() \
 386             .strftime("%a, %e %b %Y %T -0000")
 387         try:
 388             createddate = datetime.datetime(*item["updated_parsed"][0:6]) \
 389                 .strftime("%a, %e %b %Y %T -0000")
 390         except:
 391             pass
 392         msg.add_header("Date", createddate)
 393         msg.add_header("Subject", item["title"])
 394         msg.set_default_type("text/plain")
 395
 396         htmlcontent = content.encode("utf-8")
 397         htmlcontent = "%s\n\n<p>Item URL: <a href='%s'>%s</a></p>" %( \
 398             content, \
 399             item["link"], \
 400             item["link"] )
 401         htmlpart = MIMEText(htmlcontent.encode("utf-8"), "html", "utf-8")
 402         textparser = HTML2Text()
 403         textparser.feed(content.encode("utf-8"))
 404         textcontent = textparser.gettext()
 405         textcontent = "%s\n\nItem URL: %s" %( \
 406             textcontent, \
 407             item["link"] )
 408         textpart = MIMEText(textcontent.encode("utf-8"), "plain", "utf-8")
 409         msg.attach(textpart)
 410         msg.attach(htmlpart)
 411
 412         # start by working out the filename we should be writting to, we do
 413         # this following the normal maildir style rules
 414         fname = str(os.getpid()) \
 415             + "." + socket.gethostname() \
 416             + "." + "".join( \
 417                 [random.choice( \
 418                     string.ascii_letters + string.digits \
 419                     ) for a in range(0,10) \
 420                 ]) + "." \
 421             + datetime.datetime.now().strftime('%s')
 422         fn = os.path.join(maildir, "tmp", fname)
 423         fh = open(fn, "w")
 424         fh.write(msg.as_string())
 425         fh.close()
 426         # now move it in to the new directory
 427         newfn = os.path.join(maildir, "new", fname)
 428         os.link(fn, newfn)
 429         os.unlink(fn)
 430
 431         # now add to the database about the item
 432         if prevmessageid:
 433             messageid = prevmessageid + " " + messageid
 434         if item.has_key("guid") and item["guid"] != item["link"]:
 435             data = urllib.urlencode(( \
 436                 ("message-id", messageid), \
 437                 ("created", createddate), \
 438                 ("contentmd5", md5sum) \
 439                 ))
 440             db[url + "|" + item["guid"]] = data
 441             try:
 442                 data = db[url + "|" + item["link"]]
 443                 data = cgi.parse_qs(data)
 444                 newdata = urllib.urlencode(( \
 445                     ("message-id", messageid), \
 446                     ("created", data["created"][0]), \
 447                     ("contentmd5", data["contentmd5"][0]) \
 448                     ))
 449                 db[url + "|" + item["link"]] = newdata
 450             except:
 451                 db[url + "|" + item["link"]] = data
 452         else:
 453             data = urllib.urlencode(( \
 454                 ("message-id", messageid), \
 455                 ("created", createddate), \
 456                 ("contentmd5", md5sum) \
 457                 ))
 458             db[url + "|" + item["link"]] = data
 459
 460     if headers:
 461         data = []
 462         for header in headers:
 463             if header[0] in ["content-md5", "etag", "last-modified", "content-length"]:
 464                 data.append((header[0], header[1]))
 465         if len(data) > 0:
 466             data = urllib.urlencode(data)
 467             feeddb[url] = data
 468
 469     db.close()
 470     feeddb.close()
 471
 472 if __name__ == "__main__":
 473     # This only gets executed if we really called the program
 474     # first off, parse the command line arguments
 475
 476     oparser = OptionParser()
 477     oparser.add_option(
 478         "-c", "--conf", dest="conf",
 479         help="location of config file"
 480         )
 481     oparser.add_option(
 482         "-s", "--statedir", dest="statedir",
 483         help="location of directory to store state in"
 484         )
 485
 486     (options, args) = oparser.parse_args()
 487
 488     # check for the configfile
 489
 490     configfile = None
 491
 492     if options.conf != None:
 493         # does the file exist?
 494         try:
 495             os.stat(options.conf)
 496             configfile = options.conf
 497         except:
 498             # should exit here as the specified file doesn't exist
 499             sys.stderr.write( \
 500                 "Config file %s does not exist. Exiting.\n" %(options.conf,))
 501             sys.exit(2)
 502     else:
 503         # check through the default locations
 504         try:
 505             os.stat("%s/.rss2maildir.conf" %(os.environ["HOME"],))
 506             configfile = "%s/.rss2maildir.conf" %(os.environ["HOME"],)
 507         except:
 508             try:
 509                 os.stat("/etc/rss2maildir.conf")
 510                 configfile = "/etc/rss2maildir.conf"
 511             except:
 512                 sys.stderr.write("No config file found. Exiting.\n")
 513                 sys.exit(2)
 514
 515     # Right - if we've got this far, we've got a config file, now for the hard
 516     # bits...
 517
 518     scp = SafeConfigParser()
 519     scp.read(configfile)
 520
 521     maildir_root = "RSSMaildir"
 522     state_dir = "state"
 523
 524     if options.statedir != None:
 525         state_dir = options.statedir
 526         try:
 527             mode = os.stat(state_dir)[stat.ST_MODE]
 528             if not stat.S_ISDIR(mode):
 529                 sys.stderr.write( \
 530                     "State directory (%s) is not a directory\n" %(state_dir))
 531                 sys.exit(1)
 532         except:
 533             # try to make the directory
 534             try:
 535                 os.mkdir(state_dir)
 536             except:
 537                 sys.stderr.write("Couldn't create statedir %s" %(state_dir))
 538                 sys.exit(1)
 539     elif scp.has_option("general", "state_dir"):
 540         new_state_dir = scp.get("general", "state_dir")
 541         try:
 542             mode = os.stat(state_dir)[stat.ST_MODE]
 543             if not stat.S_ISDIR(mode):
 544                 sys.stderr.write( \
 545                     "State directory (%s) is not a directory\n" %(state_dir))
 546                 sys.exit(1)
 547         except:
 548             # try to create it
 549             try:
 550                 os.mkdir(new_state_dir)
 551                 state_dir = new_state_dir
 552             except:
 553                 sys.stderr.write( \
 554                     "Couldn't create state directory %s\n" %(new_state_dir))
 555                 sys.exit(1)
 556     else:
 557         try:
 558             mode = os.stat(state_dir)[stat.ST_MODE]
 559             if not stat.S_ISDIR(mode):
 560                 sys.stderr.write( \
 561                     "State directory %s is not a directory\n" %(state_dir))
 562                 sys.exit(1)
 563         except:
 564             try:
 565                 os.mkdir(state_dir)
 566             except:
 567                 sys.stderr.write( \
 568                     "State directory %s could not be created\n" %(state_dir))
 569                 sys.exit(1)
 570
 571     if scp.has_option("general", "maildir_root"):
 572         maildir_root = scp.get("general", "maildir_root")
 573
 574     try:
 575         mode = os.stat(maildir_root)[stat.ST_MODE]
 576         if not stat.S_ISDIR(mode):
 577             sys.stderr.write( \
 578                 "Maildir Root %s is not a directory\n" \
 579                 %(maildir_root))
 580             sys.exit(1)
 581     except:
 582         try:
 583             os.mkdir(maildir_root)
 584         except:
 585             sys.stderr.write("Couldn't create Maildir Root %s\n" \
 586                 %(maildir_root))
 587             sys.exit(1)
 588
 589     feeds = scp.sections()
 590     try:
 591         feeds.remove("general")
 592     except:
 593         pass
 594
 595     for section in feeds:
 596         # check if the directory exists
 597         maildir = None
 598         try:
 599             maildir = scp.get(section, "maildir")
 600         except:
 601             maildir = section
 602
 603         maildir = urllib.urlencode(((section, maildir),)).split("=")[1]
 604         maildir = os.path.join(maildir_root, maildir)
 605
 606         try:
 607             exists = os.stat(maildir)
 608             if stat.S_ISDIR(exists[stat.ST_MODE]):
 609                 # check if there's a new, cur and tmp directory
 610                 try:
 611                     mode = os.stat(os.path.join(maildir, "cur"))[stat.ST_MODE]
 612                 except:
 613                     os.mkdir(os.path.join(maildir, "cur"))
 614                     if not stat.S_ISDIR(mode):
 615                         sys.stderr.write("Broken maildir: %s\n" %(maildir))
 616                 try:
 617                     mode = os.stat(os.path.join(maildir, "tmp"))[stat.ST_MODE]
 618                 except:
 619                     os.mkdir(os.path.join(maildir, "tmp"))
 620                     if not stat.S_ISDIR(mode):
 621                         sys.stderr.write("Broken maildir: %s\n" %(maildir))
 622                 try:
 623                     mode = os.stat(os.path.join(maildir, "new"))[stat.ST_MODE]
 624                     if not stat.S_ISDIR(mode):
 625                         sys.stderr.write("Broken maildir: %s\n" %(maildir))
 626                 except:
 627                     os.mkdir(os.path.join(maildir, "new"))
 628             else:
 629                 sys.stderr.write("Broken maildir: %s\n" %(maildir))
 630         except:
 631             try:
 632                 os.mkdir(maildir)
 633             except:
 634                 sys.stderr.write("Couldn't create root maildir %s\n" \
 635                     %(maildir))
 636                 sys.exit(1)
 637             try:
 638                 os.mkdir(os.path.join(maildir, "new"))
 639                 os.mkdir(os.path.join(maildir, "cur"))
 640                 os.mkdir(os.path.join(maildir, "tmp"))
 641             except:
 642                 sys.stderr.write( \
 643                     "Couldn't create required maildir directories for %s\n" \
 644                     %(section,))
 645                 sys.exit(1)
 646
 647         # right - we've got the directories, we've got the section, we know the
 648         # url... lets play!
 649
 650         parse_and_deliver(maildir, section, state_dir)