# rss2maildir.py - RSS feeds to Maildir, one email per item
 
# Copyright (C) 2007  Brett Parker <iDunno@sommitrealweird.co.uk>

# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
 
from email.MIMEMultipart import MIMEMultipart
from email.MIMEText import MIMEText

# other modules used throughout the file; feedparser is the only
# non-standard-library dependency referenced below
import sys, os, stat, urllib, httplib, socket, string, random
import datetime, textwrap, cgi, dbm, md5
import feedparser

from optparse import OptionParser
from ConfigParser import SafeConfigParser

from base64 import b64encode

from HTMLParser import HTMLParser
 
class HTML2Text(HTMLParser):

    def __init__(self, textwidth=70):
        self.textwidth = textwidth
        self.ignorenodata = False
        HTMLParser.__init__(self)
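        # Note: the handlers below rely on per-parse state that __init__ is
        # expected to set up; the attributes referenced later are self.text,
        # self.curdata, self.opentags, self.indentlevel, self.listcount,
        # self.listlevel, self.urls and self.images.  A minimal sketch of
        # that initialisation (assumed - the exact lines are not shown here):
        #
        #     self.text = u''
        #     self.curdata = u''
        #     self.opentags = []
        #     self.indentlevel = 0
        #     self.listcount = []
        #     self.urls = []
        #     self.images = {}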
 
    def handle_starttag(self, tag, attrs):
        tag_name = tag.lower()
        if tag_name in self.blockleveltags:
            # handle starting a new block - unless we're in a block element
            # that can contain other blocks, we'll assume that we want to close
            # the previous one first
            if len(self.opentags) > 1 and self.opentags[-1] == u'li':
                self.handle_curdata()
            if tag_name == u'ol':
                self.handle_curdata()
                self.listcount.append(1)
                self.listlevel = len(self.listcount) - 1
            if tag_name in self.liststarttags:
                smallist = self.opentags[-3:-1]
                for prev_listtag in smallist:
                    if prev_listtag in [u'dl', u'ol']:
                        self.indentlevel = self.indentlevel + 4
                    elif prev_listtag == u'ul':
                        self.indentlevel = self.indentlevel + 3
            if len(self.opentags) > 0:
                self.handle_curdata()
                if tag_name not in self.cancontainflow:
            self.opentags.append(tag_name)
        else:
            if tag_name == "span":
                listcount = self.listcount[-1]
            if tag_name == u'dd' and len(self.opentags) > 1 \
                and self.opentags[-1] == u'dt':
                self.handle_curdata()
            elif tag_name == u'dt' and len(self.opentags) > 1 \
                and self.opentags[-1] == u'dd':
                self.handle_curdata()
            elif tag_name == u'a':
                for attr in attrs:
                    if attr[0].lower() == u'href':
                        self.urls.append(attr[1].decode('utf-8'))
                self.curdata = self.curdata + u'`'
                self.opentags.append(tag_name)
            elif tag_name == u'img':
                self.handle_image(attrs)
            elif tag_name == u'br':
                # we don't know the tag, so let's avoid handling it!
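    # In short: block-level tags flush any pending text via handle_curdata()
    # and are pushed onto the self.opentags stack (lists also bump
    # self.indentlevel), while inline anchors record their href in self.urls
    # and open a reStructuredText-style link.  For example (illustrative):
    #
    #     <a href="http://example.org/">more</a>
    #
    # becomes `more`__ in the rendered text, with "__ http://example.org/"
    # appended as an anonymous link target when gettext() is called.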
 
    def handle_startendtag(self, tag, attrs):
        if tag.lower() == u'br':
        elif tag.lower() == u'img':
            self.handle_image(attrs)
            self.handle_curdata()
            self.opentags.append(u'br')
            self.handle_curdata()
 
    def handle_image(self, attrs):
                alt = attr[1].decode('utf-8')
            elif attr[0] == 'src':
                url = attr[1].decode('utf-8')
                if self.images.has_key(alt):
                    if self.images[alt]["url"] == url:
                        self.curdata = self.curdata \
                        while self.images.has_key(alt):
                        self.images[alt]["url"] = url
                        self.curdata = self.curdata \
                    self.images[alt] = {}
                    self.images[alt]["url"] = url
                    self.curdata = self.curdata \
                if self.images.has_key(url):
                    self.curdata = self.curdata \
                    self.images[url] = {}
                    self.images[url]["url"] = url
                    self.curdata = self.curdata \
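    # handle_image() records each image in self.images, keyed by its alt text
    # (or by the URL when there is no alt), and appears to disambiguate a
    # reused alt so each substitution name maps to a single URL; the image is
    # referenced inline as |name| and gettext() later emits the matching
    # definition, e.g. (illustrative):
    #
    #     .. |logo| image:: http://example.org/logo.png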
 
    def handle_curdata(self):
        if len(self.opentags) == 0:
        tag_thats_done = self.opentags[-1]
        if len(self.curdata) == 0:
        if tag_thats_done == u'br':
            if len(self.text) == 0 or self.text[-1] != '\n':
                self.text = self.text + '\n'
                self.ignorenodata = True
        if len(self.curdata.strip()) == 0:
        if tag_thats_done in self.blockleveltags:
            newlinerequired = self.text != u''
            if self.ignorenodata:
                newlinerequired = False
            self.ignorenodata = False
                if tag_thats_done in [u'dt', u'dd', u'li'] \
                    and len(self.text) > 1 \
                    and self.text[-1] != u'\n':
                        self.text = self.text + u'\n'
                elif len(self.text) > 2 \
                    and self.text[-1] != u'\n' \
                    and self.text[-2] != u'\n':
                    self.text = self.text + u'\n\n'
        if tag_thats_done in ["h1", "h2", "h3", "h4", "h5", "h6"]:
            headingtext = " ".join(self.curdata.split())
            seperator = u'\n' + u' '*self.indentlevel
            headingtext = seperator.join( \
                    self.textwidth - self.indentlevel \
            if tag_thats_done == u'h2':
            elif tag_thats_done != u'h1':
            if u'\n' in headingtext:
                underline = u' ' * self.indentlevel \
                    + underlinechar * (self.textwidth - self.indentlevel)
                underline = u' ' * self.indentlevel \
                    + underlinechar * len(headingtext)
            self.text = self.text \
                + headingtext + u'\n' \
        elif tag_thats_done in [u'p', u'div']:
            paragraph = unicode( \
                " ".join(self.curdata.strip().encode("utf-8").split()), \
            seperator = u'\n' + u' ' * self.indentlevel
            self.text = self.text \
                + u' ' * self.indentlevel \
                        paragraph, self.textwidth - self.indentlevel))
        elif tag_thats_done == "pre":
            self.text = self.text + unicode( \
                self.curdata.encode("utf-8"), "utf-8")
        elif tag_thats_done == u'blockquote':
                " ".join(self.curdata.encode("utf-8").strip().split()), \
            seperator = u'\n' + u' ' * self.indentlevel + u'> '
            if len(self.text) > 0 and self.text[-1] != u'\n':
                self.text = self.text + u'\n'
            self.text = self.text \
                        self.textwidth - self.indentlevel - 2 \
        elif tag_thats_done == "li":
            item = unicode(self.curdata.encode("utf-8").strip(), "utf-8")
            if len(self.text) > 0 and self.text[-1] != u'\n':
                self.text = self.text + u'\n'
            # work out if we're in an ol rather than a ul
            latesttags = self.opentags[-4:]
            for thing in latesttags:
                listmarker = u' %2d. ' %(self.listcount[-1])
                self.listcount[-1] = self.listcount[-1] + 1
                + u' ' * self.indentlevel \
            self.text = self.text \
                + u' ' * self.indentlevel \
                        self.textwidth - self.indentlevel - listindent \
        elif tag_thats_done == u'dt':
            definition = unicode(" ".join( \
                    self.curdata.encode("utf-8").strip().split()), \
            if len(self.text) > 0 and self.text[-1] != u'\n':
                self.text = self.text + u'\n\n'
            elif len(self.text) > 1 and self.text[-2] != u'\n':
                self.text = self.text + u'\n'
            definition = u' ' * self.indentlevel + definition + "::"
            indentstring = u'\n' + u' ' * (self.indentlevel + 1)
            self.text = self.text \
                    textwrap.wrap(definition, \
                        self.textwidth - self.indentlevel - 1))
        elif tag_thats_done == u'dd':
            definition = unicode(" ".join( \
                    self.curdata.encode("utf-8").strip().split()),
            if len(definition) > 0:
                if len(self.text) > 0 and self.text[-1] != u'\n':
                    self.text = self.text + u'\n'
                indentstring = u'\n' + u' ' * (self.indentlevel + 4)
                self.text = self.text \
                    + u' ' * (self.indentlevel + 4) \
                    + indentstring.join( \
                            self.textwidth - self.indentlevel - 4 \
        elif tag_thats_done == u'a':
            self.curdata = self.curdata + u'`__'
        elif tag_thats_done in self.liststarttags:
        if tag_thats_done in self.blockleveltags:
        self.ignorenodata = False
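    # handle_curdata() is where pending character data gets rendered: it is
    # whitespace-collapsed, wrapped with textwrap to self.textwidth minus the
    # current indent, and appended to self.text with per-tag decoration -
    # headings get an underline, blockquotes a "> " prefix, list items a
    # bullet or " n. " marker, and definition lists extra indentation.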
 
    def handle_endtag(self, tag):
        self.ignorenodata = False
            tagindex = self.opentags.index(tag)
        if tag in [u'br', u'img']:
        if tag in self.liststarttags:
            if tag in [u'ol', u'dl', u'ul']:
                self.handle_curdata()
                # find if there was a previous list level
                smalllist = self.opentags[:-1]
                for prev_listtag in smalllist:
                    if prev_listtag in [u'ol', u'dl']:
                        self.indentlevel = self.indentlevel - 4
                    elif prev_listtag == u'ul':
                        self.indentlevel = self.indentlevel - 3
            self.listcount = self.listcount[:-1]
        while tagindex < len(self.opentags) \
            and tag in self.opentags[tagindex+1:]:
                tagindex = self.opentags.index(tag, tagindex+1)
                # well, we don't want to do that then
        if tagindex != len(self.opentags) - 1:
            # assume the pending data belongs to the most recently opened tag
            self.handle_curdata()
            # Now kill the list to be a slice before this tag was opened
            self.opentags = self.opentags[:tagindex + 1]
            self.handle_curdata()
            if self.opentags[-1] == tag:
 
    def handle_data(self, data):
        if len(self.opentags) == 0:
            self.opentags.append(u'p')
        self.curdata = self.curdata + data.decode("utf-8")
 
    def handle_entityref(self, name):
        if HTML2Text.entities.has_key(name):
            entity = HTML2Text.entities[name]
            entity = unichr(int(name[1:]))
            entity = "&" + name + ";"
        self.curdata = self.curdata + unicode(entity.encode('utf-8'), \
 
    def gettext(self):
        self.handle_curdata()
        if len(self.text) == 0 or self.text[-1] != u'\n':
            self.text = self.text + u'\n'
        if len(self.text) > 0:
            while len(self.text) > 1 and self.text[-1] == u'\n':
                self.text = self.text[:-1]
            self.text = self.text + u'\n'
        if len(self.urls) > 0:
            self.text = self.text + u'\n__ ' + u'\n__ '.join(self.urls) + u'\n'
        if len(self.images.keys()) > 0:
            self.text = self.text + u'\n.. ' \
                + u'\n.. '.join( \
                    ["|%s| image:: %s" %(a, self.images[a]["url"]) \
                for a in self.images.keys()]) + u'\n'
        return self.text
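# Typical use of HTML2Text, as done further down in parse_and_deliver()
# (illustrative fragment only):
#
#     parser = HTML2Text(textwidth=70)
#     parser.feed(somehtml.encode("utf-8"))
#     plaintext = parser.gettext()
#
# where the result is plain text with reStructuredText-flavoured links,
# image substitutions and headings.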
 
def open_url(method, url):
    redirectcount = 0
    while redirectcount < 3:
        (type, rest) = urllib.splittype(url)
        (host, path) = urllib.splithost(rest)
        (host, port) = urllib.splitport(host)
            conn = httplib.HTTPConnection("%s:%s" %(host, port))
            conn.request(method, path)
            response = conn.getresponse()
            if response.status in [301, 302, 303, 307]:
                headers = response.getheaders()
                for header in headers:
                    if header[0] == "location":
                        url = header[1]
            elif response.status == 200:
                return response
        redirectcount = redirectcount + 1
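# open_url() follows up to three HTTP redirects by hand; the calling code
# below appears to treat its result as an httplib response object on success
# and as a false value when the fetch ultimately failed (the error branches
# writing "Failed to fetch feed" rely on that).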
 
def parse_and_deliver(maildir, url, statedir):
    # first check if we know about this feed already
    feeddb = dbm.open(os.path.join(statedir, "feeds"), "c")
    if feeddb.has_key(url):
        data = feeddb[url]
        data = cgi.parse_qs(data)
        response = open_url("HEAD", url)
            headers = response.getheaders()
            for header in headers:
                if header[0] == "content-length":
                    if header[1] != data["content-length"][0]:
                elif header[0] == "etag":
                    if header[1] != data["etag"][0]:
                elif header[0] == "last-modified":
                    if header[1] != data["last-modified"][0]:
                elif header[0] == "content-md5":
                    if header[1] != data["content-md5"][0]:
            response = open_url("GET", url)
                headers = response.getheaders()
                feedhandle = response
                sys.stderr.write("Failed to fetch feed: %s\n" %(url))
            return # don't need to do anything, nothing's changed.
    else:
        response = open_url("GET", url)
            headers = response.getheaders()
            feedhandle = response
            sys.stderr.write("Failed to fetch feed: %s\n" %(url))
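    # The block above is a hand-rolled cache check: for a feed we have seen
    # before, the validators stored in the "feeds" dbm (content-length, etag,
    # last-modified and content-md5, urlencoded into one record per feed URL)
    # are compared against a HEAD response, and the feed is only re-fetched
    # with GET when one of them differs.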
 
    fp = feedparser.parse(feedhandle)
    db = dbm.open(os.path.join(statedir, "seen"), "c")
    for item in fp["items"]:
        # have we seen it before?
        # need to work out what the content is first...
        if item.has_key("content"):
            content = item["content"][0]["value"]
        else:
            content = item["summary"]
        md5sum = md5.md5(content.encode("utf-8")).hexdigest()
        # check if there's a guid too - if that exists and we match the md5,
        # it's the same item
        if item.has_key("guid"):
            if db.has_key(url + "|" + item["guid"]):
                data = db[url + "|" + item["guid"]]
                data = cgi.parse_qs(data)
                if data["contentmd5"][0] == md5sum:
        if db.has_key(url + "|" + item["link"]):
            data = db[url + "|" + item["link"]]
            data = cgi.parse_qs(data)
            if data.has_key("message-id"):
                prevmessageid = data["message-id"][0]
            if data["contentmd5"][0] == md5sum:
            author = item["author"]
 
        # create a basic email message
        msg = MIMEMultipart("alternative")
        messageid = "<" \
            + datetime.datetime.now().strftime("%Y%m%d%H%M") \
            + "." + "".join( \
                [random.choice( \
                    string.ascii_letters + string.digits \
                    ) for a in range(0,6) \
                ]) + "@" + socket.gethostname() + ">"
        msg.add_header("Message-ID", messageid)
        msg.set_unixfrom("\"%s\" <rss2maildir@localhost>" %(url))
        msg.add_header("From", "\"%s\" <rss2maildir@localhost>" %(author))
        msg.add_header("To", "\"%s\" <rss2maildir@localhost>" %(url))
        if prevmessageid:
            msg.add_header("References", prevmessageid)
        createddate = datetime.datetime.now() \
            .strftime("%a, %e %b %Y %T -0000")
            createddate = datetime.datetime(*item["updated_parsed"][0:6]) \
                .strftime("%a, %e %b %Y %T -0000")
        msg.add_header("Date", createddate)
        subj_gen = HTML2Text()
        subj_gen.feed(item["title"].encode("utf-8"))
        msg.add_header("Subject", subj_gen.gettext())
        msg.set_default_type("text/plain")
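        # The generated Message-ID is of the form
        # <YYYYMMDDhhmm.xxxxxx@hostname>, e.g. <200701011200.abc123@myhost>
        # (illustrative value); the random middle part is assumed above to
        # come from random.choice over ascii letters and digits, which is
        # what the surviving fragments of the expression suggest.  When an
        # item was seen before with different content, its previous
        # Message-ID is carried in References so mail clients thread the
        # update with the original message.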
 
        htmlcontent = content.encode("utf-8")
        htmlcontent = "%s\n\n<p>Item URL: <a href='%s'>%s</a></p>" %( \
            htmlcontent, \
            item["link"], \
            item["link"])
        htmlpart = MIMEText(htmlcontent.encode("utf-8"), "html", "utf-8")
        textparser = HTML2Text()
        textparser.feed(content.encode("utf-8"))
        textcontent = textparser.gettext()
        textcontent = "%s\n\nItem URL: %s" %( \
            textcontent, \
            item["link"])
        textpart = MIMEText(textcontent.encode("utf-8"), "plain", "utf-8")
        msg.attach(textpart)
        msg.attach(htmlpart)
 
        # start by working out the filename we should be writing to, we do
        # this following the normal maildir style rules
        fname = str(os.getpid()) \
            + "." + socket.gethostname() \
            + "." + "".join( \
                [random.choice( \
                    string.ascii_letters + string.digits \
                    ) for a in range(0,10) \
                ]) \
            + "." \
            + datetime.datetime.now().strftime('%s')
        fn = os.path.join(maildir, "tmp", fname)
        fh = open(fn, "w")
        fh.write(msg.as_string())
        fh.close()
        # now move it in to the new directory
        newfn = os.path.join(maildir, "new", fname)
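        # Maildir delivery: the unique filename combines the pid, hostname,
        # a random string and the epoch timestamp, e.g. (illustrative)
        # "12345.myhost.aB3dE9fG1h.1177970000".  The message is written out
        # under tmp/ first and only then moved over to new/, the standard
        # Maildir convention for making delivery atomic; the actual
        # link/rename into new/ happens just after this point (not shown).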
 
        # now record this item in the seen database
            messageid = prevmessageid + " " + messageid
        if item.has_key("guid") and item["guid"] != item["link"]:
                data = urllib.urlencode(( \
                ("message-id", messageid), \
                ("created", createddate), \
                ("contentmd5", md5sum) \
            db[url + "|" + item["guid"]] = data
                data = db[url + "|" + item["link"]]
                data = cgi.parse_qs(data)
                newdata = urllib.urlencode(( \
                    ("message-id", messageid), \
                    ("created", data["created"][0]), \
                    ("contentmd5", data["contentmd5"][0]) \
                db[url + "|" + item["link"]] = newdata
                db[url + "|" + item["link"]] = data
            data = urllib.urlencode(( \
                ("message-id", messageid), \
                ("created", createddate), \
                ("contentmd5", md5sum) \
            db[url + "|" + item["link"]] = data
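        # Each delivered item is remembered in the "seen" dbm under the key
        # "<feed url>|<guid>" and/or "<feed url>|<link>"; the value is a
        # urlencoded record, roughly (illustrative):
        #
        #     message-id=%3C200701011200.abc123%40myhost%3E&created=...&contentmd5=...
        #
        # which is what the has_key()/parse_qs() checks at the top of the
        # loop read back to skip unchanged items.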
 
        for header in headers:
            if header[0] in \
                ["content-md5", "etag", "last-modified", "content-length"]:
                data.append((header[0], header[1]))
            data = urllib.urlencode(data)
 
if __name__ == "__main__":
    # This only gets executed if we run the script directly
    # first off, parse the command line arguments

    oparser = OptionParser()
    oparser.add_option(
        "-c", "--conf", dest="conf",
        help="location of config file"
        )
    oparser.add_option(
        "-s", "--statedir", dest="statedir",
        help="location of directory to store state in"
        )
    (options, args) = oparser.parse_args()
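    # Typical invocation (illustrative):
    #     rss2maildir.py --conf ~/.rss2maildir.conf --statedir ~/.rss2maildir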
 
    # check for the configfile
    if options.conf != None:
        # does the file exist?
            os.stat(options.conf)
            configfile = options.conf
            # should exit here as the specified file doesn't exist
                "Config file %s does not exist. Exiting.\n" %(options.conf,))
        # check through the default locations
            os.stat("%s/.rss2maildir.conf" %(os.environ["HOME"],))
            configfile = "%s/.rss2maildir.conf" %(os.environ["HOME"],)
                os.stat("/etc/rss2maildir.conf")
                configfile = "/etc/rss2maildir.conf"
                sys.stderr.write("No config file found. Exiting.\n")

    # Right - if we've got this far, we've got a config file, now for the
    # hard part.

    scp = SafeConfigParser()
    scp.read(configfile)

    maildir_root = "RSSMaildir"
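    # The configuration is INI-style: a [general] section may set state_dir
    # and maildir_root, and every other section name is treated as a feed URL
    # (optionally with a maildir option naming the folder to deliver into).
    # An illustrative example:
    #
    #     [general]
    #     state_dir = /home/user/.rss2maildir/state
    #     maildir_root = /home/user/RSSMaildir
    #
    #     [http://example.org/feed.rss]
    #     maildir = example-feed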
 
    if options.statedir != None:
        state_dir = options.statedir
            mode = os.stat(state_dir)[stat.ST_MODE]
            if not stat.S_ISDIR(mode):
                    "State directory (%s) is not a directory\n" %(state_dir))
            # try to make the directory
                sys.stderr.write("Couldn't create statedir %s" %(state_dir))
    elif scp.has_option("general", "state_dir"):
        new_state_dir = scp.get("general", "state_dir")
            mode = os.stat(new_state_dir)[stat.ST_MODE]
            if not stat.S_ISDIR(mode):
                    "State directory (%s) is not a directory\n" %(new_state_dir))
                state_dir = new_state_dir
                os.mkdir(new_state_dir)
                state_dir = new_state_dir
                    "Couldn't create state directory %s\n" %(new_state_dir))
            mode = os.stat(state_dir)[stat.ST_MODE]
            if not stat.S_ISDIR(mode):
                    "State directory %s is not a directory\n" %(state_dir))
                    "State directory %s could not be created\n" %(state_dir))
 
    if scp.has_option("general", "maildir_root"):
        maildir_root = scp.get("general", "maildir_root")
        mode = os.stat(maildir_root)[stat.ST_MODE]
        if not stat.S_ISDIR(mode):
                "Maildir Root %s is not a directory\n" \
            os.mkdir(maildir_root)
            sys.stderr.write("Couldn't create Maildir Root %s\n" \

    feeds = scp.sections()
        feeds.remove("general")
 
    for section in feeds:
        # check if the directory exists
            maildir = scp.get(section, "maildir")
        maildir = urllib.urlencode(((section, maildir),)).split("=")[1]
        maildir = os.path.join(maildir_root, maildir)
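        # The maildir name is passed through urllib.urlencode so the feed URL
        # (or configured name) becomes a single filesystem-safe path
        # component, e.g. "http://example.org/feed.rss" turns into
        # "http%3A%2F%2Fexample.org%2Ffeed.rss" (illustrative URL).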
 
            exists = os.stat(maildir)
            if stat.S_ISDIR(exists[stat.ST_MODE]):
                # check if there's a new, cur and tmp directory
                    mode = os.stat(os.path.join(maildir, "cur"))[stat.ST_MODE]
                    os.mkdir(os.path.join(maildir, "cur"))
                    if not stat.S_ISDIR(mode):
                        sys.stderr.write("Broken maildir: %s\n" %(maildir))
                    mode = os.stat(os.path.join(maildir, "tmp"))[stat.ST_MODE]
                    os.mkdir(os.path.join(maildir, "tmp"))
                    if not stat.S_ISDIR(mode):
                        sys.stderr.write("Broken maildir: %s\n" %(maildir))
                    mode = os.stat(os.path.join(maildir, "new"))[stat.ST_MODE]
                    if not stat.S_ISDIR(mode):
                        sys.stderr.write("Broken maildir: %s\n" %(maildir))
                    os.mkdir(os.path.join(maildir, "new"))
                sys.stderr.write("Broken maildir: %s\n" %(maildir))
                sys.stderr.write("Couldn't create root maildir %s\n" \
                os.mkdir(os.path.join(maildir, "new"))
                os.mkdir(os.path.join(maildir, "cur"))
                os.mkdir(os.path.join(maildir, "tmp"))
                    "Couldn't create required maildir directories for %s\n" \

        # right - we've got the directories, we've got the section, we know
        # the feed url - so go and fetch it and deliver any new items
        parse_and_deliver(maildir, section, state_dir)