4 # rss2maildir.py - RSS feeds to Maildir 1 email per item
 
   5 # Copyright (C) 2007  Brett Parker <iDunno@sommitrealweird.co.uk>
 
   7 # This program is free software: you can redistribute it and/or modify
 
   8 # it under the terms of the GNU General Public License as published by
 
   9 # the Free Software Foundation, either version 3 of the License, or
 
  10 # (at your option) any later version.
 
  12 # This program is distributed in the hope that it will be useful,
 
  13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
 
  14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 
  15 # GNU General Public License for more details.
 
  17 # You should have received a copy of the GNU General Public License
 
  18 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
 
  28 from email.MIMEMultipart import MIMEMultipart
 
  29 from email.MIMEText import MIMEText
 
  38 from optparse import OptionParser
 
  39 from ConfigParser import SafeConfigParser
 
  41 from base64 import b64encode
 
  47 from HTMLParser import HTMLParser
 
  49 class HTML2Text(HTMLParser):
         # HTMLParser subclass whose tag/data callbacks build up a plain-text
         # rendering of the markup in self.text, re-wrapping buffered text with
         # textwrap.
 
             # Flags recording which kind of element the parser is inside.
  62         self.inheadingone = False
 
  63         self.inheadingtwo = False
 
  64         self.inotherheading = False
 
             # Starts out True; handle_data buffers text into currentparagraph
             # while this is set (heading/blockquote flags are checked first).
  65         self.inparagraph = True
 
  66         self.inblockquote = False
 
             # Text buffered for the open paragraph and the open heading.
  69         self.currentparagraph = u''
 
  70         self.headingtext = u''
 
             # Let the base class set up its own parser state.
  76         HTMLParser.__init__(self)
 
  78     def handle_starttag(self, tag, attrs):
             # Callback for an opening tag. Updates the in-element flags and,
             # for block-level tags, appends separators or pending buffered
             # text to self.text. Tag names are compared case-insensitively.
  79         if tag.lower() == "h1":
 
  80             self.inheadingone = True
 
  81             self.inparagraph = False
 
  82         elif tag.lower() == "h2":
 
  83             self.inheadingtwo = True
 
  84             self.inparagraph = False
 
  85         elif tag.lower() in ["h3", "h4", "h5", "h6"]:
 
  86             self.inotherheading = True
 
  87             self.inparagraph = False
 
  88         elif tag.lower() == "a":
 
  90         elif tag.lower() == "br":
 
  92         elif tag.lower() == "blockquote":
 
  93             self.inblockquote = True
 
  94             self.text = self.text + u'\n'
 
  95         elif tag.lower() == "p":
                 # Paragraph text still buffered is wrapped to 70 columns and
                 # appended; the buffer is then reset for the new paragraph.
 
  97                 self.text = self.text + u'\n\n'
 
  99                 self.text = self.text \
 
 100                     + u'\n'.join(textwrap.wrap(self.currentparagraph, 70))
 
 101             self.currentparagraph = u''
 
 102             self.inparagraph = True
 
 103         elif tag.lower() == "pre":
                 # Entering <pre> clears both the paragraph and blockquote flags.
 
 104             self.text = self.text + "\n"
 
 106             self.inparagraph = False
 
 107             self.inblockquote = False
 
 108         elif tag.lower() == "ul":
 
 111             self.text = self.text + "\n"
 
 112         elif tag.lower() == "li":
                 # Buffered item text is wrapped to 67 columns; wrapped lines are
                 # stripped and joined with a newline plus three spaces.
 
 117                 self.text = self.text \
 
 119                     + u'\n   '.join([a.strip() for a in \
 
 120                         textwrap.wrap(self.item, 67)]) \
 
 125     def handle_startendtag(self, tag, attrs):
             # Callback for a self-closing tag such as <br/>. A line break
             # flushes the pending paragraph or blockquote buffer into self.text
             # and empties that buffer.
 126         if tag.lower() == "br":
 
 131                 self.text = self.text \
 
 134                         for a in textwrap.wrap( \
 
 135                             self.currentparagraph, 70) \
 
 139                 self.currentparagraph = u''
 
 140             elif self.inblockquote:
                     # NOTE(review): the blockquote is wrapped here after being
                     # encoded to UTF-8, whereas handle_endtag wraps the unicode
                     # buffer directly - confirm the difference is intended.
 
 141                 self.text = self.text \
 
 145                             for a in textwrap.wrap( \
 
 146                                 self.blockquote.encode("utf-8") \
 
 151                 self.blockquote = u''
 
 153                 self.text = self.text + "\n"
 
 155     def handle_endtag(self, tag):
             # Callback for a closing tag. Emits the text buffered for the
             # element being closed into self.text, then clears the matching
             # flag and buffer. Tag names are compared case-insensitively.
 156         if tag.lower() == "h1":
                 # The heading text is followed by a rule of '=' characters
                 # ('-' for h2, '~' for h3-h6 below).
                 # NOTE(review): the rule length is len() of the UTF-8 encoded
                 # heading, i.e. a byte count, so it can be longer than the
                 # visible heading when it contains non-ASCII text - confirm.
 
 157             self.inheadingone = False
 
 158             self.text = self.text \
 
 160                 + self.headingtext.encode("utf-8") \
 
 162                 + u'=' * len(self.headingtext.encode("utf-8").strip())
 
 163             self.headingtext = u''
 
 164         elif tag.lower() == "h2":
 
 165             self.inheadingtwo = False
 
 166             self.text = self.text \
 
 168                 + self.headingtext.encode("utf-8") \
 
 170                 + u'-' * len(self.headingtext.encode("utf-8").strip())
 
 171             self.headingtext = u''
 
 172         elif tag.lower() in ["h3", "h4", "h5", "h6"]:
 
 173             self.inotherheading = False
 
 174             self.text = self.text \
 
 176                 + self.headingtext.encode("utf-8") \
 
 178                 + u'~' * len(self.headingtext.encode("utf-8").strip())
 
 179             self.headingtext = u''
 
 180         elif tag.lower() == "p":
                 # The buffered paragraph is wrapped to 70 columns.
 
 181             self.text = self.text \
 
 182                 + u'\n'.join(textwrap.wrap( \
 
 183                     self.currentparagraph, 70) \
 
 185             self.inparagraph = False
 
 186             self.currentparagraph = u''
 
 187         elif tag.lower() == "blockquote":
                 # The buffered blockquote is wrapped to 68 columns.
 
 188             self.text = self.text \
 
 192                         for a in textwrap.wrap( \
 
 193                             self.blockquote, 68)] \
 
 196             self.inblockquote = False
 
 197             self.blockquote = u''
 
 198         elif tag.lower() == "pre":
 
 200         elif tag.lower() == "li":
                 # The buffered list item is wrapped to 67 columns and each
                 # wrapped line is stripped.
 
 203                 self.text = self.text \
 
 206                         [a.strip() for a in textwrap.wrap(self.item, 67)]) \
 
 209         elif tag.lower() == "ul":
 
 212     def handle_data(self, data):
             # Callback for character data. The text is decoded from UTF-8 and
             # appended to the buffer matching the current parser state; heading
             # state is checked first, then blockquote, then paragraph.
             # NOTE(review): unicode(data, "utf-8") needs a byte string, so this
             # presumably relies on the parser being fed UTF-8 encoded bytes
             # rather than unicode objects - confirm against the callers.
 213         if self.inheadingone or self.inheadingtwo or self.inotherheading:
 
 214             self.headingtext = self.headingtext \
 
 215                 + unicode(data, "utf-8").strip() \
 
 217         elif self.inblockquote:
 
 218             self.blockquote = self.blockquote \
 
 219                 + unicode(data, "utf-8").strip() \
 
 222             self.item = self.item + unicode(data, "utf-8")
 
 223         elif self.inparagraph:
 
 224             self.currentparagraph = self.currentparagraph \
 
 225                 + unicode(data, "utf-8").strip() \
 
 228             self.text = self.text + unicode(data, "utf-8")
 
 230             self.text = self.text + unicode(data, "utf-8").strip() + u' '
 
 232     def handle_entityref(self, name):
             # Callback for an entity reference (&name;). The lower-cased name is
             # looked up in the class-level HTML2Text.entities table; the
             # resulting text is appended to whichever buffer is active
             # (paragraph, blockquote, or self.text).
 234         if HTML2Text.entities.has_key(name.lower()):
 
 235             entity = HTML2Text.entities[name.lower()]
 
                 # NOTE(review): unichr() returns a unicode object, but the
                 # appends below call unicode(entity, "utf-8"), which in Python 2
                 # does not accept unicode input - confirm this path works.
 237             entity = unichr(int(name[1:]))
 
                 # Otherwise the reference is kept as literal "&name;" text.
 239             entity = "&" + name + ";"
 
 242             self.currentparagraph = self.currentparagraph \
 
 243                 + unicode(entity, "utf-8")
 
 244         elif self.inblockquote:
 
 245             self.blockquote = self.blockquote + unicode(entity, "utf-8")
 
 247             self.text = self.text + unicode(entity, "utf-8")
 
 252             data = data + "\n".join(textwrap.wrap(self.currentparagraph, 70))
 
 257 def open_url(method, url):
         # Issue an HTTP request for url with the given method (callers in this
         # file pass "HEAD" and "GET"). The loop runs while redirectcount < 3 and
         # the counter is incremented once per pass, which bounds the number of
         # attempts.
         # NOTE(review): only httplib.HTTPConnection appears here - confirm how
         # https URLs are expected to be handled.
 
 259     while redirectcount < 3:
 
             # Split the URL into scheme, host, port and path.
             # NOTE: the local name "type" shadows the builtin in this function.
 260         (type, rest) = urllib.splittype(url)
 
 261         (host, path) = urllib.splithost(rest)
 
 262         (host, port) = urllib.splitport(host)
 
 266             conn = httplib.HTTPConnection("%s:%s" %(host, port))
 
 267             conn.request(method, path)
 
 268             response = conn.getresponse()
 
                 # Redirect statuses: scan the response headers for "location"
                 # (header names are compared in lower case).
 269             if response.status in [301, 302, 303, 307]:
 
 270                 headers = response.getheaders()
 
 271                 for header in headers:
 
 272                     if header[0] == "location":
 
 274             elif response.status == 200:
 
 278         redirectcount = redirectcount + 1
 
 281 def parse_and_deliver(maildir, url, statedir):
         # Fetch the feed at url and turn its items into multipart/alternative
         # (HTML + plain text) messages written under maildir.
         #
         # maildir  - Maildir directory; messages are written to tmp/ and the
         #            final name is built under new/.
         # url      - feed URL; also used as the key prefix in the state files.
         # statedir - directory holding the dbm state files "feeds" (per-feed
         #            HTTP header values) and "seen" (per-item records).
         #
         # Items are fingerprinted with the MD5 of their content, which is
         # compared with the "contentmd5" stored for the same guid/link.
 
 284     # first check if we know about this feed already
 
 285     feeddb = dbm.open(os.path.join(statedir, "feeds"), "c")
 
 286     if feeddb.has_key(url):
 
 288         data = cgi.parse_qs(data)
 
             # A HEAD request lets the stored content-length / etag /
             # last-modified / content-md5 values be compared with the server's
             # current ones without downloading the feed body.
 289         response = open_url("HEAD", url)
 
 292             headers = response.getheaders()
 
 295             for header in headers:
 
 296                 if header[0] == "content-length":
 
 297                     if header[1] != data["content-length"][0]:
 
 299                 elif header[0] == "etag":
 
 300                     if header[1] != data["etag"][0]:
 
 302                 elif header[0] == "last-modified":
 
 303                     if header[1] != data["last-modified"][0]:
 
 305                 elif header[0] == "content-md5":
 
 306                     if header[1] != data["content-md5"][0]:
 
 311             response = open_url("GET", url)
 
 313                 headers = response.getheaders()
 
 314                 feedhandle = response
 
 316                 sys.stderr.write("Failed to fetch feed: %s\n" %(url))
 
 319             return # nothing has changed, so there is nothing to do.
 
 321         response = open_url("GET", url)
 
 323             headers = response.getheaders()
 
 324             feedhandle = response
 
 326             sys.stderr.write("Failed to fetch feed: %s\n" %(url))
 
 329     fp = feedparser.parse(feedhandle)
 
 330     db = dbm.open(os.path.join(statedir, "seen"), "c")
 
 331     for item in fp["items"]:
 
 332         # have we seen it before?
 
 333         # need to work out what the content is first...
 
 335         if item.has_key("content"):
 
 336             content = item["content"][0]["value"]
 
 338             content = item["summary"]
 
             # Fingerprint of the item content; stored below as "contentmd5".
 340         md5sum = md5.md5(content.encode("utf-8")).hexdigest()
 
 344         # check if there's a guid too - if that exists and we match the md5,
 
 346         if item.has_key("guid"):
 
 347             if db.has_key(url + "|" + item["guid"]):
 
 348                 data = db[url + "|" + item["guid"]]
 
 349                 data = cgi.parse_qs(data)
 
 350                 if data["contentmd5"][0] == md5sum:
 
             # NOTE(review): item["link"] is read without a has_key() check,
             # unlike "guid" - confirm every feed item is expected to have a link.
 353         if db.has_key(url + "|" + item["link"]):
 
 354             data = db[url + "|" + item["link"]]
 
 355             data = cgi.parse_qs(data)
 
 356             if data.has_key("message-id"):
 
 357                 prevmessageid = data["message-id"][0]
 
 358             if data["contentmd5"][0] == md5sum:
 
 362             author = item["author"]
 
 366         # create a basic email message
 
 367         msg = MIMEMultipart("alternative")
 
 369             + datetime.datetime.now().strftime("%Y%m%d%H%M") \
 
 373                     string.ascii_letters + string.digits \
 
 374                     ) for a in range(0,6) \
 
 375                 ]) + "@" + socket.gethostname() + ">"
 
 376         msg.add_header("Message-ID", messageid)
 
             # The feed URL is used as the display name of the envelope sender
             # and of the To header; the item author is used for From.
 377         msg.set_unixfrom("\"%s\" <rss2maildir@localhost>" %(url))
 
 378         msg.add_header("From", "\"%s\" <rss2maildir@localhost>" %(author))
 
 379         msg.add_header("To", "\"%s\" <rss2maildir@localhost>" %(url))
 
 381             msg.add_header("References", prevmessageid)
 
             # NOTE(review): %e and %T (and the '%s' used for the filename below)
             # are not among the strftime directives Python documents as
             # portable - confirm the target platforms support them.
 382         createddate = datetime.datetime.now() \
 
 383             .strftime("%a, %e %b %Y %T -0000")
 
                 # Date built from the first six fields of the item's parsed
                 # "updated" time.
 385             createddate = datetime.datetime(*item["updated_parsed"][0:6]) \
 
 386                 .strftime("%a, %e %b %Y %T -0000")
 
 389         msg.add_header("Date", createddate)
 
 390         msg.add_header("Subject", item["title"])
 
 391         msg.set_default_type("text/plain")
 
             # HTML alternative: the item content plus an "Item URL" link.
 393         htmlcontent = content.encode("utf-8")
 
 394         htmlcontent = "%s\n\n<p>Item URL: <a href='%s'>%s</a></p>" %( \
 
 398         htmlpart = MIMEText(htmlcontent.encode("utf-8"), "html", "utf-8")
 
             # Plain-text alternative: the same content run through HTML2Text.
 399         textparser = HTML2Text()
 
 400         textparser.feed(content.encode("utf-8"))
 
 401         textcontent = textparser.gettext()
 
 402         textcontent = "%s\n\nItem URL: %s" %( \
 
 405         textpart = MIMEText(textcontent.encode("utf-8"), "plain", "utf-8")
 
 409         # start by working out the filename we should be writing to; we do
 
 410         # this following the normal maildir style rules
 
 411         fname = str(os.getpid()) \
 
 412             + "." + socket.gethostname() \
 
 415                     string.ascii_letters + string.digits \
 
 416                     ) for a in range(0,10) \
 
 418             + datetime.datetime.now().strftime('%s')
 
 419         fn = os.path.join(maildir, "tmp", fname)
 
 421         fh.write(msg.as_string())
 
 423         # now move it into the new directory
 
 424         newfn = os.path.join(maildir, "new", fname)
 
 428         # now record the item in the database
 
                 # Earlier Message-ID(s) are kept, space separated, ahead of the
                 # new one.
 430             messageid = prevmessageid + " " + messageid
 
 431         if item.has_key("guid") and item["guid"] != item["link"]:
 
 432             data = urllib.urlencode(( \
 
 433                 ("message-id", messageid), \
 
 434                 ("created", createddate), \
 
 435                 ("contentmd5", md5sum) \
 
 437             db[url + "|" + item["guid"]] = data
 
 439                 data = db[url + "|" + item["link"]]
 
 440                 data = cgi.parse_qs(data)
 
 441                 newdata = urllib.urlencode(( \
 
 442                     ("message-id", messageid), \
 
 443                     ("created", data["created"][0]), \
 
 444                     ("contentmd5", data["contentmd5"][0]) \
 
 446                 db[url + "|" + item["link"]] = newdata
 
 448                 db[url + "|" + item["link"]] = data
 
 450             data = urllib.urlencode(( \
 
 451                 ("message-id", messageid), \
 
 452                 ("created", createddate), \
 
 453                 ("contentmd5", md5sum) \
 
 455             db[url + "|" + item["link"]] = data
 
             # Collect the cache-validation headers of the response and
             # URL-encode them.
 459         for header in headers:
 
 460             if header[0] in ["content-md5", "etag", "last-modified", "content-length"]:
 
 461                 data.append((header[0], header[1]))
 
 463             data = urllib.urlencode(data)
 
 469 if __name__ == "__main__":
 
 470     # This only gets executed if we really called the program
 
 471     # first off, parse the command line arguments
 
 473     oparser = OptionParser()
 
 475         "-c", "--conf", dest="conf",
 
 476         help="location of config file"
 
 479         "-s", "--statedir", dest="statedir",
 
 480         help="location of directory to store state in"
 
 483     (options, args) = oparser.parse_args()
 
 485     # check for the configfile
 
         # An explicit --conf path is used if given; otherwise
         # $HOME/.rss2maildir.conf and then /etc/rss2maildir.conf are tried.
 489     if options.conf != None:
 
 490         # does the file exist?
 
 492             os.stat(options.conf)
 
 493             configfile = options.conf
 
 495             # should exit here as the specified file doesn't exist
 
 497                 "Config file %s does not exist. Exiting.\n" %(options.conf,))
 
 500         # check through the default locations
 
 502             os.stat("%s/.rss2maildir.conf" %(os.environ["HOME"],))
 
 503             configfile = "%s/.rss2maildir.conf" %(os.environ["HOME"],)
 
 506                 os.stat("/etc/rss2maildir.conf")
 
 507                 configfile = "/etc/rss2maildir.conf"
 
 509                 sys.stderr.write("No config file found. Exiting.\n")
 
 512     # Right - if we've got this far, we've got a config file, now for the hard
 
 515     scp = SafeConfigParser()
 
         # Default Maildir root; replaced below when the [general] section
         # defines maildir_root.
 518     maildir_root = "RSSMaildir"
 
         # State directory: the --statedir option is checked first, then
         # state_dir in the [general] section of the config.
 521     if options.statedir != None:
 
 522         state_dir = options.statedir
 
 524             mode = os.stat(state_dir)[stat.ST_MODE]
 
 525             if not stat.S_ISDIR(mode):
 
 527                     "State directory (%s) is not a directory\n" %(state_dir))
 
 530             # try to make the directory
 
 534                 sys.stderr.write("Couldn't create statedir %s" %(state_dir))
 
 536     elif scp.has_option("general", "state_dir"):
 
 537         new_state_dir = scp.get("general", "state_dir")
 
                 # NOTE(review): this stats state_dir although the configured
                 # value was just read into new_state_dir - confirm which
                 # directory is meant to be checked here.
 539             mode = os.stat(state_dir)[stat.ST_MODE]
 
 540             if not stat.S_ISDIR(mode):
 
 542                     "State directory (%s) is not a directory\n" %(state_dir))
 
 547                 os.mkdir(new_state_dir)
 
 548                 state_dir = new_state_dir
 
 551                     "Couldn't create state directory %s\n" %(new_state_dir))
 
 555             mode = os.stat(state_dir)[stat.ST_MODE]
 
 556             if not stat.S_ISDIR(mode):
 
 558                     "State directory %s is not a directory\n" %(state_dir))
 
 565                     "State directory %s could not be created\n" %(state_dir))
 
 568     if scp.has_option("general", "maildir_root"):
 
 569         maildir_root = scp.get("general", "maildir_root")
 
 572         mode = os.stat(maildir_root)[stat.ST_MODE]
 
 573         if not stat.S_ISDIR(mode):
 
 575                 "Maildir Root %s is not a directory\n" \
 
 580             os.mkdir(maildir_root)
 
 582             sys.stderr.write("Couldn't create Maildir Root %s\n" \
 
         # Every config section other than "general" is treated as a feed.
 586     feeds = scp.sections()
 
 588         feeds.remove("general")
 
 592     for section in feeds:
 
 593         # check if the directory exists
 
 596             maildir = scp.get(section, "maildir")
 
             # urlencode() yields "key=value"; only the encoded value is kept and
             # used as the directory name under maildir_root.
 600         maildir = urllib.urlencode(((section, maildir),)).split("=")[1]
 
 601         maildir = os.path.join(maildir_root, maildir)
 
 604             exists = os.stat(maildir)
 
 605             if stat.S_ISDIR(exists[stat.ST_MODE]):
 
 606                 # check if there's a new, cur and tmp directory
 
                     # NOTE(review): for "cur" and "tmp" the S_ISDIR check
                     # follows os.mkdir(), while for "new" it follows os.stat()
                     # - confirm which branch each check belongs to.
 608                     mode = os.stat(os.path.join(maildir, "cur"))[stat.ST_MODE]
 
 610                     os.mkdir(os.path.join(maildir, "cur"))
 
 611                     if not stat.S_ISDIR(mode):
 
 612                         sys.stderr.write("Broken maildir: %s\n" %(maildir))
 
 614                     mode = os.stat(os.path.join(maildir, "tmp"))[stat.ST_MODE]
 
 616                     os.mkdir(os.path.join(maildir, "tmp"))
 
 617                     if not stat.S_ISDIR(mode):
 
 618                         sys.stderr.write("Broken maildir: %s\n" %(maildir))
 
 620                     mode = os.stat(os.path.join(maildir, "new"))[stat.ST_MODE]
 
 621                     if not stat.S_ISDIR(mode):
 
 622                         sys.stderr.write("Broken maildir: %s\n" %(maildir))
 
 624                     os.mkdir(os.path.join(maildir, "new"))
 
 626                 sys.stderr.write("Broken maildir: %s\n" %(maildir))
 
 631                 sys.stderr.write("Couldn't create root maildir %s\n" \
 
 635                 os.mkdir(os.path.join(maildir, "new"))
 
 636                 os.mkdir(os.path.join(maildir, "cur"))
 
 637                 os.mkdir(os.path.join(maildir, "tmp"))
 
 640                     "Couldn't create required maildir directories for %s\n" \
 
 644         # right - we've got the directories, we've got the section, we know the
 
             # The section name is passed as the feed URL.
 647         parse_and_deliver(maildir, section, state_dir)
 
 622                         sys.stderr.write("Broken maildir: %s\n" %(maildir))
 
 624                     os.mkdir(os.path.join(maildir, "new"))
 
 626                 sys.stderr.write("Broken maildir: %s\n" %(maildir))
 
 631                 sys.stderr.write("Couldn't create root maildir %s\n" \
 
 635                 os.mkdir(os.path.join(maildir, "new"))
 
 636                 os.mkdir(os.path.join(maildir, "cur"))
 
 637                 os.mkdir(os.path.join(maildir, "tmp"))
 
 640                     "Couldn't create required maildir directories for %s\n" \
 
 644         # right - we've got the directories, we've got the section, we know the
 
 647         parse_and_deliver(maildir, section, state_dir)