4 # rss2maildir.py - deliver RSS feeds to a Maildir, one email per item
 
   5 # Copyright (C) 2007  Brett Parker <iDunno@sommitrealweird.co.uk>
 
   7 # This program is free software: you can redistribute it and/or modify
 
   8 # it under the terms of the GNU General Public License as published by
 
   9 # the Free Software Foundation, either version 3 of the License, or
 
  10 # (at your option) any later version.
 
  12 # This program is distributed in the hope that it will be useful,
 
  13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
 
  14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 
  15 # GNU General Public License for more details.
 
  17 # You should have received a copy of the GNU General Public License
 
  18 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
 
  28 from email.MIMEMultipart import MIMEMultipart
 
  29 from email.MIMEText import MIMEText
 
  38 from optparse import OptionParser
 
  39 from ConfigParser import SafeConfigParser
 
  41 from base64 import b64encode
 
  47 from HTMLParser import HTMLParser
 
  60 class HTML2Text(HTMLParser):
 
  63         self.inheadingone = False
 
  64         self.inheadingtwo = False
 
  65         self.inotherheading = False
 
  66         self.inparagraph = True
 
  67         self.inblockquote = False
 
  70         self.currentparagraph = u''
 
  71         self.headingtext = u''
 
  77         HTMLParser.__init__(self)
 
  79     def handle_starttag(self, tag, attrs):
 
  80         if tag.lower() == "h1":
 
  81             self.inheadingone = True
 
  82             self.inparagraph = False
 
  83         elif tag.lower() == "h2":
 
  84             self.inheadingtwo = True
 
  85             self.inparagraph = False
 
  86         elif tag.lower() in ["h3", "h4", "h5", "h6"]:
 
  87             self.inotherheading = True
 
  88             self.inparagraph = False
 
  89         elif tag.lower() == "a":
 
  91         elif tag.lower() == "br":
 
  93         elif tag.lower() == "blockquote":
 
  94             self.inblockquote = True
 
  95             self.text = self.text + u'\n'
 
  96         elif tag.lower() == "p":
 
  98                 self.text = self.text + u'\n\n'
 
 100                 self.text = self.text \
 
 101                     + u'\n'.join(textwrap.wrap(self.currentparagraph, 70))
 
 102             self.currentparagraph = u''
 
 103             self.inparagraph = True
 
 104         elif tag.lower() == "pre":
 
 105             self.text = self.text + "\n"
 
 107             self.inparagraph = False
 
 108             self.inblockquote = False
 
 109         elif tag.lower() == "ul":
 
 112             self.text = self.text + "\n"
 
 113         elif tag.lower() == "li" and self.inul:
 
 118                 self.text = self.text \
 
 120                     + u'\n   '.join([a.strip() for a in \
 
 121                         textwrap.wrap(self.item, 67)]) \
 
 125     def handle_startendtag(self, tag, attrs):
 
 126         if tag.lower() == "br":
 
 131                 self.text = self.text \
 
 134                         for a in textwrap.wrap( \
 
 135                             self.currentparagraph, 70) \
 
 139                 self.currentparagraph = u''
 
 140             elif self.inblockquote:
 
 141                 self.text = self.text \
 
 145                             for a in textwrap.wrap( \
 
 146                                 self.blockquote.encode("utf-8") \
 
 151                 self.blockquote = u''
 
 153                 self.text = self.text + "\n"
 
 155     def handle_endtag(self, tag):
 
 156         if tag.lower() == "h1":
 
 157             self.inheadingone = False
 
 158             self.text = self.text \
 
 160                 + self.headingtext.encode("utf-8") \
 
 162                 + u'=' * len(self.headingtext.encode("utf-8").strip())
 
 163             self.headingtext = u''
 
 164         elif tag.lower() == "h2":
 
 165             self.inheadingtwo = False
 
 166             self.text = self.text \
 
 168                 + self.headingtext.encode("utf-8") \
 
 170                 + u'-' * len(self.headingtext.encode("utf-8").strip())
 
 171             self.headingtext = u''
 
 172         elif tag.lower() in ["h3", "h4", "h5", "h6"]:
 
 173             self.inotherheading = False
 
 174             self.text = self.text \
 
 176                 + self.headingtext.encode("utf-8") \
 
 178                 + u'~' * len(self.headingtext.encode("utf-8").strip())
 
 179             self.headingtext = u''
 
 180         elif tag.lower() == "p":
 
 181             self.text = self.text \
 
 182                 + u'\n'.join(textwrap.wrap( \
 
 183                     self.currentparagraph, 70) \
 
 185             self.inparagraph = False
 
 186             self.currentparagraph = u''
 
 187         elif tag.lower() == "blockquote":
 
 188             self.text = self.text \
 
 192                         for a in textwrap.wrap( \
 
 193                             self.blockquote, 68)] \
 
 196             self.inblockquote = False
 
 197             self.blockquote = u''
 
 198         elif tag.lower() == "pre":
 
 200         elif tag.lower() == "li":
 
 203                 self.text = self.text \
 
 206                         [a.strip() for a in textwrap.wrap(self.item, 67)]) \
 
 209         elif tag.lower() == "ul":
 
 212     def handle_data(self, data):
 
 213         if self.inheadingone or self.inheadingtwo or self.inotherheading:
 
 214             self.headingtext = self.headingtext \
 
 215                 + unicode(data, "utf-8").strip() \
 
 217         elif self.inblockquote:
 
 218             self.blockquote = self.blockquote \
 
 219                 + unicode(data, "utf-8").strip() \
 
 221         elif self.inparagraph:
 
 222             self.currentparagraph = self.currentparagraph \
 
 223                 + unicode(data, "utf-8").strip() \
 
 225         elif self.inul and self.initem:
 
 226             self.item = self.item + unicode(data, "utf-8")
 
 228             self.text = self.text + unicode(data, "utf-8")
 
 230             self.text = self.text + unicode(data, "utf-8").strip() + u' '
 
 232     def handle_entityref(self, name):
 
 234         if entities.has_key(name.lower()):
 
 235             entity = entities[name.lower()]
 
 237             entity = unichr(int(name[1:]))
 
 239             entity = "&" + name + ";"
 
 242             self.currentparagraph = self.currentparagraph \
 
 243                 + unicode(entity, "utf-8")
 
 244         elif self.inblockquote:
 
 245             self.blockquote = self.blockquote + unicode(entity, "utf-8")
 
 247             self.text = self.text + unicode(entity, "utf-8")
 
 252             data = data + "\n".join(textwrap.wrap(self.currentparagraph, 70))
 
 255 def open_url(method, url):
 
 257     while redirectcount < 3:
 
 258         (type, rest) = urllib.splittype(url)
 
 259         (host, path) = urllib.splithost(rest)
 
 260         (host, port) = urllib.splitport(host)
 
 264             conn = httplib.HTTPConnection("%s:%s" %(host, port))
 
 265             conn.request(method, path)
 
 266             response = conn.getresponse()
 
 267             if response.status in [301, 302, 303, 307]:
 
 268                 headers = response.getheaders()
 
 269                 for header in headers:
 
 270                     if header[0] == "location":
 
 272             elif response.status == 200:
 
 276         redirectcount = redirectcount + 1
 
 279 def parse_and_deliver(maildir, url, statedir):
 
 282     # first check if we know about this feed already
 
 283     feeddb = dbm.open(os.path.join(statedir, "feeds"), "c")
 
 284     if feeddb.has_key(url):
 
 286         data = cgi.parse_qs(data)
 
 287         response = open_url("HEAD", url)
 
 290             headers = response.getheaders()
 
 293             for header in headers:
 
 294                 if header[0] == "content-length":
 
 295                     if header[1] != data["content-length"][0]:
 
 297                 elif header[0] == "etag":
 
 298                     if header[1] != data["etag"][0]:
 
 300                 elif header[0] == "last-modified":
 
 301                     if header[1] != data["last-modified"][0]:
 
 303                 elif header[0] == "content-md5":
 
 304                     if header[1] != data["content-md5"][0]:
 
 309             response = open_url("GET", url)
 
 311                 headers = response.getheaders()
 
 312                 feedhandle = response
 
 314                 sys.stderr.write("Failed to fetch feed: %s\n" %(url))
 
 317             return # don't need to do anything, nothing's changed.
 
 319         response = open_url("GET", url)
 
 321             headers = response.getheaders()
 
 322             feedhandle = response
 
 324             sys.stderr.write("Failed to fetch feed: %s\n" %(url))
 
 327     fp = feedparser.parse(feedhandle)
 
 328     db = dbm.open(os.path.join(statedir, "seen"), "c")
 
 329     for item in fp["items"]:
 
 330         # have we seen it before?
 
 331         # need to work out what the content is first...
 
 333         if item.has_key("content"):
 
 334             content = item["content"][0]["value"]
 
 336             content = item["summary"]
 
 338         md5sum = md5.md5(content.encode("utf-8")).hexdigest()
 
 342         # check if there's a guid too - if that exists and we match the md5,
 
 344         if item.has_key("guid"):
 
 345             if db.has_key(url + "|" + item["guid"]):
 
 346                 data = db[url + "|" + item["guid"]]
 
 347                 data = cgi.parse_qs(data)
 
 348                 if data["contentmd5"][0] == md5sum:
 
 351         if db.has_key(url + "|" + item["link"]):
 
 352             data = db[url + "|" + item["link"]]
 
 353             data = cgi.parse_qs(data)
 
 354             if data.has_key("message-id"):
 
 355                 prevmessageid = data["message-id"][0]
 
 356             if data["contentmd5"][0] == md5sum:
 
 360             author = item["author"]
 
 364         # create a basic email message
 
 365         msg = MIMEMultipart("alternative")
 
 367             + datetime.datetime.now().strftime("%Y%m%d%H%M") \
 
 371                     string.ascii_letters + string.digits \
 
 372                     ) for a in range(0,6) \
 
 373                 ]) + "@" + socket.gethostname() + ">"
 
 374         msg.add_header("Message-ID", messageid)
 
 375         msg.set_unixfrom("\"%s\" <rss2maildir@localhost>" %(url))
 
 376         msg.add_header("From", "\"%s\" <rss2maildir@localhost>" %(author))
 
 377         msg.add_header("To", "\"%s\" <rss2maildir@localhost>" %(url))
 
 379             msg.add_header("References", prevmessageid)
 
 380         createddate = datetime.datetime.now() \
 
 381             .strftime("%a, %e %b %Y %T -0000")
 
 383             createddate = datetime.datetime(*item["updated_parsed"][0:6]) \
 
 384                 .strftime("%a, %e %b %Y %T -0000")
 
 387         msg.add_header("Date", createddate)
 
 388         msg.add_header("Subject", item["title"])
 
 389         msg.set_default_type("text/plain")
 
 391         htmlpart = MIMEText(content.encode("utf-8"), "html", "utf-8")
 
 392         textparser = HTML2Text()
 
 393         textparser.feed(content.encode("utf-8"))
 
 394         textcontent = textparser.gettext()
 
 395         textpart = MIMEText(textcontent.encode("utf-8"), "plain", "utf-8")
 
 399         # start by working out the filename we should be writing to, we do
 
 400         # this following the normal maildir style rules
 
 401         fname = str(os.getpid()) \
 
 402             + "." + socket.gethostname() \
 
 405                     string.ascii_letters + string.digits \
 
 406                     ) for a in range(0,10) \
 
 408             + datetime.datetime.now().strftime('%s')
 
 409         fn = os.path.join(maildir, "tmp", fname)
 
 411         fh.write(msg.as_string())
 
 413         # now move it into the new directory
 
 414         newfn = os.path.join(maildir, "new", fname)
 
 418         # now add to the database about the item
 
 420             messageid = prevmessageid + " " + messageid
 
 421         if item.has_key("guid") and item["guid"] != item["link"]:
 
 422             data = urllib.urlencode(( \
 
 423                 ("message-id", messageid), \
 
 424                 ("created", createddate), \
 
 425                 ("contentmd5", md5sum) \
 
 427             db[url + "|" + item["guid"]] = data
 
 429                 data = db[url + "|" + item["link"]]
 
 430                 data = cgi.parse_qs(data)
 
 431                 newdata = urllib.urlencode(( \
 
 432                     ("message-id", messageid), \
 
 433                     ("created", data["created"][0]), \
 
 434                     ("contentmd5", data["contentmd5"][0]) \
 
 436                 db[url + "|" + item["link"]] = newdata
 
 438                 db[url + "|" + item["link"]] = data
 
 440             data = urllib.urlencode(( \
 
 441                 ("message-id", messageid), \
 
 442                 ("created", createddate), \
 
 443                 ("contentmd5", md5sum) \
 
 445             db[url + "|" + item["link"]] = data
 
 449         for header in headers:
 
 450             if header[0] in ["content-md5", "etag", "last-modified", "content-length"]:
 
 451                 data.append((header[0], header[1]))
 
 453             data = urllib.urlencode(data)
 
 459 # first off, parse the command line arguments
 
 461 oparser = OptionParser()
 
 463     "-c", "--conf", dest="conf",
 
 464     help="location of config file"
 
 467     "-s", "--statedir", dest="statedir",
 
 468     help="location of directory to store state in"
 
 471 (options, args) = oparser.parse_args()
 
 473 # check for the configfile
 
 477 if options.conf != None:
 
 478     # does the file exist?
 
 480         os.stat(options.conf)
 
 481         configfile = options.conf
 
 483         # should exit here as the specified file doesn't exist
 
 485             "Config file %s does not exist. Exiting.\n" %(options.conf,))
 
 488     # check through the default locations
 
 490         os.stat("%s/.rss2maildir.conf" %(os.environ["HOME"],))
 
 491         configfile = "%s/.rss2maildir.conf" %(os.environ["HOME"],)
 
 494             os.stat("/etc/rss2maildir.conf")
 
 495             configfile = "/etc/rss2maildir.conf"
 
 497             sys.stderr.write("No config file found. Exiting.\n")
 
 500 # Right - if we've got this far, we've got a config file, now for the hard
 
 503 scp = SafeConfigParser()
 
 506 maildir_root = "RSSMaildir"
 
 509 if options.statedir != None:
 
 510     state_dir = options.statedir
 
 512         mode = os.stat(state_dir)[stat.ST_MODE]
 
 513         if not stat.S_ISDIR(mode):
 
 515                 "State directory (%s) is not a directory\n" %(state_dir))
 
 518         # try to make the directory
 
 522             sys.stderr.write("Couldn't create statedir %s" %(state_dir))
 
 524 elif scp.has_option("general", "state_dir"):
 
 525     new_state_dir = scp.get("general", "state_dir")
 
 527         mode = os.stat(state_dir)[stat.ST_MODE]
 
 528         if not stat.S_ISDIR(mode):
 
 530                 "State directory (%s) is not a directory\n" %(state_dir))
 
 535             os.mkdir(new_state_dir)
 
 536             state_dir = new_state_dir
 
 539                 "Couldn't create state directory %s\n" %(new_state_dir))
 
 543         mode = os.stat(state_dir)[stat.ST_MODE]
 
 544         if not stat.S_ISDIR(mode):
 
 546                 "State directory %s is not a directory\n" %(state_dir))
 
 553                 "State directory %s could not be created\n" %(state_dir))
 
 556 if scp.has_option("general", "maildir_root"):
 
 557     maildir_root = scp.get("general", "maildir_root")
 
 560     mode = os.stat(maildir_root)[stat.ST_MODE]
 
 561     if not stat.S_ISDIR(mode):
 
 563             "Maildir Root %s is not a directory\n" \
 
 568         os.mkdir(maildir_root)
 
 570         sys.stderr.write("Couldn't create Maildir Root %s\n" %(maildir_root))
 
 573 feeds = scp.sections()
 
 575     feeds.remove("general")
 
 579 for section in feeds:
 
 580     # check if the directory exists
 
 583         maildir = scp.get(section, "maildir")
 
 587     maildir = urllib.urlencode(((section, maildir),)).split("=")[1]
 
 588     maildir = os.path.join(maildir_root, maildir)
 
 591         exists = os.stat(maildir)
 
 592         if stat.S_ISDIR(exists[stat.ST_MODE]):
 
 593             # check if there's a new, cur and tmp directory
 
 595                 mode = os.stat(os.path.join(maildir, "cur"))[stat.ST_MODE]
 
 597                 os.mkdir(os.path.join(maildir, "cur"))
 
 598                 if not stat.S_ISDIR(mode):
 
 599                     sys.stderr.write("Broken maildir: %s\n" %(maildir))
 
 601                 mode = os.stat(os.path.join(maildir, "tmp"))[stat.ST_MODE]
 
 603                 os.mkdir(os.path.join(maildir, "tmp"))
 
 604                 if not stat.S_ISDIR(mode):
 
 605                     sys.stderr.write("Broken maildir: %s\n" %(maildir))
 
 607                 mode = os.stat(os.path.join(maildir, "new"))[stat.ST_MODE]
 
 608                 if not stat.S_ISDIR(mode):
 
 609                     sys.stderr.write("Broken maildir: %s\n" %(maildir))
 
 611                 os.mkdir(os.path.join(maildir, "new"))
 
 613             sys.stderr.write("Broken maildir: %s\n" %(maildir))
 
 618             sys.stderr.write("Couldn't create root maildir %s\n" %(maildir))
 
 621             os.mkdir(os.path.join(maildir, "new"))
 
 622             os.mkdir(os.path.join(maildir, "cur"))
 
 623             os.mkdir(os.path.join(maildir, "tmp"))
 
 626                 "Couldn't create required maildir directories for %s\n" \
 
 630     # right - we've got the directories, we've got the section, we know the
 
 633     parse_and_deliver(maildir, section, state_dir)