rss2maildir.py

   1 #!/usr/bin/python
   2 # coding=utf-8
   3
   4 # rss2maildir.py - RSS feeds to Maildir 1 email per item
   5 # Copyright (C) 2007  Brett Parker <iDunno@sommitrealweird.co.uk>
   6 #
   7 # This program is free software: you can redistribute it and/or modify
   8 # it under the terms of the GNU General Public License as published by
   9 # the Free Software Foundation, either version 3 of the License, or
  10 # (at your option) any later version.
  11 #
  12 # This program is distributed in the hope that it will be useful,
  13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15 # GNU General Public License for more details.
  16 #
  17 # You should have received a copy of the GNU General Public License
  18 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
  19
  20 import sys
  21 import os
  22 import stat
  23 import urllib
  24
  25 import feedparser
  26
  27 from email.MIMEMultipart import MIMEMultipart
  28 from email.MIMEText import MIMEText
  29
  30 import datetime
  31 import random
  32 import string
  33 import textwrap
  34
  35 import socket
  36
  37 from optparse import OptionParser
  38 from ConfigParser import SafeConfigParser
  39
  40 from base64 import b64encode
  41 import md5
  42
  43 import cgi
  44 import dbm
  45
  46 from HTMLParser import HTMLParser
  47
  48 entities = {
  49     "amp": "&",
  50     "lt": "<",
  51     "gt": ">",
  52     "pound": "£",
  53     "copy": "©",
  54     "apos": "'",
  55     "quote": "\"",
  56     "nbsp": " ",
  57     }
  58
  59 class HTML2Text(HTMLParser):
  60
  61     def __init__(self):
  62         self.inheadingone = False
  63         self.inheadingtwo = False
  64         self.inotherheading = False
  65         self.inparagraph = True
  66         self.inblockquote = False
  67         self.inlink = False
  68         self.text = u''
  69         self.currentparagraph = u''
  70         self.headingtext = u''
  71         self.blockquote = u''
  72         self.inpre = False
  73         self.inul = False
  74         self.initem = False
  75         self.item = u''
  76         HTMLParser.__init__(self)
  77
  78     def handle_starttag(self, tag, attrs):
  79         if tag.lower() == "h1":
  80             self.inheadingone = True
  81             self.inparagraph = False
  82         elif tag.lower() == "h2":
  83             self.inheadingtwo = True
  84             self.inparagraph = False
  85         elif tag.lower() in ["h3", "h4", "h5", "h6"]:
  86             self.inotherheading = True
  87             self.inparagraph = False
  88         elif tag.lower() == "a":
  89             self.inlink = True
  90         elif tag.lower() == "br":
  91             if self.inparagraph:
  92                 self.text = self.text \
  93                     + u'\n'.join( \
  94                         textwrap.wrap(self.currentparagraph, 70)) \
  95                     + u'\n'
  96                 self.currentparagraph = ""
  97             elif self.inblockquote:
  98                 self.text = self.text \
  99                     + u'\n> ' \
 100                     + u'\n> '.join( \
 101                         [a.strip() \
 102                             for a in textwrap.wrap(self.blockquote, 68) \
 103                         ]) \
 104                     + u'\n'
 105                 self.blockquote = u''
 106             else:
 107                 self.text = self.text + u'\n'
 108         elif tag.lower() == "blockquote":
 109             self.inblockquote = True
 110             self.text = self.text + u'\n'
 111         elif tag.lower() == "p":
 112             if self.text != "":
 113                 self.text = self.text + u'\n\n'
 114             if self.inparagraph:
 115                 self.text = self.text \
 116                     + u'\n'.join(textwrap.wrap(self.currentparagraph, 70))
 117             self.currentparagraph = u''
 118             self.inparagraph = True
 119         elif tag.lower() == "pre":
 120             self.text = self.text + "\n"
 121             self.inpre = True
 122             self.inparagraph = False
 123             self.inblockquote = False
 124         elif tag.lower() == "ul":
 125             self.item = u''
 126             self.inul = True
 127             self.text = self.text + "\n"
 128         elif tag.lower() == "li" and self.inul:
 129             if not self.initem:
 130                 self.initem = True
 131                 self.item = u''
 132             else:
 133                 self.text = self.text \
 134                     + u' * ' \
 135                     + u'\n   '.join([a.strip() for a in \
 136                         textwrap.wrap(self.item, 67)]) \
 137                     + u'\n'
 138                 self.item = u''
 139
 140     def handle_startendtag(self, tag, attrs):
 141         if tag.lower() == "br":
 142             if self.inparagraph:
 143                 self.text = self.text \
 144                 + u'\n'.join( \
 145                     [a \
 146                         for a in textwrap.wrap( \
 147                             self.currentparagraph, 70) \
 148                     ] \
 149                 ) \
 150                 + u'\n'
 151                 self.currentparagraph = u''
 152             elif self.inblockquote:
 153                 self.text = self.text \
 154                     + u'\n> ' \
 155                     + u'\n> '.join( \
 156                         [a \
 157                             for a in textwrap.wrap( \
 158                                 self.blockquote.encode("utf-8") \
 159                                 , 68) \
 160                         ] \
 161                     ) \
 162                     + u'\n'
 163                 self.blockquote = u''
 164             else:
 165                 self.text = self.text + "\n"
 166
 167     def handle_endtag(self, tag):
 168         if tag.lower() == "h1":
 169             self.inheadingone = False
 170             self.text = self.text \
 171                 + u'\n\n' \
 172                 + self.headingtext.encode("utf-8") \
 173                 + u'\n' \
 174                 + u'=' * len(self.headingtext.encode("utf-8").strip())
 175             self.headingtext = u''
 176         elif tag.lower() == "h2":
 177             self.inheadingtwo = False
 178             self.text = self.text \
 179                 + u'\n\n' \
 180                 + self.headingtext.encode("utf-8") \
 181                 + u'\n' \
 182                 + u'-' * len(self.headingtext.encode("utf-8").strip())
 183             self.headingtext = u''
 184         elif tag.lower() in ["h3", "h4", "h5", "h6"]:
 185             self.inotherheading = False
 186             self.text = self.text \
 187                 + u'\n\n' \
 188                 + self.headingtext.encode("utf-8") \
 189                 + u'\n' \
 190                 + u'~' * len(self.headingtext.encode("utf-8").strip())
 191             self.headingtext = u''
 192         elif tag.lower() == "p":
 193             self.text = self.text \
 194                 + u'\n'.join(textwrap.wrap( \
 195                     self.currentparagraph, 70) \
 196                 )
 197             self.inparagraph = False
 198             self.currentparagraph = u''
 199         elif tag.lower() == "blockquote":
 200             self.text = self.text \
 201                 + u'\n> ' \
 202                 + u'\n> '.join( \
 203                     [a.strip() \
 204                         for a in textwrap.wrap( \
 205                             self.blockquote, 68)] \
 206                     ) \
 207                 + u'\n'
 208             self.inblockquote = False
 209             self.blockquote = u''
 210         elif tag.lower() == "pre":
 211             self.inpre = False
 212         elif tag.lower() == "li":
 213             self.initem = False
 214             if self.item != "":
 215                 self.text = self.text \
 216                     + u' * ' \
 217                     + u'\n   '.join( \
 218                         [a.strip() for a in textwrap.wrap(self.item, 67)]) \
 219                     + u'\n'
 220             self.item = u''
 221         elif tag.lower() == "ul":
 222             self.inul = False
 223
 224     def handle_data(self, data):
 225         if self.inheadingone or self.inheadingtwo or self.inotherheading:
 226             self.headingtext = self.headingtext \
 227                 + unicode(data, "utf-8").strip() \
 228                 + u' '
 229         elif self.inblockquote:
 230             self.blockquote = self.blockquote \
 231                 + unicode(data, "utf-8").strip() \
 232                 + u' '
 233         elif self.inparagraph:
 234             self.currentparagraph = self.currentparagraph \
 235                 + unicode(data, "utf-8").strip() \
 236                 + u' '
 237         elif self.inul and self.initem:
 238             self.item = self.item + unicode(data, "utf-8")
 239         elif self.inpre:
 240             self.text = self.text + unicode(data, "utf-8")
 241         else:
 242             self.text = self.text + unicode(data, "utf-8").strip() + u' '
 243
 244     def handle_entityref(self, name):
 245         entity = name
 246         if entities.has_key(name.lower()):
 247             entity = entities[name.lower()]
 248         elif name[0] == "#":
 249             entity = unichr(int(name[1:]))
 250         else:
 251             entity = "&" + name + ";"
 252
 253         if self.inparagraph:
 254             self.currentparagraph = self.currentparagraph \
 255                 + unicode(entity, "utf-8")
 256         elif self.inblockquote:
 257             self.blockquote = self.blockquote + unicode(entity, "utf-8")
 258         else:
 259             self.text = self.text + unicode(entity, "utf-8")
 260
 261     def gettext(self):
 262         data = self.text
 263         if self.inparagraph:
 264             data = data + "\n".join(textwrap.wrap(self.currentparagraph, 70))
 265         return data
 266
 267 def parse_and_deliver(maildir, url, statedir):
 268     fp = feedparser.parse(url)
 269     db = dbm.open(os.path.join(statedir, "seen"), "c")
 270     for item in fp["items"]:
 271         # have we seen it before?
 272         # need to work out what the content is first...
 273
 274         if item.has_key("content"):
 275             content = item["content"][0]["value"]
 276         else:
 277             content = item["summary"]
 278
 279         md5sum = md5.md5(content.encode("utf-8")).hexdigest()
 280
 281         prevmessageid = None
 282
 283         if db.has_key(url + "|" + item["link"]):
 284             data = db[url + "|" + item["link"]]
 285             data = cgi.parse_qs(data)
 286             if data.has_key("message-id"):
 287                 prevmessageid = data["message-id"][0]
 288             if data["contentmd5"][0] == md5sum:
 289                 continue
 290
 291         try:
 292             author = item["author"]
 293         except:
 294             author = url
 295
 296         # create a basic email message
 297         msg = MIMEMultipart("alternative")
 298         messageid = "<" \
 299             + datetime.datetime.now().strftime("%Y%m%d%H%M") \
 300             + "." \
 301             + "".join( \
 302                 [random.choice( \
 303                     string.ascii_letters + string.digits \
 304                     ) for a in range(0,6) \
 305                 ]) + "@" + socket.gethostname() + ">"
 306         msg.add_header("Message-ID", messageid)
 307         msg.set_unixfrom("\"%s\" <rss2maildir@localhost>" %(url))
 308         msg.add_header("From", "\"%s\" <rss2maildir@localhost>" %(author))
 309         msg.add_header("To", "\"%s\" <rss2maildir@localhost>" %(url))
 310         if prevmessageid:
 311             msg.add_header("References", prevmessageid)
 312         createddate = datetime.datetime(*item["updated_parsed"][0:6]) \
 313             .strftime("%a, %e %b %Y %T -0000")
 314         msg.add_header("Date", createddate)
 315         msg.add_header("Subject", item["title"])
 316         msg.set_default_type("text/plain")
 317
 318         htmlpart = MIMEText(content.encode("utf-8"), "html", "utf-8")
 319         textparser = HTML2Text()
 320         textparser.feed(content.encode("utf-8"))
 321         textcontent = textparser.gettext()
 322         textpart = MIMEText(textcontent.encode("utf-8"), "plain", "utf-8")
 323         msg.attach(textpart)
 324         msg.attach(htmlpart)
 325
 326         # start by working out the filename we should be writting to, we do
 327         # this following the normal maildir style rules
 328         fname = str(os.getpid()) \
 329             + "." + socket.gethostname() \
 330             + "." + "".join( \
 331                 [random.choice( \
 332                     string.ascii_letters + string.digits \
 333                     ) for a in range(0,10) \
 334                 ]) + "." \
 335             + datetime.datetime.now().strftime('%s')
 336         fn = os.path.join(maildir, "tmp", fname)
 337         fh = open(fn, "w")
 338         fh.write(msg.as_string())
 339         fh.close()
 340         # now move it in to the new directory
 341         newfn = os.path.join(maildir, "new", fname)
 342         os.link(fn, newfn)
 343         os.unlink(fn)
 344
 345         # now add to the database about the item
 346         if prevmessageid:
 347             messageid = prevmessageid + " " + messageid
 348         data = urllib.urlencode((
 349             ("message-id", messageid), \
 350             ("created", createddate), \
 351             ("contentmd5", md5sum) \
 352             ))
 353         db[url + "|" + item["link"]] = data
 354
 355     db.close()
 356
 357 # first off, parse the command line arguments
 358
 359 oparser = OptionParser()
 360 oparser.add_option(
 361     "-c", "--conf", dest="conf",
 362     help="location of config file"
 363     )
 364 oparser.add_option(
 365     "-s", "--statedir", dest="statedir",
 366     help="location of directory to store state in"
 367     )
 368
 369 (options, args) = oparser.parse_args()
 370
 371 # check for the configfile
 372
 373 configfile = None
 374
 375 if options.conf != None:
 376     # does the file exist?
 377     try:
 378         os.stat(options.conf)
 379         configfile = options.conf
 380     except:
 381         # should exit here as the specified file doesn't exist
 382         sys.stderr.write( \
 383             "Config file %s does not exist. Exiting.\n" %(options.conf,))
 384         sys.exit(2)
 385 else:
 386     # check through the default locations
 387     try:
 388         os.stat("%s/.rss2maildir.conf" %(os.environ["HOME"],))
 389         configfile = "%s/.rss2maildir.conf" %(os.environ["HOME"],)
 390     except:
 391         try:
 392             os.stat("/etc/rss2maildir.conf")
 393             configfile = "/etc/rss2maildir.conf"
 394         except:
 395             sys.stderr.write("No config file found. Exiting.\n")
 396             sys.exit(2)
 397
 398 # Right - if we've got this far, we've got a config file, now for the hard
 399 # bits...
 400
 401 scp = SafeConfigParser()
 402 scp.read(configfile)
 403
 404 maildir_root = "RSSMaildir"
 405 state_dir = "state"
 406
 407 if options.statedir != None:
 408     state_dir = options.statedir
 409     try:
 410         mode = os.stat(state_dir)[stat.ST_MODE]
 411         if not stat.S_ISDIR(mode):
 412             sys.stderr.write( \
 413                 "State directory (%s) is not a directory\n" %(state_dir))
 414             sys.exit(1)
 415     except:
 416         # try to make the directory
 417         try:
 418             os.mkdir(state_dir)
 419         except:
 420             sys.stderr.write("Couldn't create statedir %s" %(state_dir))
 421             sys.exit(1)
 422 elif scp.has_option("general", "state_dir"):
 423     new_state_dir = scp.get("general", "state_dir")
 424     try:
 425         mode = os.stat(state_dir)[stat.ST_MODE]
 426         if not stat.S_ISDIR(mode):
 427             sys.stderr.write( \
 428                 "State directory (%s) is not a directory\n" %(state_dir))
 429             sys.exit(1)
 430     except:
 431         # try to create it
 432         try:
 433             os.mkdir(new_state_dir)
 434             state_dir = new_state_dir
 435         except:
 436             sys.stderr.write( \
 437                 "Couldn't create state directory %s\n" %(new_state_dir))
 438             sys.exit(1)
 439 else:
 440     try:
 441         mode = os.stat(state_dir)[stat.ST_MODE]
 442         if not stat.S_ISDIR(mode):
 443             sys.stderr.write( \
 444                 "State directory %s is not a directory\n" %(state_dir))
 445             sys.exit(1)
 446     except:
 447         try:
 448             os.mkdir(state_dir)
 449         except:
 450             sys.stderr.write( \
 451                 "State directory %s could not be created\n" %(state_dir))
 452             sys.exit(1)
 453
 454 if scp.has_option("general", "maildir_root"):
 455     maildir_root = scp.get("general", "maildir_root")
 456
 457 try:
 458     mode = os.stat(maildir_root)[stat.ST_MODE]
 459     if not stat.S_ISDIR(mode):
 460         sys.stderr.write( \
 461             "Maildir Root %s is not a directory\n" \
 462             %(maildir_root))
 463         sys.exit(1)
 464 except:
 465     try:
 466         os.mkdir(maildir_root)
 467     except:
 468         sys.stderr.write("Couldn't create Maildir Root %s\n" %(maildir_root))
 469         sys.exit(1)
 470
 471 feeds = scp.sections()
 472 try:
 473     feeds.remove("general")
 474 except:
 475     pass
 476
 477 for section in feeds:
 478     # check if the directory exists
 479     maildir = None
 480     try:
 481         maildir = scp.get(section, "maildir")
 482     except:
 483         maildir = section
 484
 485     maildir = urllib.urlencode(((section, maildir),)).split("=")[1]
 486     maildir = os.path.join(maildir_root, maildir)
 487
 488     try:
 489         exists = os.stat(maildir)
 490         if stat.S_ISDIR(exists[stat.ST_MODE]):
 491             # check if there's a new, cur and tmp directory
 492             try:
 493                 mode = os.stat(os.path.join(maildir, "cur"))[stat.ST_MODE]
 494             except:
 495                 os.mkdir(os.path.join(maildir, "cur"))
 496                 if not stat.S_ISDIR(mode):
 497                     sys.stderr.write("Broken maildir: %s\n" %(maildir))
 498             try:
 499                 mode = os.stat(os.path.join(maildir, "tmp"))[stat.ST_MODE]
 500             except:
 501                 os.mkdir(os.path.join(maildir, "tmp"))
 502                 if not stat.S_ISDIR(mode):
 503                     sys.stderr.write("Broken maildir: %s\n" %(maildir))
 504             try:
 505                 mode = os.stat(os.path.join(maildir, "new"))[stat.ST_MODE]
 506                 if not stat.S_ISDIR(mode):
 507                     sys.stderr.write("Broken maildir: %s\n" %(maildir))
 508             except:
 509                 os.mkdir(os.path.join(maildir, "new"))
 510         else:
 511             sys.stderr.write("Broken maildir: %s\n" %(maildir))
 512     except:
 513         try:
 514             os.mkdir(maildir)
 515         except:
 516             sys.stderr.write("Couldn't create root maildir %s\n" %(maildir))
 517             sys.exit(1)
 518         try:
 519             os.mkdir(os.path.join(maildir, "new"))
 520             os.mkdir(os.path.join(maildir, "cur"))
 521             os.mkdir(os.path.join(maildir, "tmp"))
 522         except:
 523             sys.stderr.write( \
 524                 "Couldn't create required maildir directories for %s\n" \
 525                 %(section,))
 526             sys.exit(1)
 527
 528     # right - we've got the directories, we've got the section, we know the
 529     # url... lets play!
 530
 531     parse_and_deliver(maildir, section, state_dir)