rss2maildir.py

   1 #!/usr/bin/python
   2 # coding=utf-8
   3
   4 # rss2maildir.py - RSS feeds to Maildir 1 email per item
   5 # Copyright (C) 2007  Brett Parker <iDunno@sommitrealweird.co.uk>
   6 #
   7 # This program is free software: you can redistribute it and/or modify
   8 # it under the terms of the GNU General Public License as published by
   9 # the Free Software Foundation, either version 3 of the License, or
  10 # (at your option) any later version.
  11 #
  12 # This program is distributed in the hope that it will be useful,
  13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15 # GNU General Public License for more details.
  16 #
  17 # You should have received a copy of the GNU General Public License
  18 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
  19
  20 import sys
  21 import os
  22 import stat
  23 import urllib
  24
  25 import feedparser
  26
  27 from email.MIMEMultipart import MIMEMultipart
  28 from email.MIMEText import MIMEText
  29
  30 import datetime
  31 import random
  32 import string
  33 import textwrap
  34
  35 import socket
  36
  37 from optparse import OptionParser
  38 from ConfigParser import SafeConfigParser
  39
  40 from base64 import b64encode
  41 import md5
  42
  43 import cgi
  44 import dbm
  45
  46 from HTMLParser import HTMLParser
  47
  48 entities = {
  49     "amp": "&",
  50     "lt": "<",
  51     "gt": ">",
  52     "pound": "£",
  53     "copy": "©",
  54     "apos": "'",
  55     "quote": "\"",
  56     "nbsp": " ",
  57     }
  58
  59 class HTML2Text(HTMLParser):
  60
  61     def __init__(self):
  62         self.inheadingone = False
  63         self.inheadingtwo = False
  64         self.inotherheading = False
  65         self.inparagraph = True
  66         self.inblockquote = False
  67         self.inlink = False
  68         self.text = u''
  69         self.currentparagraph = u''
  70         self.headingtext = u''
  71         self.blockquote = u''
  72         self.inpre = False
  73         self.inul = False
  74         self.initem = False
  75         self.item = u''
  76         HTMLParser.__init__(self)
  77
  78     def handle_starttag(self, tag, attrs):
  79         if tag.lower() == "h1":
  80             self.inheadingone = True
  81             self.inparagraph = False
  82         elif tag.lower() == "h2":
  83             self.inheadingtwo = True
  84             self.inparagraph = False
  85         elif tag.lower() in ["h3", "h4", "h5", "h6"]:
  86             self.inotherheading = True
  87             self.inparagraph = False
  88         elif tag.lower() == "a":
  89             self.inlink = True
  90         elif tag.lower() == "br":
  91             if self.inparagraph:
  92                 self.text = self.text \
  93                     + u'\n'.join( \
  94                         textwrap.wrap(self.currentparagraph, 70)) \
  95                     + u'\n'
  96                 self.currentparagraph = ""
  97             elif self.inblockquote:
  98                 self.text = self.text \
  99                     + u'\n> ' \
 100                     + u'\n> '.join( \
 101                         [a.strip() \
 102                             for a in textwrap.wrap(self.blockquote, 68) \
 103                         ]) \
 104                     + u'\n'
 105                 self.blockquote = u''
 106             else:
 107                 self.text = self.text + u'\n'
 108         elif tag.lower() == "blockquote":
 109             self.inblockquote = True
 110             self.text = self.text + u'\n'
 111         elif tag.lower() == "p":
 112             if self.text != "":
 113                 self.text = self.text + u'\n\n'
 114             if self.inparagraph:
 115                 self.text = self.text \
 116                     + u'\n'.join(textwrap.wrap(self.currentparagraph, 70))
 117             self.currentparagraph = u''
 118             self.inparagraph = True
 119         elif tag.lower() == "pre":
 120             self.text = self.text + "\n"
 121             self.inpre = True
 122             self.inparagraph = False
 123             self.inblockquote = False
 124         elif tag.lower() == "ul":
 125             self.item = u''
 126             self.inul = True
 127             self.text = self.text + "\n"
 128         elif tag.lower() == "li" and self.inul:
 129             if not self.initem:
 130                 self.initem = True
 131                 self.item = u''
 132             else:
 133                 self.text = self.text \
 134                     + u' * ' \
 135                     + u'\n   '.join([a.strip() for a in \
 136                         textwrap.wrap(self.item, 67)]) \
 137                     + u'\n'
 138                 self.item = u''
 139
 140     def handle_startendtag(self, tag, attrs):
 141         if tag.lower() == "br":
 142             if self.inparagraph:
 143                 self.text = self.text \
 144                 + u'\n'.join( \
 145                     [a \
 146                         for a in textwrap.wrap( \
 147                             self.currentparagraph, 70) \
 148                     ] \
 149                 ) \
 150                 + u'\n'
 151                 self.currentparagraph = u''
 152             elif self.inblockquote:
 153                 self.text = self.text \
 154                     + u'\n> ' \
 155                     + u'\n> '.join( \
 156                         [a \
 157                             for a in textwrap.wrap( \
 158                                 self.blockquote.encode("utf-8") \
 159                                 , 68) \
 160                         ] \
 161                     ) \
 162                     + u'\n'
 163                 self.blockquote = u''
 164             else:
 165                 self.text = self.text + "\n"
 166
 167     def handle_endtag(self, tag):
 168         if tag.lower() == "h1":
 169             self.inheadingone = False
 170             self.text = self.text \
 171                 + u'\n\n' \
 172                 + self.headingtext.encode("utf-8") \
 173                 + u'\n' \
 174                 + u'=' * len(self.headingtext.encode("utf-8").strip())
 175             self.headingtext = u''
 176         elif tag.lower() == "h2":
 177             self.inheadingtwo = False
 178             self.text = self.text \
 179                 + u'\n\n' \
 180                 + self.headingtext.encode("utf-8") \
 181                 + u'\n' \
 182                 + u'-' * len(self.headingtext.encode("utf-8").strip())
 183             self.headingtext = u''
 184         elif tag.lower() in ["h3", "h4", "h5", "h6"]:
 185             self.inotherheading = False
 186             self.text = self.text \
 187                 + u'\n\n' \
 188                 + self.headingtext.encode("utf-8") \
 189                 + u'\n' \
 190                 + u'~' * len(self.headingtext.encode("utf-8").strip())
 191             self.headingtext = u''
 192         elif tag.lower() == "p":
 193             self.text = self.text \
 194                 + u'\n'.join(textwrap.wrap( \
 195                     self.currentparagraph, 70) \
 196                 )
 197             self.inparagraph = False
 198             self.currentparagraph = u''
 199         elif tag.lower() == "blockquote":
 200             self.text = self.text \
 201                 + u'\n> ' \
 202                 + u'\n> '.join( \
 203                     [a.strip() \
 204                         for a in textwrap.wrap( \
 205                             self.blockquote, 68)] \
 206                     ) \
 207                 + u'\n'
 208             self.inblockquote = False
 209             self.blockquote = u''
 210         elif tag.lower() == "pre":
 211             self.inpre = False
 212         elif tag.lower() == "li":
 213             self.initem = False
 214             if self.item != "":
 215                 self.text = self.text \
 216                     + u' * ' \
 217                     + u'\n   '.join( \
 218                         [a.strip() for a in textwrap.wrap(self.item, 67)]) \
 219                     + u'\n'
 220             self.item = u''
 221         elif tag.lower() == "ul":
 222             self.inul = False
 223
 224     def handle_data(self, data):
 225         if self.inheadingone or self.inheadingtwo or self.inotherheading:
 226             self.headingtext = self.headingtext \
 227                 + unicode(data, "utf-8").strip() \
 228                 + u' '
 229         elif self.inblockquote:
 230             self.blockquote = self.blockquote \
 231                 + unicode(data, "utf-8").strip() \
 232                 + u' '
 233         elif self.inparagraph:
 234             self.currentparagraph = self.currentparagraph \
 235                 + unicode(data, "utf-8").strip() \
 236                 + u' '
 237         elif self.inul and self.initem:
 238             self.item = self.item + unicode(data, "utf-8")
 239         elif self.inpre:
 240             self.text = self.text + unicode(data, "utf-8")
 241         else:
 242             self.text = self.text + unicode(data, "utf-8").strip() + u' '
 243
 244     def handle_entityref(self, name):
 245         entity = name
 246         if entities.has_key(name.lower()):
 247             entity = entities[name.lower()]
 248         elif name[0] == "#":
 249             entity = unichr(int(name[1:]))
 250         else:
 251             entity = "&" + name + ";"
 252
 253         if self.inparagraph:
 254             self.currentparagraph = self.currentparagraph \
 255                 + unicode(entity, "utf-8")
 256         elif self.inblockquote:
 257             self.blockquote = self.blockquote + unicode(entity, "utf-8")
 258         else:
 259             self.text = self.text + unicode(entity, "utf-8")
 260
 261     def gettext(self):
 262         data = self.text
 263         if self.inparagraph:
 264             data = data + "\n".join(textwrap.wrap(self.currentparagraph, 70))
 265         return data
 266
 267 def parse_and_deliver(maildir, url, statedir):
 268     fp = feedparser.parse(url)
 269     db = dbm.open(os.path.join(statedir, "seen"), "c")
 270     for item in fp["items"]:
 271         # have we seen it before?
 272         # need to work out what the content is first...
 273
 274         if item.has_key("content"):
 275             content = item["content"][0]["value"]
 276         else:
 277             content = item["summary"]
 278
 279         md5sum = md5.md5(content.encode("utf-8")).hexdigest()
 280
 281         if db.has_key(url + "|" + item["link"]):
 282             data = db[url + "|" + item["link"]]
 283             data = cgi.parse_qs(data)
 284             if data["contentmd5"][0] == md5sum:
 285                 continue
 286
 287         try:
 288             author = item["author"]
 289         except:
 290             author = url
 291
 292         # create a basic email message
 293         msg = MIMEMultipart("alternative")
 294         messageid = "<" \
 295             + datetime.datetime.now().strftime("%Y%m%d%H%M") \
 296             + "." \
 297             + "".join( \
 298                 [random.choice( \
 299                     string.ascii_letters + string.digits \
 300                     ) for a in range(0,6) \
 301                 ]) + "@" + socket.gethostname() + ">"
 302         msg.add_header("Message-ID", messageid)
 303         msg.set_unixfrom("\"%s\" <rss2maildir@localhost>" %(url))
 304         msg.add_header("From", "\"%s\" <rss2maildir@localhost>" %(author))
 305         msg.add_header("To", "\"%s\" <rss2maildir@localhost>" %(url))
 306         createddate = datetime.datetime(*item["updated_parsed"][0:6]) \
 307             .strftime("%a, %e %b %Y %T -0000")
 308         msg.add_header("Date", createddate)
 309         msg.add_header("Subject", item["title"])
 310         msg.set_default_type("text/plain")
 311
 312         htmlpart = MIMEText(content.encode("utf-8"), "html", "utf-8")
 313         textparser = HTML2Text()
 314         textparser.feed(content.encode("utf-8"))
 315         textcontent = textparser.gettext()
 316         textpart = MIMEText(textcontent.encode("utf-8"), "plain", "utf-8")
 317         msg.attach(textpart)
 318         msg.attach(htmlpart)
 319
 320         # start by working out the filename we should be writting to, we do
 321         # this following the normal maildir style rules
 322         fname = str(os.getpid()) \
 323             + "." + socket.gethostname() \
 324             + "." + "".join( \
 325                 [random.choice( \
 326                     string.ascii_letters + string.digits \
 327                     ) for a in range(0,10) \
 328                 ]) + "." \
 329             + datetime.datetime.now().strftime('%s')
 330         fn = os.path.join(maildir, "tmp", fname)
 331         fh = open(fn, "w")
 332         fh.write(msg.as_string())
 333         fh.close()
 334         # now move it in to the new directory
 335         newfn = os.path.join(maildir, "new", fname)
 336         os.link(fn, newfn)
 337         os.unlink(fn)
 338
 339         # now add to the database about the item
 340         data = urllib.urlencode((
 341             ("message-id", messageid), \
 342             ("created", createddate), \
 343             ("contentmd5", md5sum) \
 344             ))
 345         db[url + "|" + item["link"]] = data
 346
 347     db.close()
 348
 349 # first off, parse the command line arguments
 350
 351 oparser = OptionParser()
 352 oparser.add_option(
 353     "-c", "--conf", dest="conf",
 354     help="location of config file"
 355     )
 356 oparser.add_option(
 357     "-s", "--statedir", dest="statedir",
 358     help="location of directory to store state in"
 359     )
 360
 361 (options, args) = oparser.parse_args()
 362
 363 # check for the configfile
 364
 365 configfile = None
 366
 367 if options.conf != None:
 368     # does the file exist?
 369     try:
 370         os.stat(options.conf)
 371         configfile = options.conf
 372     except:
 373         # should exit here as the specified file doesn't exist
 374         sys.stderr.write( \
 375             "Config file %s does not exist. Exiting.\n" %(options.conf,))
 376         sys.exit(2)
 377 else:
 378     # check through the default locations
 379     try:
 380         os.stat("%s/.rss2maildir.conf" %(os.environ["HOME"],))
 381         configfile = "%s/.rss2maildir.conf" %(os.environ["HOME"],)
 382     except:
 383         try:
 384             os.stat("/etc/rss2maildir.conf")
 385             configfile = "/etc/rss2maildir.conf"
 386         except:
 387             sys.stderr.write("No config file found. Exiting.\n")
 388             sys.exit(2)
 389
 390 # Right - if we've got this far, we've got a config file, now for the hard
 391 # bits...
 392
 393 scp = SafeConfigParser()
 394 scp.read(configfile)
 395
 396 maildir_root = "RSSMaildir"
 397 state_dir = "state"
 398
 399 if options.statedir != None:
 400     state_dir = options.statedir
 401     try:
 402         mode = os.stat(state_dir)[stat.ST_MODE]
 403         if not stat.S_ISDIR(mode):
 404             sys.stderr.write( \
 405                 "State directory (%s) is not a directory\n" %(state_dir))
 406             sys.exit(1)
 407     except:
 408         # try to make the directory
 409         try:
 410             os.mkdir(state_dir)
 411         except:
 412             sys.stderr.write("Couldn't create statedir %s" %(state_dir))
 413             sys.exit(1)
 414 elif scp.has_option("general", "state_dir"):
 415     new_state_dir = scp.get("general", "state_dir")
 416     try:
 417         mode = os.stat(state_dir)[stat.ST_MODE]
 418         if not stat.S_ISDIR(mode):
 419             sys.stderr.write( \
 420                 "State directory (%s) is not a directory\n" %(state_dir))
 421             sys.exit(1)
 422     except:
 423         # try to create it
 424         try:
 425             os.mkdir(new_state_dir)
 426             state_dir = new_state_dir
 427         except:
 428             sys.stderr.write( \
 429                 "Couldn't create state directory %s\n" %(new_state_dir))
 430             sys.exit(1)
 431 else:
 432     try:
 433         mode = os.stat(state_dir)[stat.ST_MODE]
 434         if not stat.S_ISDIR(mode):
 435             sys.stderr.write( \
 436                 "State directory %s is not a directory\n" %(state_dir))
 437             sys.exit(1)
 438     except:
 439         try:
 440             os.mkdir(state_dir)
 441         except:
 442             sys.stderr.write( \
 443                 "State directory %s could not be created\n" %(state_dir))
 444             sys.exit(1)
 445
 446 if scp.has_option("general", "maildir_root"):
 447     maildir_root = scp.get("general", "maildir_root")
 448
 449 try:
 450     mode = os.stat(maildir_root)[stat.ST_MODE]
 451     if not stat.S_ISDIR(mode):
 452         sys.stderr.write( \
 453             "Maildir Root %s is not a directory\n" \
 454             %(maildir_root))
 455         sys.exit(1)
 456 except:
 457     try:
 458         os.mkdir(maildir_root)
 459     except:
 460         sys.stderr.write("Couldn't create Maildir Root %s\n" %(maildir_root))
 461         sys.exit(1)
 462
 463 feeds = scp.sections()
 464 try:
 465     feeds.remove("general")
 466 except:
 467     pass
 468
 469 for section in feeds:
 470     # check if the directory exists
 471     maildir = None
 472     try:
 473         maildir = scp.get(section, "maildir")
 474     except:
 475         maildir = section
 476
 477     maildir = urllib.urlencode(((section, maildir),)).split("=")[1]
 478     maildir = os.path.join(maildir_root, maildir)
 479
 480     try:
 481         exists = os.stat(maildir)
 482         if stat.S_ISDIR(exists[stat.ST_MODE]):
 483             # check if there's a new, cur and tmp directory
 484             try:
 485                 mode = os.stat(os.path.join(maildir, "cur"))[stat.ST_MODE]
 486             except:
 487                 os.mkdir(os.path.join(maildir, "cur"))
 488                 if not stat.S_ISDIR(mode):
 489                     sys.stderr.write("Broken maildir: %s\n" %(maildir))
 490             try:
 491                 mode = os.stat(os.path.join(maildir, "tmp"))[stat.ST_MODE]
 492             except:
 493                 os.mkdir(os.path.join(maildir, "tmp"))
 494                 if not stat.S_ISDIR(mode):
 495                     sys.stderr.write("Broken maildir: %s\n" %(maildir))
 496             try:
 497                 mode = os.stat(os.path.join(maildir, "new"))[stat.ST_MODE]
 498                 if not stat.S_ISDIR(mode):
 499                     sys.stderr.write("Broken maildir: %s\n" %(maildir))
 500             except:
 501                 os.mkdir(os.path.join(maildir, "new"))
 502         else:
 503             sys.stderr.write("Broken maildir: %s\n" %(maildir))
 504     except:
 505         try:
 506             os.mkdir(maildir)
 507         except:
 508             sys.stderr.write("Couldn't create root maildir %s\n" %(maildir))
 509             sys.exit(1)
 510         try:
 511             os.mkdir(os.path.join(maildir, "new"))
 512             os.mkdir(os.path.join(maildir, "cur"))
 513             os.mkdir(os.path.join(maildir, "tmp"))
 514         except:
 515             sys.stderr.write( \
 516                 "Couldn't create required maildir directories for %s\n" \
 517                 %(section,))
 518             sys.exit(1)
 519
 520     # right - we've got the directories, we've got the section, we know the
 521     # url... lets play!
 522
 523     parse_and_deliver(maildir, section, state_dir)