rss2maildir.py

   1 #!/usr/bin/python
   2 # coding=utf-8
   3
   4 # rss2maildir.py - RSS feeds to Maildir 1 email per item
   5 # Copyright (C) 2007  Brett Parker <iDunno@sommitrealweird.co.uk>
   6 #
   7 # This program is free software: you can redistribute it and/or modify
   8 # it under the terms of the GNU General Public License as published by
   9 # the Free Software Foundation, either version 3 of the License, or
  10 # (at your option) any later version.
  11 #
  12 # This program is distributed in the hope that it will be useful,
  13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15 # GNU General Public License for more details.
  16 #
  17 # You should have received a copy of the GNU General Public License
  18 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
  19
  20 import sys
  21 import os
  22 import stat
  23 import urllib
  24
  25 import feedparser
  26
  27 from email.MIMEMultipart import MIMEMultipart
  28 from email.MIMEText import MIMEText
  29
  30 import datetime
  31 import random
  32 import string
  33 import textwrap
  34
  35 import socket
  36
  37 from optparse import OptionParser
  38 from ConfigParser import SafeConfigParser
  39
  40 from base64 import b64encode
  41 import md5
  42
  43 import cgi
  44 import dbm
  45
  46 from HTMLParser import HTMLParser
  47
  48 entities = {
  49     "amp": "&",
  50     "lt": "<",
  51     "gt": ">",
  52     "pound": "£",
  53     "copy": "©",
  54     "apos": "'",
  55     "quote": "\"",
  56     "nbsp": " ",
  57     }
  58
  59 class HTML2Text(HTMLParser):
  60
  61     def __init__(self):
  62         self.inheadingone = False
  63         self.inheadingtwo = False
  64         self.inotherheading = False
  65         self.inparagraph = True
  66         self.inblockquote = False
  67         self.inlink = False
  68         self.text = u''
  69         self.currentparagraph = u''
  70         self.headingtext = u''
  71         self.blockquote = u''
  72         self.inpre = False
  73         self.inul = False
  74         self.initem = False
  75         self.item = u''
  76         HTMLParser.__init__(self)
  77
  78     def handle_starttag(self, tag, attrs):
  79         if tag.lower() == "h1":
  80             self.inheadingone = True
  81             self.inparagraph = False
  82         elif tag.lower() == "h2":
  83             self.inheadingtwo = True
  84             self.inparagraph = False
  85         elif tag.lower() in ["h3", "h4", "h5", "h6"]:
  86             self.inotherheading = True
  87             self.inparagraph = False
  88         elif tag.lower() == "a":
  89             self.inlink = True
  90         elif tag.lower() == "br":
  91             if self.inparagraph:
  92                 self.text = self.text \
  93                     + u'\n'.join( \
  94                         textwrap.wrap(self.currentparagraph, 70)) \
  95                     + u'\n'
  96                 self.currentparagraph = ""
  97             elif self.inblockquote:
  98                 self.text = self.text \
  99                     + u'\n> ' \
 100                     + u'\n> '.join( \
 101                         [a.strip() \
 102                             for a in textwrap.wrap(self.blockquote, 68) \
 103                         ]) \
 104                     + u'\n'
 105                 self.blockquote = u''
 106             else:
 107                 self.text = self.text + u'\n'
 108         elif tag.lower() == "blockquote":
 109             self.inblockquote = True
 110             self.text = self.text + u'\n'
 111         elif tag.lower() == "p":
 112             if self.text != "":
 113                 self.text = self.text + u'\n\n'
 114             if self.inparagraph:
 115                 self.text = self.text \
 116                     + u'\n'.join(textwrap.wrap(self.currentparagraph, 70))
 117             self.currentparagraph = u''
 118             self.inparagraph = True
 119         elif tag.lower() == "pre":
 120             self.text = self.text + "\n"
 121             self.inpre = True
 122             self.inparagraph = False
 123             self.inblockquote = False
 124         elif tag.lower() == "ul":
 125             self.item = u''
 126             self.inul = True
 127             self.text = self.text + "\n"
 128         elif tag.lower() == "li" and self.inul:
 129             if not self.initem:
 130                 self.initem = True
 131                 self.item = u''
 132             else:
 133                 self.text = self.text \
 134                     + u' * ' \
 135                     + u'\n   '.join([a.strip() for a in textwrap.wrap(self.item, 67)]) \
 136                     + u'\n'
 137                 self.item = u''
 138
 139     def handle_startendtag(self, tag, attrs):
 140         if tag.lower() == "br":
 141             if self.inparagraph:
 142                 self.text = self.text \
 143                 + u'\n'.join( \
 144                     [a \
 145                         for a in textwrap.wrap( \
 146                             self.currentparagraph, 70) \
 147                     ] \
 148                 ) \
 149                 + u'\n'
 150                 self.currentparagraph = u''
 151             elif self.inblockquote:
 152                 self.text = self.text \
 153                     + u'\n> ' \
 154                     + u'\n> '.join( \
 155                         [a \
 156                             for a in textwrap.wrap( \
 157                                 self.blockquote.encode("utf-8") \
 158                                 , 68) \
 159                         ] \
 160                     ) \
 161                     + u'\n'
 162                 self.blockquote = u''
 163             else:
 164                 self.text = self.text + "\n"
 165
 166     def handle_endtag(self, tag):
 167         if tag.lower() == "h1":
 168             self.inheadingone = False
 169             self.text = self.text \
 170                 + u'\n\n' \
 171                 + self.headingtext.encode("utf-8") \
 172                 + u'\n' \
 173                 + u'=' * len(self.headingtext.encode("utf-8").strip())
 174             self.headingtext = u''
 175         elif tag.lower() == "h2":
 176             self.inheadingtwo = False
 177             self.text = self.text \
 178                 + u'\n\n' \
 179                 + self.headingtext.encode("utf-8") \
 180                 + u'\n' \
 181                 + u'-' * len(self.headingtext.encode("utf-8").strip())
 182             self.headingtext = u''
 183         elif tag.lower() in ["h3", "h4", "h5", "h6"]:
 184             self.inotherheading = False
 185             self.text = self.text \
 186                 + u'\n\n' \
 187                 + self.headingtext.encode("utf-8") \
 188                 + u'\n' \
 189                 + u'~' * len(self.headingtext.encode("utf-8").strip())
 190             self.headingtext = u''
 191         elif tag.lower() == "p":
 192             self.text = self.text \
 193                 + u'\n'.join(textwrap.wrap( \
 194                     self.currentparagraph, 70) \
 195                 )
 196             self.inparagraph = False
 197             self.currentparagraph = u''
 198         elif tag.lower() == "blockquote":
 199             self.text = self.text \
 200                 + u'\n> ' \
 201                 + u'\n> '.join( \
 202                     [a.strip() for a in textwrap.wrap(self.blockquote, 68)] \
 203                     ).encode("utf-8") \
 204                 + u'\n'
 205             self.inblockquote = False
 206             self.blockquote = u''
 207         elif tag.lower() == "pre":
 208             self.inpre = False
 209         elif tag.lower() == "li":
 210             self.initem = False
 211             if self.item != "":
 212                 self.text = self.text \
 213                     + u' * ' \
 214                     + u'\n   '.join( \
 215                         [a.strip() for a in textwrap.wrap(self.item, 67)]) \
 216                     + u'\n'
 217             self.item = u''
 218         elif tag.lower() == "ul":
 219             self.inul = False
 220
 221     def handle_data(self, data):
 222         if self.inheadingone or self.inheadingtwo or self.inotherheading:
 223             self.headingtext = self.headingtext \
 224                 + unicode(data, "utf-8").strip() \
 225                 + u' '
 226         elif self.inblockquote:
 227             self.blockquote = self.blockquote \
 228                 + unicode(data, "utf-8").strip() \
 229                 + u' '
 230         elif self.inparagraph:
 231             self.currentparagraph = self.currentparagraph \
 232                 + unicode(data, "utf-8").strip() \
 233                 + u' '
 234         elif self.inul and self.initem:
 235             self.item = self.item + unicode(data, "utf-8")
 236         elif self.inpre:
 237             self.text = self.text + unicode(data, "utf-8")
 238         else:
 239             self.text = self.text + unicode(data, "utf-8").strip() + u' '
 240
 241     def handle_entityref(self, name):
 242         entity = name
 243         if entities.has_key(name.lower()):
 244             entity = entities[name.lower()]
 245         elif name[0] == "#":
 246             entity = unichr(int(name[1:]))
 247         else:
 248             entity = "&" + name + ";"
 249
 250         if self.inparagraph:
 251             self.currentparagraph = self.currentparagraph + entity
 252         elif self.inblockquote:
 253             self.blockquote = self.blockquote + entity
 254         else:
 255             self.text = self.text + entity
 256
 257     def gettext(self):
 258         data = self.text
 259         if self.inparagraph:
 260             data = data + "\n".join(textwrap.wrap(self.currentparagraph, 70))
 261         return data
 262
 263 def parse_and_deliver(maildir, url, statedir):
 264     fp = feedparser.parse(url)
 265     db = dbm.open(os.path.join(statedir, "seen"), "c")
 266     for item in fp["items"]:
 267         # have we seen it before?
 268         # need to work out what the content is first...
 269
 270         if item.has_key("content"):
 271             content = item["content"][0]["value"]
 272         else:
 273             content = item["summary"]
 274
 275         md5sum = md5.md5(content.encode("utf-8")).hexdigest()
 276
 277         if db.has_key(url + "|" + item["link"]):
 278             data = db[url + "|" + item["link"]]
 279             data = cgi.parse_qs(data)
 280             if data["contentmd5"][0] == md5sum:
 281                 continue
 282
 283         try:
 284             author = item["author"]
 285         except:
 286             author = url
 287
 288         # create a basic email message
 289         msg = MIMEMultipart("alternative")
 290         messageid = "<" \
 291             + datetime.datetime.now().strftime("%Y%m%d%H%M") \
 292             + "." \
 293             + "".join( \
 294                 [random.choice( \
 295                     string.ascii_letters + string.digits \
 296                     ) for a in range(0,6) \
 297                 ]) + "@" + socket.gethostname() + ">"
 298         msg.add_header("Message-ID", messageid)
 299         msg.set_unixfrom("\"%s\" <rss2maildir@localhost>" %(url))
 300         msg.add_header("From", "\"%s\" <rss2maildir@localhost>" %(author))
 301         msg.add_header("To", "\"%s\" <rss2maildir@localhost>" %(url))
 302         createddate = datetime.datetime(*item["updated_parsed"][0:6]) \
 303             .strftime("%a, %e %b %Y %T -0000")
 304         msg.add_header("Date", createddate)
 305         msg.add_header("Subject", item["title"])
 306         msg.set_default_type("text/plain")
 307
 308         htmlpart = MIMEText(content.encode("utf-8"), "html", "utf-8")
 309         textparser = HTML2Text()
 310         textparser.feed(content.encode("utf-8"))
 311         textcontent = textparser.gettext()
 312         textpart = MIMEText(textcontent.encode("utf-8"), "plain", "utf-8")
 313         msg.attach(textpart)
 314         msg.attach(htmlpart)
 315
 316         # start by working out the filename we should be writting to, we do
 317         # this following the normal maildir style rules
 318         fname = str(os.getpid()) \
 319             + "." + socket.gethostname() \
 320             + "." + "".join( \
 321                 [random.choice( \
 322                     string.ascii_letters + string.digits \
 323                     ) for a in range(0,10) \
 324                 ]) + "." \
 325             + datetime.datetime.now().strftime('%s')
 326         fn = os.path.join(maildir, "tmp", fname)
 327         fh = open(fn, "w")
 328         fh.write(msg.as_string())
 329         fh.close()
 330         # now move it in to the new directory
 331         newfn = os.path.join(maildir, "new", fname)
 332         os.link(fn, newfn)
 333         os.unlink(fn)
 334
 335         # now add to the database about the item
 336         data = urllib.urlencode((
 337             ("message-id", messageid), \
 338             ("created", createddate), \
 339             ("contentmd5", md5sum) \
 340             ))
 341         db[url + "|" + item["link"]] = data
 342
 343     db.close()
 344
 345 # first off, parse the command line arguments
 346
 347 oparser = OptionParser()
 348 oparser.add_option(
 349     "-c", "--conf", dest="conf",
 350     help="location of config file"
 351     )
 352 oparser.add_option(
 353     "-s", "--statedir", dest="statedir",
 354     help="location of directory to store state in"
 355     )
 356
 357 (options, args) = oparser.parse_args()
 358
 359 # check for the configfile
 360
 361 configfile = None
 362
 363 if options.conf != None:
 364     # does the file exist?
 365     try:
 366         os.stat(options.conf)
 367         configfile = options.conf
 368     except:
 369         # should exit here as the specified file doesn't exist
 370         sys.stderr.write("Config file %s does not exist. Exiting.\n" %(options.conf,))
 371         sys.exit(2)
 372 else:
 373     # check through the default locations
 374     try:
 375         os.stat("%s/.rss2maildir.conf" %(os.environ["HOME"],))
 376         configfile = "%s/.rss2maildir.conf" %(os.environ["HOME"],)
 377     except:
 378         try:
 379             os.stat("/etc/rss2maildir.conf")
 380             configfile = "/etc/rss2maildir.conf"
 381         except:
 382             sys.stderr.write("No config file found. Exiting.\n")
 383             sys.exit(2)
 384
 385 # Right - if we've got this far, we've got a config file, now for the hard
 386 # bits...
 387
 388 scp = SafeConfigParser()
 389 scp.read(configfile)
 390
 391 maildir_root = "RSSMaildir"
 392 state_dir = "state"
 393
 394 if options.statedir != None:
 395     state_dir = options.statedir
 396     try:
 397         mode = os.stat(state_dir)[stat.ST_MODE]
 398         if not stat.S_ISDIR(mode):
 399             sys.stderr.write("State directory (%s) is not a directory\n" %(state_dir))
 400             sys.exit(1)
 401     except:
 402         # try to make the directory
 403         try:
 404             os.mkdir(state_dir)
 405         except:
 406             sys.stderr.write("Couldn't create statedir %s" %(state_dir))
 407             sys.exit(1)
 408 elif scp.has_option("general", "state_dir"):
 409     new_state_dir = scp.get("general", "state_dir")
 410     try:
 411         mode = os.stat(state_dir)[stat.ST_MODE]
 412         if not stat.S_ISDIR(mode):
 413             sys.stderr.write("State directory (%s) is not a directory\n" %(state_dir))
 414             sys.exit(1)
 415     except:
 416         # try to create it
 417         try:
 418             os.mkdir(new_state_dir)
 419             state_dir = new_state_dir
 420         except:
 421             sys.stderr.write("Couldn't create state directory %s\n" %(new_state_dir))
 422             sys.exit(1)
 423 else:
 424     try:
 425         mode = os.stat(state_dir)[stat.ST_MODE]
 426         if not stat.S_ISDIR(mode):
 427             sys.stderr.write("State directory %s is not a directory\n" %(state_dir))
 428             sys.exit(1)
 429     except:
 430         try:
 431             os.mkdir(state_dir)
 432         except:
 433             sys.stderr.write("State directory %s could not be created\n" %(state_dir))
 434             sys.exit(1)
 435
 436 if scp.has_option("general", "maildir_root"):
 437     maildir_root = scp.get("general", "maildir_root")
 438
 439 try:
 440     mode = os.stat(maildir_root)[stat.ST_MODE]
 441     if not stat.S_ISDIR(mode):
 442         sys.stderr.write("Maildir Root %s is not a directory\n" %(maildir_root))
 443         sys.exit(1)
 444 except:
 445     try:
 446         os.mkdir(maildir_root)
 447     except:
 448         sys.stderr.write("Couldn't create Maildir Root %s\n" %(maildir_root))
 449         sys.exit(1)
 450
 451 feeds = scp.sections()
 452 try:
 453     feeds.remove("general")
 454 except:
 455     pass
 456
 457 for section in feeds:
 458     # check if the directory exists
 459     maildir = None
 460     try:
 461         maildir = scp.get(section, "maildir")
 462     except:
 463         maildir = section
 464
 465     maildir = urllib.urlencode(((section, maildir),)).split("=")[1]
 466     maildir = os.path.join(maildir_root, maildir)
 467
 468     try:
 469         exists = os.stat(maildir)
 470         if stat.S_ISDIR(exists[stat.ST_MODE]):
 471             # check if there's a new, cur and tmp directory
 472             try:
 473                 mode = os.stat(os.path.join(maildir, "cur"))[stat.ST_MODE]
 474             except:
 475                 os.mkdir(os.path.join(maildir, "cur"))
 476                 if not stat.S_ISDIR(mode):
 477                     sys.stderr.write("Broken maildir: %s\n" %(maildir))
 478             try:
 479                 mode = os.stat(os.path.join(maildir, "tmp"))[stat.ST_MODE]
 480             except:
 481                 os.mkdir(os.path.join(maildir, "tmp"))
 482                 if not stat.S_ISDIR(mode):
 483                     sys.stderr.write("Broken maildir: %s\n" %(maildir))
 484             try:
 485                 mode = os.stat(os.path.join(maildir, "new"))[stat.ST_MODE]
 486                 if not stat.S_ISDIR(mode):
 487                     sys.stderr.write("Broken maildir: %s\n" %(maildir))
 488             except:
 489                 os.mkdir(os.path.join(maildir, "new"))
 490         else:
 491             sys.stderr.write("Broken maildir: %s\n" %(maildir))
 492     except:
 493         try:
 494             os.mkdir(maildir)
 495         except:
 496             sys.stderr.write("Couldn't create root maildir %s\n" %(maildir))
 497             sys.exit(1)
 498         try:
 499             os.mkdir(os.path.join(maildir, "new"))
 500             os.mkdir(os.path.join(maildir, "cur"))
 501             os.mkdir(os.path.join(maildir, "tmp"))
 502         except:
 503             sys.stderr.write("Couldn't create required maildir directories for %s\n" %(section,))
 504             sys.exit(1)
 505
 506     # right - we've got the directories, we've got the section, we know the
 507     # url... lets play!
 508
 509     parse_and_deliver(maildir, section, state_dir)