rss2maildir.py

   1 #!/usr/bin/python
   2 # coding=utf-8
   3
   4 # rss2maildir.py - RSS feeds to Maildir 1 email per item
   5 # Copyright (C) 2007  Brett Parker <iDunno@sommitrealweird.co.uk>
   6 #
   7 # This program is free software: you can redistribute it and/or modify
   8 # it under the terms of the GNU General Public License as published by
   9 # the Free Software Foundation, either version 3 of the License, or
  10 # (at your option) any later version.
  11 #
  12 # This program is distributed in the hope that it will be useful,
  13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15 # GNU General Public License for more details.
  16 #
  17 # You should have received a copy of the GNU General Public License
  18 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
  19
  20 import sys
  21 import os
  22 import stat
  23 import urllib
  24
  25 import feedparser
  26
  27 from email.MIMEMultipart import MIMEMultipart
  28 from email.MIMEText import MIMEText
  29
  30 import datetime
  31 import random
  32 import string
  33 import textwrap
  34
  35 import socket
  36
  37 from optparse import OptionParser
  38 from ConfigParser import SafeConfigParser
  39
  40 from base64 import b64encode
  41 import md5
  42
  43 import cgi
  44 import dbm
  45
  46 from HTMLParser import HTMLParser
  47
  48 entities = {
  49     "amp": "&",
  50     "lt": "<",
  51     "gt": ">",
  52     "pound": "£",
  53     "copy": "©",
  54     "apos": "'",
  55     "quote": "\"",
  56     "nbsp": " ",
  57     }
  58
  59 class HTML2Text(HTMLParser):
  60
  61     def __init__(self):
  62         self.inheadingone = False
  63         self.inheadingtwo = False
  64         self.inotherheading = False
  65         self.inparagraph = True
  66         self.inblockquote = False
  67         self.inlink = False
  68         self.text = u''
  69         self.currentparagraph = u''
  70         self.headingtext = u''
  71         self.blockquote = u''
  72         self.inpre = False
  73         self.inul = False
  74         self.initem = False
  75         self.item = u''
  76         HTMLParser.__init__(self)
  77
  78     def handle_starttag(self, tag, attrs):
  79         if tag.lower() == "h1":
  80             self.inheadingone = True
  81             self.inparagraph = False
  82         elif tag.lower() == "h2":
  83             self.inheadingtwo = True
  84             self.inparagraph = False
  85         elif tag.lower() in ["h3", "h4", "h5", "h6"]:
  86             self.inotherheading = True
  87             self.inparagraph = False
  88         elif tag.lower() == "a":
  89             self.inlink = True
  90         elif tag.lower() == "br":
  91             if self.inparagraph:
  92                 self.text = self.text \
  93                     + u'\n'.join( \
  94                         textwrap.wrap(self.currentparagraph, 70)) \
  95                     + u'\n'
  96                 self.currentparagraph = ""
  97             elif self.inblockquote:
  98                 self.text = self.text \
  99                     + u'\n> ' \
 100                     + u'\n> '.join( \
 101                         [a.strip() \
 102                             for a in textwrap.wrap(self.blockquote, 68) \
 103                         ]) \
 104                     + u'\n'
 105                 self.blockquote = u''
 106             else:
 107                 self.text = self.text + u'\n'
 108         elif tag.lower() == "blockquote":
 109             self.inblockquote = True
 110             self.text = self.text + u'\n'
 111         elif tag.lower() == "p":
 112             if self.text != "":
 113                 self.text = self.text + u'\n\n'
 114             if self.inparagraph:
 115                 self.text = self.text \
 116                     + u'\n'.join(textwrap.wrap(self.currentparagraph, 70))
 117             self.currentparagraph = u''
 118             self.inparagraph = True
 119         elif tag.lower() == "pre":
 120             self.text = self.text + "\n"
 121             self.inpre = True
 122             self.inparagraph = False
 123             self.inblockquote = False
 124         elif tag.lower() == "ul":
 125             self.item = u''
 126             self.inul = True
 127             self.text = self.text + "\n"
 128         elif tag.lower() == "li" and self.inul:
 129             if not self.initem:
 130                 self.initem = True
 131                 self.item = u''
 132             else:
 133                 self.text = self.text \
 134                     + u' * ' \
 135                     + u'\n   '.join([a.strip() for a in textwrap.wrap(self.item, 67)]) \
 136                     + u'\n'
 137                 self.item = u''
 138
 139     def handle_startendtag(self, tag, attrs):
 140         if tag.lower() == "br":
 141             if self.inparagraph:
 142                 self.text = self.text \
 143                 + u'\n'.join( \
 144                     [a \
 145                         for a in textwrap.wrap( \
 146                             self.currentparagraph, 70) \
 147                     ] \
 148                 ) \
 149                 + u'\n'
 150                 self.currentparagraph = u''
 151             elif self.inblockquote:
 152                 self.text = self.text \
 153                     + u'\n> ' \
 154                     + u'\n> '.join( \
 155                         [a \
 156                             for a in textwrap.wrap( \
 157                                 self.blockquote.encode("utf-8") \
 158                                 , 68) \
 159                         ] \
 160                     ) \
 161                     + u'\n'
 162                 self.blockquote = u''
 163             else:
 164                 self.text = self.text + "\n"
 165
 166     def handle_endtag(self, tag):
 167         if tag.lower() == "h1":
 168             self.inheadingone = False
 169             self.text = self.text \
 170                 + u'\n\n' \
 171                 + self.headingtext.encode("utf-8") \
 172                 + u'\n' \
 173                 + u'=' * len(self.headingtext.encode("utf-8").strip())
 174             self.headingtext = u''
 175         elif tag.lower() == "h2":
 176             self.inheadingtwo = False
 177             self.text = self.text \
 178                 + u'\n\n' \
 179                 + self.headingtext.encode("utf-8") \
 180                 + u'\n' \
 181                 + u'-' * len(self.headingtext.encode("utf-8").strip())
 182             self.headingtext = u''
 183         elif tag.lower() in ["h3", "h4", "h5", "h6"]:
 184             self.inotherheading = False
 185             self.text = self.text \
 186                 + u'\n\n' \
 187                 + self.headingtext.encode("utf-8") \
 188                 + u'\n' \
 189                 + u'~' * len(self.headingtext.encode("utf-8").strip())
 190             self.headingtext = u''
 191         elif tag.lower() == "p":
 192             self.text = self.text \
 193                 + u'\n'.join(textwrap.wrap( \
 194                     self.currentparagraph, 70) \
 195                 )
 196             self.inparagraph = False
 197             self.currentparagraph = u''
 198         elif tag.lower() == "blockquote":
 199             self.text = self.text \
 200                 + u'\n> ' \
 201                 + u'\n> '.join( \
 202                     [a.strip() \
 203                         for a in textwrap.wrap( \
 204                             self.blockquote, 68)] \
 205                     ) \
 206                 + u'\n'
 207             self.inblockquote = False
 208             self.blockquote = u''
 209         elif tag.lower() == "pre":
 210             self.inpre = False
 211         elif tag.lower() == "li":
 212             self.initem = False
 213             if self.item != "":
 214                 self.text = self.text \
 215                     + u' * ' \
 216                     + u'\n   '.join( \
 217                         [a.strip() for a in textwrap.wrap(self.item, 67)]) \
 218                     + u'\n'
 219             self.item = u''
 220         elif tag.lower() == "ul":
 221             self.inul = False
 222
 223     def handle_data(self, data):
 224         if self.inheadingone or self.inheadingtwo or self.inotherheading:
 225             self.headingtext = self.headingtext \
 226                 + unicode(data, "utf-8").strip() \
 227                 + u' '
 228         elif self.inblockquote:
 229             self.blockquote = self.blockquote \
 230                 + unicode(data, "utf-8").strip() \
 231                 + u' '
 232         elif self.inparagraph:
 233             self.currentparagraph = self.currentparagraph \
 234                 + unicode(data, "utf-8").strip() \
 235                 + u' '
 236         elif self.inul and self.initem:
 237             self.item = self.item + unicode(data, "utf-8")
 238         elif self.inpre:
 239             self.text = self.text + unicode(data, "utf-8")
 240         else:
 241             self.text = self.text + unicode(data, "utf-8").strip() + u' '
 242
 243     def handle_entityref(self, name):
 244         entity = name
 245         if entities.has_key(name.lower()):
 246             entity = entities[name.lower()]
 247         elif name[0] == "#":
 248             entity = unichr(int(name[1:]))
 249         else:
 250             entity = "&" + name + ";"
 251
 252         if self.inparagraph:
 253             self.currentparagraph = self.currentparagraph + unicode(entity, "utf-8")
 254         elif self.inblockquote:
 255             self.blockquote = self.blockquote + unicode(entity, "utf-8")
 256         else:
 257             self.text = self.text + unicode(entity, "utf-8")
 258
 259     def gettext(self):
 260         data = self.text
 261         if self.inparagraph:
 262             data = data + "\n".join(textwrap.wrap(self.currentparagraph, 70))
 263         return data
 264
 265 def parse_and_deliver(maildir, url, statedir):
 266     fp = feedparser.parse(url)
 267     db = dbm.open(os.path.join(statedir, "seen"), "c")
 268     for item in fp["items"]:
 269         # have we seen it before?
 270         # need to work out what the content is first...
 271
 272         if item.has_key("content"):
 273             content = item["content"][0]["value"]
 274         else:
 275             content = item["summary"]
 276
 277         md5sum = md5.md5(content.encode("utf-8")).hexdigest()
 278
 279         if db.has_key(url + "|" + item["link"]):
 280             data = db[url + "|" + item["link"]]
 281             data = cgi.parse_qs(data)
 282             if data["contentmd5"][0] == md5sum:
 283                 continue
 284
 285         try:
 286             author = item["author"]
 287         except:
 288             author = url
 289
 290         # create a basic email message
 291         msg = MIMEMultipart("alternative")
 292         messageid = "<" \
 293             + datetime.datetime.now().strftime("%Y%m%d%H%M") \
 294             + "." \
 295             + "".join( \
 296                 [random.choice( \
 297                     string.ascii_letters + string.digits \
 298                     ) for a in range(0,6) \
 299                 ]) + "@" + socket.gethostname() + ">"
 300         msg.add_header("Message-ID", messageid)
 301         msg.set_unixfrom("\"%s\" <rss2maildir@localhost>" %(url))
 302         msg.add_header("From", "\"%s\" <rss2maildir@localhost>" %(author))
 303         msg.add_header("To", "\"%s\" <rss2maildir@localhost>" %(url))
 304         createddate = datetime.datetime(*item["updated_parsed"][0:6]) \
 305             .strftime("%a, %e %b %Y %T -0000")
 306         msg.add_header("Date", createddate)
 307         msg.add_header("Subject", item["title"])
 308         msg.set_default_type("text/plain")
 309
 310         htmlpart = MIMEText(content.encode("utf-8"), "html", "utf-8")
 311         textparser = HTML2Text()
 312         textparser.feed(content.encode("utf-8"))
 313         textcontent = textparser.gettext()
 314         textpart = MIMEText(textcontent.encode("utf-8"), "plain", "utf-8")
 315         msg.attach(textpart)
 316         msg.attach(htmlpart)
 317
 318         # start by working out the filename we should be writting to, we do
 319         # this following the normal maildir style rules
 320         fname = str(os.getpid()) \
 321             + "." + socket.gethostname() \
 322             + "." + "".join( \
 323                 [random.choice( \
 324                     string.ascii_letters + string.digits \
 325                     ) for a in range(0,10) \
 326                 ]) + "." \
 327             + datetime.datetime.now().strftime('%s')
 328         fn = os.path.join(maildir, "tmp", fname)
 329         fh = open(fn, "w")
 330         fh.write(msg.as_string())
 331         fh.close()
 332         # now move it in to the new directory
 333         newfn = os.path.join(maildir, "new", fname)
 334         os.link(fn, newfn)
 335         os.unlink(fn)
 336
 337         # now add to the database about the item
 338         data = urllib.urlencode((
 339             ("message-id", messageid), \
 340             ("created", createddate), \
 341             ("contentmd5", md5sum) \
 342             ))
 343         db[url + "|" + item["link"]] = data
 344
 345     db.close()
 346
 347 # first off, parse the command line arguments
 348
 349 oparser = OptionParser()
 350 oparser.add_option(
 351     "-c", "--conf", dest="conf",
 352     help="location of config file"
 353     )
 354 oparser.add_option(
 355     "-s", "--statedir", dest="statedir",
 356     help="location of directory to store state in"
 357     )
 358
 359 (options, args) = oparser.parse_args()
 360
 361 # check for the configfile
 362
 363 configfile = None
 364
 365 if options.conf != None:
 366     # does the file exist?
 367     try:
 368         os.stat(options.conf)
 369         configfile = options.conf
 370     except:
 371         # should exit here as the specified file doesn't exist
 372         sys.stderr.write("Config file %s does not exist. Exiting.\n" %(options.conf,))
 373         sys.exit(2)
 374 else:
 375     # check through the default locations
 376     try:
 377         os.stat("%s/.rss2maildir.conf" %(os.environ["HOME"],))
 378         configfile = "%s/.rss2maildir.conf" %(os.environ["HOME"],)
 379     except:
 380         try:
 381             os.stat("/etc/rss2maildir.conf")
 382             configfile = "/etc/rss2maildir.conf"
 383         except:
 384             sys.stderr.write("No config file found. Exiting.\n")
 385             sys.exit(2)
 386
 387 # Right - if we've got this far, we've got a config file, now for the hard
 388 # bits...
 389
 390 scp = SafeConfigParser()
 391 scp.read(configfile)
 392
 393 maildir_root = "RSSMaildir"
 394 state_dir = "state"
 395
 396 if options.statedir != None:
 397     state_dir = options.statedir
 398     try:
 399         mode = os.stat(state_dir)[stat.ST_MODE]
 400         if not stat.S_ISDIR(mode):
 401             sys.stderr.write("State directory (%s) is not a directory\n" %(state_dir))
 402             sys.exit(1)
 403     except:
 404         # try to make the directory
 405         try:
 406             os.mkdir(state_dir)
 407         except:
 408             sys.stderr.write("Couldn't create statedir %s" %(state_dir))
 409             sys.exit(1)
 410 elif scp.has_option("general", "state_dir"):
 411     new_state_dir = scp.get("general", "state_dir")
 412     try:
 413         mode = os.stat(state_dir)[stat.ST_MODE]
 414         if not stat.S_ISDIR(mode):
 415             sys.stderr.write("State directory (%s) is not a directory\n" %(state_dir))
 416             sys.exit(1)
 417     except:
 418         # try to create it
 419         try:
 420             os.mkdir(new_state_dir)
 421             state_dir = new_state_dir
 422         except:
 423             sys.stderr.write("Couldn't create state directory %s\n" %(new_state_dir))
 424             sys.exit(1)
 425 else:
 426     try:
 427         mode = os.stat(state_dir)[stat.ST_MODE]
 428         if not stat.S_ISDIR(mode):
 429             sys.stderr.write("State directory %s is not a directory\n" %(state_dir))
 430             sys.exit(1)
 431     except:
 432         try:
 433             os.mkdir(state_dir)
 434         except:
 435             sys.stderr.write("State directory %s could not be created\n" %(state_dir))
 436             sys.exit(1)
 437
 438 if scp.has_option("general", "maildir_root"):
 439     maildir_root = scp.get("general", "maildir_root")
 440
 441 try:
 442     mode = os.stat(maildir_root)[stat.ST_MODE]
 443     if not stat.S_ISDIR(mode):
 444         sys.stderr.write("Maildir Root %s is not a directory\n" %(maildir_root))
 445         sys.exit(1)
 446 except:
 447     try:
 448         os.mkdir(maildir_root)
 449     except:
 450         sys.stderr.write("Couldn't create Maildir Root %s\n" %(maildir_root))
 451         sys.exit(1)
 452
 453 feeds = scp.sections()
 454 try:
 455     feeds.remove("general")
 456 except:
 457     pass
 458
 459 for section in feeds:
 460     # check if the directory exists
 461     maildir = None
 462     try:
 463         maildir = scp.get(section, "maildir")
 464     except:
 465         maildir = section
 466
 467     maildir = urllib.urlencode(((section, maildir),)).split("=")[1]
 468     maildir = os.path.join(maildir_root, maildir)
 469
 470     try:
 471         exists = os.stat(maildir)
 472         if stat.S_ISDIR(exists[stat.ST_MODE]):
 473             # check if there's a new, cur and tmp directory
 474             try:
 475                 mode = os.stat(os.path.join(maildir, "cur"))[stat.ST_MODE]
 476             except:
 477                 os.mkdir(os.path.join(maildir, "cur"))
 478                 if not stat.S_ISDIR(mode):
 479                     sys.stderr.write("Broken maildir: %s\n" %(maildir))
 480             try:
 481                 mode = os.stat(os.path.join(maildir, "tmp"))[stat.ST_MODE]
 482             except:
 483                 os.mkdir(os.path.join(maildir, "tmp"))
 484                 if not stat.S_ISDIR(mode):
 485                     sys.stderr.write("Broken maildir: %s\n" %(maildir))
 486             try:
 487                 mode = os.stat(os.path.join(maildir, "new"))[stat.ST_MODE]
 488                 if not stat.S_ISDIR(mode):
 489                     sys.stderr.write("Broken maildir: %s\n" %(maildir))
 490             except:
 491                 os.mkdir(os.path.join(maildir, "new"))
 492         else:
 493             sys.stderr.write("Broken maildir: %s\n" %(maildir))
 494     except:
 495         try:
 496             os.mkdir(maildir)
 497         except:
 498             sys.stderr.write("Couldn't create root maildir %s\n" %(maildir))
 499             sys.exit(1)
 500         try:
 501             os.mkdir(os.path.join(maildir, "new"))
 502             os.mkdir(os.path.join(maildir, "cur"))
 503             os.mkdir(os.path.join(maildir, "tmp"))
 504         except:
 505             sys.stderr.write("Couldn't create required maildir directories for %s\n" %(section,))
 506             sys.exit(1)
 507
 508     # right - we've got the directories, we've got the section, we know the
 509     # url... lets play!
 510
 511     parse_and_deliver(maildir, section, state_dir)