4 # rss2maildir.py - RSS feeds to Maildir 1 email per item
5 # Copyright (C) 2007 Brett Parker <iDunno@sommitrealweird.co.uk>
7 # This program is free software: you can redistribute it and/or modify
8 # it under the terms of the GNU General Public License as published by
9 # the Free Software Foundation, either version 3 of the License, or
10 # (at your option) any later version.
12 # This program is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 # GNU General Public License for more details.
17 # You should have received a copy of the GNU General Public License
18 # along with this program. If not, see <http://www.gnu.org/licenses/>.
28 from email.MIMEMultipart import MIMEMultipart
29 from email.MIMEText import MIMEText
38 from optparse import OptionParser
39 from ConfigParser import SafeConfigParser
41 from base64 import b64encode
49 from HTMLParser import HTMLParser
# HTML-to-text converter driven by HTMLParser callbacks: callers feed() it
# markup and read the result back with gettext() (see parse_and_deliver).
51 class HTML2Text(HTMLParser):
# Constructor.  textwidth (default 70) is the column width that
# handle_curdata uses when wrapping text.
# NOTE(review): the embedded numbering skips lines here, so the other
# attributes the handlers rely on (text, curdata, opentags, ...) are
# presumably initialised on lines not shown -- confirm against the full file.
209 def __init__(self,textwidth=70):
212 self.textwidth = textwidth
# flag that handle_curdata reads and that handle_endtag resets
215 self.ignorenodata = False
219 HTMLParser.__init__(self)
# Parser callback for an opening tag.  The tag name is lower-cased and then
# handled by category: block-level tags (with extra bookkeeping for ol/dl
# and nested lists), dd/dt transitions, anchors, images and br.
# tag   -- tag name as reported by HTMLParser
# attrs -- list of (name, value) attribute pairs
221 def handle_starttag(self, tag, attrs):
222 tag_name = tag.lower()
223 if tag_name in self.blockleveltags:
224 # handle starting a new block - unless we're in a block element
225 # that can contain other blocks, we'll assume that we want to close
# an open list item has its pending text flushed before the new block
227 if len(self.opentags) > 1 and self.opentags[-1] == u'li':
228 self.handle_curdata()
# every ordered list gets its own item counter, starting at 1
230 if tag_name == u'ol':
231 self.handle_curdata()
232 self.listcount.append(1)
233 self.listlevel = len(self.listcount) - 1
235 if tag_name == u'dl':
236 self.indentlevel = self.indentlevel + 4
# nested lists: look at the two tags opened before the most recent one and
# indent by 4 for an enclosing dl/ol or by 3 for an enclosing ul
238 if tag_name in self.liststarttags:
239 smallist = self.opentags[-3:-1]
241 for prev_listtag in smallist:
242 if prev_listtag in [u'dl', u'ol']:
243 self.indentlevel = self.indentlevel + 4
245 elif prev_listtag == u'ul':
246 self.indentlevel = self.indentlevel + 3
249 if len(self.opentags) > 0:
250 self.handle_curdata()
251 if tag_name not in self.cancontainflow:
253 self.opentags.append(tag_name)
255 if tag_name == "span":
259 listcount = self.listcount[-1]
# switching between a term (dt) and its definition (dd) flushes the pending
# text of the previous one
263 if tag_name == u'dd' and len(self.opentags) > 1 \
264 and self.opentags[-1] == u'dt':
265 self.handle_curdata()
267 elif tag_name == u'dt' and len(self.opentags) > 1 \
268 and self.opentags[-1] == u'dd':
269 self.handle_curdata()
271 elif tag_name == u'a':
# NOTE(review): 'attr' is presumably the loop variable of an iteration over
# attrs whose header line is not visible here.
273 if attr[0].lower() == u'href':
# link targets are collected and listed at the end of the text as "__ url"
274 self.urls.append(attr[1].decode('utf-8'))
# opening backtick of the link text; handle_curdata appends the closing "`__"
275 self.curdata = self.curdata + u'`'
276 self.opentags.append(tag_name)
278 elif tag_name == u'img':
279 self.handle_image(attrs)
281 elif tag_name == u'br':
285 # we don't know the tag, so lets avoid handling it!
# Parser callback for XHTML-style empty tags such as <br/> and <img/>.
# Only br and img are distinguished; img is delegated to handle_image.
288 def handle_startendtag(self, tag, attrs):
289 if tag.lower() == u'br':
291 elif tag.lower() == u'img':
292 self.handle_image(attrs)
# NOTE(review): the embedded numbering jumps from 292 to 296, so the three
# statements below may belong to a separate definition whose header line is
# not visible in this excerpt -- confirm against the full file.
# They flush any pending data, push a u'br' marker onto opentags and flush
# again; handle_curdata has a dedicated u'br' case that appends a newline.
296 self.handle_curdata()
297 self.opentags.append(u'br')
298 self.handle_curdata()
# Register an image in self.images and append a reference to it to
# self.curdata.
# attrs -- list of (name, value) attribute pairs from the parser; the values
#          used here are decoded from UTF-8.
# self.images maps a key (the alt text, or the URL itself) to a dict with a
# "url" entry; gettext later lists them as "|key| image:: url".
# NOTE(review): several lines of this method (the attribute loop, the
# branch headers and the right-hand sides of the curdata assignments) are
# not visible in this excerpt, so the exact reference text is unknown here.
301 def handle_image(self, attrs):
306 alt = attr[1].decode('utf-8')
307 elif attr[0] == 'src':
308 url = attr[1].decode('utf-8')
# presumably the "alt text present" path: reuse the existing entry when it
# points at the same URL, otherwise pick another key that is still free
311 if self.images.has_key(alt):
312 if self.images[alt]["url"] == url:
313 self.curdata = self.curdata \
316 while self.images.has_key(alt):
318 self.images[alt] = {"url": url}
319 self.curdata = self.curdata \
322 self.images[alt] = {"url": url}
323 self.curdata = self.curdata \
326 if self.images.has_key(url):
327 self.curdata = self.curdata \
330 self.images[url] = {}
331 self.images[url]["url"] =url
332 self.curdata = self.curdata \
# Flush the text collected in self.curdata into self.text, formatted
# according to the most recently opened tag (self.opentags[-1]):
#   h1-h6       whitespace-normalised and underlined
#   p, div      whitespace-normalised and wrapped to textwidth
#   pre         appended as is
#   blockquote  wrapped with a narrower width
#   li          prefixed with a list marker (numbered via self.listcount)
#   dt, dd      definition-list term (suffixed "::") and definition
#   a           closes the link markup opened in handle_starttag
# Indentation comes from self.indentlevel throughout.
# NOTE(review): many lines of this method are not visible in this excerpt
# (the embedded numbering skips), including several return statements and
# continuation lines -- confirm details against the full file.
335 def handle_curdata(self):
337 if len(self.opentags) == 0:
340 tag_thats_done = self.opentags[-1]
342 if len(self.curdata) == 0:
# a u'br' marker yields a newline unless the text already ends with one
345 if tag_thats_done == u'br':
346 if len(self.text) == 0 or self.text[-1] != '\n':
347 self.text = self.text + '\n'
348 self.ignorenodata = True
351 if len(self.curdata.strip()) == 0:
# block-level output: work out how much vertical space must precede it
354 if tag_thats_done in self.blockleveltags:
355 newlinerequired = self.text != u''
356 if self.ignorenodata:
357 newlinerequired = False
358 self.ignorenodata = False
# list-like items need a single newline, other blocks a blank line
360 if tag_thats_done in [u'dt', u'dd', u'li'] \
361 and len(self.text) > 1 \
362 and self.text[-1] != u'\n':
363 self.text = self.text + u'\n'
364 elif len(self.text) > 2 \
365 and self.text[-1] != u'\n' \
366 and self.text[-2] != u'\n':
367 self.text = self.text + u'\n\n'
369 if tag_thats_done in ["h1", "h2", "h3", "h4", "h5", "h6"]:
# collapse all runs of whitespace in the heading to single spaces
372 headingtext = " ".join(self.curdata.split())
373 seperator = u'\n' + u' '*self.indentlevel
374 headingtext = seperator.join( \
377 self.textwidth - self.indentlevel \
# the underline character depends on the heading level (values not visible)
381 if tag_thats_done == u'h2':
383 elif tag_thats_done != u'h1':
# a heading that wrapped onto several lines is underlined across the full
# available width, a single-line heading only as wide as its text
386 if u'\n' in headingtext:
387 underline = u' ' * self.indentlevel \
388 + underlinechar * (self.textwidth - self.indentlevel)
390 underline = u' ' * self.indentlevel \
391 + underlinechar * len(headingtext)
392 self.text = self.text \
393 + headingtext + u'\n' \
395 elif tag_thats_done in [u'p', u'div']:
396 paragraph = unicode( \
397 " ".join(self.curdata.strip().encode("utf-8").split()), \
399 seperator = u'\n' + u' ' * self.indentlevel
400 self.text = self.text \
401 + u' ' * self.indentlevel \
404 paragraph, self.textwidth - self.indentlevel))
# preformatted text is passed through without whitespace normalisation
405 elif tag_thats_done == "pre":
406 self.text = self.text + unicode( \
407 self.curdata.encode("utf-8"), "utf-8")
408 elif tag_thats_done == u'blockquote':
410 " ".join(self.curdata.encode("utf-8").strip().split()), \
412 seperator = u'\n' + u' ' * self.indentlevel + u' '
413 if len(self.text) > 0 and self.text[-1] != u'\n':
414 self.text = self.text + u'\n'
415 self.text = self.text \
420 self.textwidth - self.indentlevel - 2 \
424 elif tag_thats_done == "li":
425 item = unicode(self.curdata.encode("utf-8").strip(), "utf-8")
426 if len(self.text) > 0 and self.text[-1] != u'\n':
427 self.text = self.text + u'\n'
428 # work out if we're in an ol rather than a ul
429 latesttags = self.opentags[-4:]
432 for thing in latesttags:
# numbered marker for ordered lists; the counter advances per item
446 listmarker = u' %2d. ' %(self.listcount[-1])
447 self.listcount[-1] = self.listcount[-1] + 1
450 + u' ' * self.indentlevel \
452 self.text = self.text \
453 + u' ' * self.indentlevel \
458 self.textwidth - self.indentlevel - listindent \
462 elif tag_thats_done == u'dt':
463 definition = unicode(" ".join( \
464 self.curdata.encode("utf-8").strip().split()), \
466 if len(self.text) > 0 and self.text[-1] != u'\n':
467 self.text = self.text + u'\n\n'
468 elif len(self.text) > 1 and self.text[-2] != u'\n':
469 self.text = self.text + u'\n'
# terms are outdented by 4 relative to indentlevel and suffixed with "::"
470 definition = u' ' * (self.indentlevel - 4) + definition + "::"
471 indentstring = u'\n' + u' ' * (self.indentlevel - 3)
472 self.text = self.text \
474 textwrap.wrap(definition, \
475 self.textwidth - self.indentlevel - 4))
477 elif tag_thats_done == u'dd':
478 definition = unicode(" ".join( \
479 self.curdata.encode("utf-8").strip().split()),
481 if len(definition) > 0:
482 if len(self.text) > 0 and self.text[-1] != u'\n':
483 self.text = self.text + u'\n'
484 indentstring = u'\n' + u' ' * self.indentlevel
485 self.text = self.text \
487 + indentstring.join( \
490 self.textwidth - self.indentlevel \
494 elif tag_thats_done == u'a':
# closes the backtick opened in handle_starttag, giving `text`__
495 self.curdata = self.curdata + u'`__'
497 elif tag_thats_done in self.liststarttags:
500 if tag_thats_done in self.blockleveltags:
503 self.ignorenodata = False
# Parser callback for a closing tag.  Resets ignorenodata, locates the tag
# in self.opentags, undoes the indentation added when a list was opened,
# flushes pending text through handle_curdata and trims self.opentags back
# to the tag being closed.
# NOTE(review): several lines (exception handling around index(), returns,
# the final pop) are not visible in this excerpt.
505 def handle_endtag(self, tag):
506 self.ignorenodata = False
# position of the tag among the currently open tags
511 tagindex = self.opentags.index(tag)
516 if tag in [u'br', u'img']:
520 self.indentlevel = self.indentlevel - 4
522 if tag in self.liststarttags:
523 if tag in [u'ol', u'dl', u'ul', u'dd']:
524 self.handle_curdata()
525 # find if there was a previous list level
526 smalllist = self.opentags[:-1]
# mirror of handle_starttag: remove 4 for an enclosing ol/dl, 3 for a ul
528 for prev_listtag in smalllist:
529 if prev_listtag in [u'ol', u'dl']:
530 self.indentlevel = self.indentlevel - 4
532 elif prev_listtag == u'ul':
533 self.indentlevel = self.indentlevel - 3
# drop the innermost list counter
537 self.listcount = self.listcount[:-1]
# advance to the last occurrence of the tag in the open-tag stack
539 while tagindex < len(self.opentags) \
540 and tag in self.opentags[tagindex+1:]:
542 tagindex = self.opentags.index(tag, tagindex+1)
544 # well, we don't want to do that then
546 if tagindex != len(self.opentags) - 1:
547 # Assuming the data was for the last opened tag first
548 self.handle_curdata()
549 # Now kill the list to be a slice before this tag was opened
550 self.opentags = self.opentags[:tagindex + 1]
552 self.handle_curdata()
553 if self.opentags[-1] == tag:
def handle_data(self, data):
    """Append a run of character data to the pending text buffer.

    Text that arrives while no tag is open is treated as an implicit
    paragraph: u'p' is pushed onto ``self.opentags`` so that the data is
    later formatted like a <p> block.

    ``data`` may be a UTF-8 encoded byte string or already-decoded text.
    Only byte strings are decoded; calling ``.decode`` unconditionally
    fails for decoded text (UnicodeEncodeError for non-ASCII unicode on
    Python 2, AttributeError for str on Python 3).
    """
    if len(self.opentags) == 0:
        self.opentags.append(u'p')
    # type(u'') is ``unicode`` on Python 2 and ``str`` on Python 3, so this
    # check works on both without naming either type directly.
    if not isinstance(data, type(u'')):
        data = data.decode("utf-8")
    self.curdata = self.curdata + data
# Parser callback for numeric character references.  name is the reference
# without the leading "&#": it is converted to the corresponding unicode
# character and appended to self.curdata.
# NOTE(review): the try/except and branch lines are not visible in this
# excerpt; the u'#%s' assignments are presumably the fallbacks used when the
# conversion fails -- confirm against the full file.
561 def handle_charref(self, name):
# decimal form
563 entity = unichr(int(name))
# hexadecimal form: name starts with "x", so '0' + name parses as base 16
567 entity = unichr(int('0%s' %(name,), 16))
569 entity = u'#%s' %(name,)
571 entity = u'#%s' %(name,)
572 self.curdata = self.curdata + unicode(entity.encode('utf-8'), \
# Parser callback for named entity references.  Names found in the
# class-level HTML2Text.entities table are replaced by the mapped value;
# the other assignment rebuilds the literal "&name;" text (presumably the
# else branch for unknown names -- the branch line is not visible here).
# The result is appended to self.curdata.
575 def handle_entityref(self, name):
577 if HTML2Text.entities.has_key(name):
578 entity = HTML2Text.entities[name]
580 entity = "&" + name + ";"
582 self.curdata = self.curdata + unicode(entity.encode('utf-8'), \
# NOTE(review): the def line for this body is not visible in this excerpt;
# it is presumably gettext(), which parse_and_deliver calls on HTML2Text
# instances -- confirm against the full file.
# Flushes any pending data, normalises the end of the text to a single
# newline, then appends the collected link targets and image definitions.
586 self.handle_curdata()
587 if len(self.text) == 0 or self.text[-1] != u'\n':
588 self.text = self.text + u'\n'
# strip all trailing newlines, then put exactly one back
590 if len(self.text) > 0:
591 while len(self.text) > 1 and self.text[-1] == u'\n':
592 self.text = self.text[:-1]
593 self.text = self.text + u'\n'
# one "__ url" line per link, in the order the links were seen
594 if len(self.urls) > 0:
595 self.text = self.text + u'\n__ ' + u'\n__ '.join(self.urls) + u'\n'
# one "|key| image:: url" definition per registered image
597 if len(self.images.keys()) > 0:
598 self.text = self.text + u'\n.. ' \
600 ["|%s| image:: %s" %(a, self.images[a]["url"]) \
601 for a in self.images.keys()]) + u'\n'
# Perform an HTTP request for url, making at most three attempts while
# following redirects.
# method -- HTTP method string; parse_and_deliver passes "HEAD" and "GET"
# url    -- URL to request; it is split into scheme, host, port and path
# The connection is made with httplib.HTTPConnection, i.e. plain HTTP.
# NOTE(review): the return statements are not visible in this excerpt;
# callers use the result as a response object (getheaders()) -- confirm what
# is returned on failure.
605 def open_url(method, url):
607 while redirectcount < 3:
608 (type, rest) = urllib.splittype(url)
609 (host, path) = urllib.splithost(rest)
610 (host, port) = urllib.splitport(host)
614 conn = httplib.HTTPConnection("%s:%s" %(host, port))
615 conn.request(method, path)
616 response = conn.getresponse()
# redirect: look for the Location header (header names compared lower-case)
617 if response.status in [301, 302, 303, 307]:
618 headers = response.getheaders()
619 for header in headers:
620 if header[0] == "location":
622 elif response.status == 200:
626 redirectcount = redirectcount + 1
# Fetch the feed at url and deliver one multipart/alternative message
# (HTML part plus a plain-text rendering made with HTML2Text) per new or
# changed item into the given maildir.
# maildir  -- path of the target maildir; files are written under tmp/ and
#             then placed under new/
# url      -- feed URL; also used in the state keys and in message headers
# statedir -- directory holding the dbm state files "feeds" and "seen"
# NOTE(review): many lines of this function are not visible in this excerpt
# (the embedded numbering skips), including try/except/else lines, the file
# open/close calls and the attach calls -- confirm details against the full
# file.
629 def parse_and_deliver(maildir, url, statedir):
632 # first check if we know about this feed already
# "feeds" caches selected response headers per feed URL, urlencoded
633 feeddb = dbm.open(os.path.join(statedir, "feeds"), "c")
634 if feeddb.has_key(url):
636 data = cgi.parse_qs(data)
# a HEAD request is compared against the cached headers to decide whether
# the feed needs to be downloaded again
637 response = open_url("HEAD", url)
640 headers = response.getheaders()
643 for header in headers:
644 if header[0] == "content-length":
645 if header[1] != data["content-length"][0]:
647 elif header[0] == "etag":
648 if header[1] != data["etag"][0]:
650 elif header[0] == "last-modified":
651 if header[1] != data["last-modified"][0]:
653 elif header[0] == "content-md5":
654 if header[1] != data["content-md5"][0]:
659 response = open_url("GET", url)
661 headers = response.getheaders()
662 feedhandle = response
664 sys.stderr.write("Failed to fetch feed: %s\n" %(url))
667 return # don't need to do anything, nothings changed.
# feed not cached yet (presumably the else branch): fetch it unconditionally
669 response = open_url("GET", url)
671 headers = response.getheaders()
672 feedhandle = response
674 sys.stderr.write("Failed to fetch feed: %s\n" %(url))
677 fp = feedparser.parse(feedhandle)
# "seen" holds per-item state, keyed by "feedurl|link" and "feedurl|guid";
# values are urlencoded message-id / created / contentmd5 fields
678 db = dbm.open(os.path.join(statedir, "seen"), "c")
679 for item in fp["items"]:
680 # have we seen it before?
681 # need to work out what the content is first...
683 if item.has_key("content"):
684 content = item["content"][0]["value"]
686 if item.has_key("description"):
687 content = item["description"]
# content hash, compared with the stored contentmd5 to detect changes
691 md5sum = md5.md5(content.encode("utf-8")).hexdigest()
696 db_link_key = (url + u'|' + item["link"]).encode("utf-8")
698 # check if there's a guid too - if that exists and we match the md5,
700 if item.has_key("guid"):
701 db_guid_key = (url + u'|' + item["guid"]).encode("utf-8")
702 if db.has_key(db_guid_key):
703 data = db[db_guid_key]
704 data = cgi.parse_qs(data)
705 if data["contentmd5"][0] == md5sum:
708 if db.has_key(db_link_key):
709 data = db[db_link_key]
710 data = cgi.parse_qs(data)
# the earlier message-id is kept so the new message can reference it
711 if data.has_key("message-id"):
712 prevmessageid = data["message-id"][0]
713 if data["contentmd5"][0] == md5sum:
717 author = item["author"]
721 # create a basic email message
722 msg = MIMEMultipart("alternative")
724 + datetime.datetime.now().strftime("%Y%m%d%H%M") \
728 string.ascii_letters + string.digits \
729 ) for a in range(0,6) \
730 ]) + "@" + socket.gethostname() + ">"
731 msg.add_header("Message-ID", messageid)
732 msg.set_unixfrom("\"%s\" <rss2maildir@localhost>" %(url))
733 msg.add_header("From", "\"%s\" <rss2maildir@localhost>" %(author))
734 msg.add_header("To", "\"%s\" <rss2maildir@localhost>" %(url))
736 msg.add_header("References", prevmessageid)
737 createddate = datetime.datetime.now() \
738 .strftime("%a, %e %b %Y %T -0000")
740 createddate = datetime.datetime(*item["updated_parsed"][0:6]) \
741 .strftime("%a, %e %b %Y %T -0000")
744 msg.add_header("Date", createddate)
745 msg.add_header("X-rss2maildir-rundate", datetime.datetime.now() \
746 .strftime("%a, %e %b %Y %T -0000")
# the subject is the item title rendered to plain text
747 subj_gen = HTML2Text()
748 title = item["title"]
# NOTE(review): as written, the two substitutions below replace a character
# with itself and so change nothing; the patterns may originally have been
# HTML entities -- confirm against the full file.
749 title = re.sub(u'<', u'<', title)
750 title = re.sub(u'>', u'>', title)
751 subj_gen.feed(title.encode("utf-8"))
752 msg.add_header("Subject", subj_gen.gettext())
753 msg.set_default_type("text/plain")
755 htmlcontent = content.encode("utf-8")
756 htmlcontent = "%s\n\n<p>Item URL: <a href='%s'>%s</a></p>" %( \
760 htmlpart = MIMEText(htmlcontent.encode("utf-8"), "html", "utf-8")
761 textparser = HTML2Text()
762 textparser.feed(content.encode("utf-8"))
763 textcontent = textparser.gettext()
764 textcontent = "%s\n\nItem URL: %s" %( \
767 textpart = MIMEText(textcontent.encode("utf-8"), "plain", "utf-8")
771 # start by working out the filename we should be writting to, we do
772 # this following the normal maildir style rules
773 fname = str(os.getpid()) \
774 + "." + socket.gethostname() \
777 string.ascii_letters + string.digits \
778 ) for a in range(0,10) \
780 + datetime.datetime.now().strftime('%s')
781 fn = os.path.join(maildir, "tmp", fname)
783 fh.write(msg.as_string())
785 # now move it in to the new directory
786 newfn = os.path.join(maildir, "new", fname)
790 # now add to the database about the item
# chain the message ids so later updates can reference all earlier messages
792 messageid = prevmessageid + " " + messageid
793 if item.has_key("guid") and item["guid"] != item["link"]:
794 data = urllib.urlencode(( \
795 ("message-id", messageid), \
796 ("created", createddate), \
797 ("contentmd5", md5sum) \
799 db[db_guid_key] = data
801 data = db[db_link_key]
802 data = cgi.parse_qs(data)
803 newdata = urllib.urlencode(( \
804 ("message-id", messageid), \
805 ("created", data["created"][0]), \
806 ("contentmd5", data["contentmd5"][0]) \
808 db[db_link_key] = newdata
810 db[db_link_key] = data
812 data = urllib.urlencode(( \
813 ("message-id", messageid), \
814 ("created", createddate), \
815 ("contentmd5", md5sum) \
817 db[db_link_key] = data
# collect the same header names that the HEAD comparison above checks
821 for header in headers:
823 ["content-md5", "etag", "last-modified", "content-length"]:
824 data.append((header[0], header[1]))
826 data = urllib.urlencode(data)
# Script entry point: parse the command-line options, locate the config
# file, make sure the state directory and the maildir root exist, then make
# sure each feed section has a maildir (new/cur/tmp) and process it with
# parse_and_deliver.
# NOTE(review): many lines of this script body are not visible in this
# excerpt (try/except/else lines, sys.exit calls, the config read and the
# openers of several calls) -- confirm details against the full file.
832 if __name__ == "__main__":
833 # This only gets executed if we really called the program
834 # first off, parse the command line arguments
836 oparser = OptionParser()
838 "-c", "--conf", dest="conf",
839 help="location of config file"
842 "-s", "--statedir", dest="statedir",
843 help="location of directory to store state in"
846 (options, args) = oparser.parse_args()
848 # check for the configfile
# an explicit --conf is used if given; the default locations checked
# further down are ~/.rss2maildir.conf and /etc/rss2maildir.conf
852 if options.conf != None:
853 # does the file exist?
855 os.stat(options.conf)
856 configfile = options.conf
858 # should exit here as the specified file doesn't exist
860 "Config file %s does not exist. Exiting.\n" %(options.conf,))
863 # check through the default locations
865 os.stat("%s/.rss2maildir.conf" %(os.environ["HOME"],))
866 configfile = "%s/.rss2maildir.conf" %(os.environ["HOME"],)
869 os.stat("/etc/rss2maildir.conf")
870 configfile = "/etc/rss2maildir.conf"
872 sys.stderr.write("No config file found. Exiting.\n")
875 # Right - if we've got this far, we've got a config file, now for the hard
878 scp = SafeConfigParser()
# default maildir root; [general] maildir_root in the config overrides it
881 maildir_root = "RSSMaildir"
# state directory: the --statedir option is checked first, then the
# [general] state_dir setting from the config
884 if options.statedir != None:
885 state_dir = options.statedir
887 mode = os.stat(state_dir)[stat.ST_MODE]
888 if not stat.S_ISDIR(mode):
890 "State directory (%s) is not a directory\n" %(state_dir))
893 # try to make the directory
897 sys.stderr.write("Couldn't create statedir %s" %(state_dir))
899 elif scp.has_option("general", "state_dir"):
900 new_state_dir = scp.get("general", "state_dir")
# NOTE(review): the "is not a directory" message below formats state_dir
# although the directory being checked is new_state_dir -- confirm whether
# that is intended.
902 mode = os.stat(new_state_dir)[stat.ST_MODE]
903 if not stat.S_ISDIR(mode):
905 "State directory (%s) is not a directory\n" %(state_dir))
908 state_dir = new_state_dir
912 os.mkdir(new_state_dir)
913 state_dir = new_state_dir
916 "Couldn't create state directory %s\n" %(new_state_dir))
920 mode = os.stat(state_dir)[stat.ST_MODE]
921 if not stat.S_ISDIR(mode):
923 "State directory %s is not a directory\n" %(state_dir))
930 "State directory %s could not be created\n" %(state_dir))
933 if scp.has_option("general", "maildir_root"):
934 maildir_root = scp.get("general", "maildir_root")
937 mode = os.stat(maildir_root)[stat.ST_MODE]
938 if not stat.S_ISDIR(mode):
940 "Maildir Root %s is not a directory\n" \
945 os.mkdir(maildir_root)
947 sys.stderr.write("Couldn't create Maildir Root %s\n" \
# every config section except "general" is treated as a feed
951 feeds = scp.sections()
953 feeds.remove("general")
# the section name is passed to parse_and_deliver as the feed URL
957 for section in feeds:
958 # check if the directory exists
961 maildir = scp.get(section, "maildir")
# urlencode the name and keep the value part; presumably to obtain a string
# that is safe to use as a directory name
965 maildir = urllib.urlencode(((section, maildir),)).split("=")[1]
966 maildir = os.path.join(maildir_root, maildir)
969 exists = os.stat(maildir)
970 if stat.S_ISDIR(exists[stat.ST_MODE]):
971 # check if there's a new, cur and tmp directory
973 mode = os.stat(os.path.join(maildir, "cur"))[stat.ST_MODE]
975 os.mkdir(os.path.join(maildir, "cur"))
976 if not stat.S_ISDIR(mode):
977 sys.stderr.write("Broken maildir: %s\n" %(maildir))
979 mode = os.stat(os.path.join(maildir, "tmp"))[stat.ST_MODE]
981 os.mkdir(os.path.join(maildir, "tmp"))
982 if not stat.S_ISDIR(mode):
983 sys.stderr.write("Broken maildir: %s\n" %(maildir))
985 mode = os.stat(os.path.join(maildir, "new"))[stat.ST_MODE]
986 if not stat.S_ISDIR(mode):
987 sys.stderr.write("Broken maildir: %s\n" %(maildir))
989 os.mkdir(os.path.join(maildir, "new"))
991 sys.stderr.write("Broken maildir: %s\n" %(maildir))
996 sys.stderr.write("Couldn't create root maildir %s\n" \
1000 os.mkdir(os.path.join(maildir, "new"))
1001 os.mkdir(os.path.join(maildir, "cur"))
1002 os.mkdir(os.path.join(maildir, "tmp"))
1005 "Couldn't create required maildir directories for %s\n" \
1009 # right - we've got the directories, we've got the section, we know the
1012 parse_and_deliver(maildir, section, state_dir)