4 # rss2maildir.py - RSS feeds to Maildir 1 email per item
5 # Copyright (C) 2007 Brett Parker <iDunno@sommitrealweird.co.uk>
7 # This program is free software: you can redistribute it and/or modify
8 # it under the terms of the GNU General Public License as published by
9 # the Free Software Foundation, either version 3 of the License, or
10 # (at your option) any later version.
12 # This program is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 # GNU General Public License for more details.
17 # You should have received a copy of the GNU General Public License
18 # along with this program. If not, see <http://www.gnu.org/licenses/>.
28 from email.MIMEMultipart import MIMEMultipart
29 from email.MIMEText import MIMEText
38 from optparse import OptionParser
39 from ConfigParser import SafeConfigParser
41 from base64 import b64encode
45 if sys.version_info[0] == 2 and sys.version_info[1] >= 6:
55 from HTMLParser import HTMLParser
# NOTE(review): this listing is an elided, line-numbered dump; the numeric
# prefixes and the gaps between them are artefacts of the extraction, not
# part of the original Python 2 source.
# HTML2Text: HTMLParser subclass that renders an HTML fragment to plain
# text in a reStructuredText-ish style (`link`__ references, |alt| image
# substitutions, underlined headings, indented lists).
57 class HTML2Text(HTMLParser):
# Build a converter whose output is wrapped at `textwidth` columns.
215 def __init__(self,textwidth=70):
218 self.textwidth = textwidth
# When True, handle_curdata() skips the next block-separator newline
# (set after an explicit <br> so it is not doubled up).
221 self.ignorenodata = False
# NOTE(review): the tag tables the handlers rely on (blockleveltags,
# liststarttags, cancontainflow, entities) and other state containers
# (opentags, curdata, text, urls, images, listcount, indentlevel) are
# initialised on lines elided from this extract — confirm upstream.
225 HTMLParser.__init__(self)
# Open-tag callback: flushes pending text and adjusts indentation /
# list-numbering state for block-level tags, then records the tag on
# the self.opentags stack so handle_curdata() knows how to format.
227 def handle_starttag(self, tag, attrs):
228 tag_name = tag.lower()
229 if tag_name in self.blockleveltags:
230 # handle starting a new block - unless we're in a block element
231 # that can contain other blocks, we'll assume that we want to close
233 if len(self.opentags) > 1 and self.opentags[-1] == u'li':
234 self.handle_curdata()
# Each <ol> pushes a fresh item counter; nesting depth is derived
# from the number of active counters.
236 if tag_name == u'ol':
237 self.handle_curdata()
238 self.listcount.append(1)
239 self.listlevel = len(self.listcount) - 1
241 if tag_name == u'dl':
242 self.indentlevel = self.indentlevel + 4
# Starting a nested list inside another list widens the indent:
# 4 columns under dl/ol markers, 3 under ul bullets.
244 if tag_name in self.liststarttags:
245 smallist = self.opentags[-3:-1]
247 for prev_listtag in smallist:
248 if prev_listtag in [u'dl', u'ol']:
249 self.indentlevel = self.indentlevel + 4
251 elif prev_listtag == u'ul':
252 self.indentlevel = self.indentlevel + 3
255 if len(self.opentags) > 0:
256 self.handle_curdata()
257 if tag_name not in self.cancontainflow:
259 self.opentags.append(tag_name)
261 if tag_name == "span":
265 listcount = self.listcount[-1]
# A <dd> directly after an open <dt> (and vice versa) terminates the
# sibling entry, so flush before switching.
269 if tag_name == u'dd' and len(self.opentags) > 1 \
270 and self.opentags[-1] == u'dt':
271 self.handle_curdata()
273 elif tag_name == u'dt' and len(self.opentags) > 1 \
274 and self.opentags[-1] == u'dd':
275 self.handle_curdata()
# Anchors: remember the href for the trailing link-target block and
# open the reST inline-link markup (closed as `__ in handle_curdata).
277 elif tag_name == u'a':
279 if attr[0].lower() == u'href':
280 self.urls.append(attr[1])
281 self.curdata = self.curdata + u'`'
282 self.opentags.append(tag_name)
284 elif tag_name == u'img':
285 self.handle_image(attrs)
287 elif tag_name == u'br':
291 # we don't know the tag, so lets avoid handling it!
# Self-closing tag callback (<br/>, <img/>): dispatch to the same
# handling as the paired-tag forms.
294 def handle_startendtag(self, tag, attrs):
295 if tag.lower() == u'br':
297 elif tag.lower() == u'img':
298 self.handle_image(attrs)
# NOTE(review): the def line for the helper below is elided — lines
# 302-304 read like the body of a handle_br() method that flushes the
# current data around an explicit line break. Confirm upstream.
302 self.handle_curdata()
303 self.opentags.append(u'br')
304 self.handle_curdata()
# Record an <img> as a reST substitution: stores alt/src in
# self.images and appends a |placeholder| reference to curdata.
307 def handle_image(self, attrs):
# NOTE(review): the loop header over `attrs` and several branch lines
# are elided; `attr` is the (name, value) attribute pair in scope.
312 if isinstance(attr[1], str):
313 alt = u'%s' %(attr[1])
316 elif attr[0] == 'src':
317 if isinstance(attr[1], str):
318 url = u'%s' %(attr[1])
# Alt text present and already seen: reuse the substitution if the
# URL matches, otherwise probe for an unused alt-derived key.
323 if self.images.has_key(alt):
324 if self.images[alt]["url"] == url:
325 self.curdata = self.curdata \
328 while self.images.has_key(alt):
330 self.images[alt] = {"url": url}
331 self.curdata = self.curdata \
# First time this alt text appears: register it directly.
334 self.images[alt] = {"url": url}
335 self.curdata = self.curdata \
# No alt text at all: key the substitution by the URL itself.
338 if self.images.has_key(url):
339 self.curdata = self.curdata \
342 self.images[url] = {}
343 self.images[url]["url"] =url
344 self.curdata = self.curdata \
# Core formatter: flush self.curdata into self.text, laid out
# according to the innermost open tag — headings get underlines,
# p/div get wrapped paragraphs, pre is passed through verbatim,
# blockquote/li/dt/dd get indents and markers, and <a> closes the
# reST `link`__ markup. Heavy Python 2 unicode/str round-tripping.
347 def handle_curdata(self):
349 if len(self.opentags) == 0:
352 tag_thats_done = self.opentags[-1]
354 if len(self.curdata) == 0:
# Explicit line break: emit at most one newline and arm
# ignorenodata so the next block doesn't add another separator.
357 if tag_thats_done == u'br':
358 if len(self.text) == 0 or self.text[-1] != '\n':
359 self.text = self.text + '\n'
360 self.ignorenodata = True
363 if len(self.curdata.strip()) == 0:
# Block-level tags are separated from earlier output: one newline
# for list-ish entries, a blank line otherwise.
366 if tag_thats_done in self.blockleveltags:
367 newlinerequired = self.text != u''
368 if self.ignorenodata:
369 newlinerequired = False
370 self.ignorenodata = False
372 if tag_thats_done in [u'dt', u'dd', u'li'] \
373 and len(self.text) > 1 \
374 and self.text[-1] != u'\n':
375 self.text = self.text + u'\n'
376 elif len(self.text) > 2 \
377 and self.text[-1] != u'\n' \
378 and self.text[-2] != u'\n':
379 self.text = self.text + u'\n\n'
# Headings: collapse whitespace, wrap, then underline with a char
# chosen by heading level (h1/h2 handled specially on elided lines).
381 if tag_thats_done in ["h1", "h2", "h3", "h4", "h5", "h6"]:
384 headingtext = " ".join(self.curdata.split())
385 seperator = u'\n' + u' '*self.indentlevel
386 headingtext = seperator.join( \
389 self.textwidth - self.indentlevel \
393 if tag_thats_done == u'h2':
395 elif tag_thats_done != u'h1':
# Multi-line headings get a full-width underline; single-line
# headings an underline matching the text length.
398 if u'\n' in headingtext:
399 underline = u' ' * self.indentlevel \
400 + underlinechar * (self.textwidth - self.indentlevel)
402 underline = u' ' * self.indentlevel \
403 + underlinechar * len(headingtext)
404 self.text = self.text \
405 + headingtext + u'\n' \
# Paragraphs: collapse internal whitespace and wrap to width.
407 elif tag_thats_done in [u'p', u'div']:
408 paragraph = unicode( \
409 " ".join(self.curdata.strip().encode("utf-8").split()), \
411 seperator = u'\n' + u' ' * self.indentlevel
412 self.text = self.text \
413 + u' ' * self.indentlevel \
416 paragraph, self.textwidth - self.indentlevel))
# Preformatted text is copied through untouched.
417 elif tag_thats_done == "pre":
418 self.text = self.text + unicode( \
419 self.curdata.encode("utf-8"), "utf-8")
420 elif tag_thats_done == u'blockquote':
422 " ".join(self.curdata.encode("utf-8").strip().split()), \
424 seperator = u'\n' + u' ' * self.indentlevel + u' '
425 if len(self.text) > 0 and self.text[-1] != u'\n':
426 self.text = self.text + u'\n'
427 self.text = self.text \
432 self.textwidth - self.indentlevel - 2 \
# List items: pick a "%2d." number for ol or a bullet for ul (the
# ol/ul detection over latesttags is on elided lines).
436 elif tag_thats_done == "li":
437 item = unicode(self.curdata.encode("utf-8").strip(), "utf-8")
438 if len(self.text) > 0 and self.text[-1] != u'\n':
439 self.text = self.text + u'\n'
440 # work out if we're in an ol rather than a ul
441 latesttags = self.opentags[-4:]
444 for thing in latesttags:
458 listmarker = u' %2d. ' %(self.listcount[-1])
459 self.listcount[-1] = self.listcount[-1] + 1
462 + u' ' * self.indentlevel \
464 self.text = self.text \
465 + u' ' * self.indentlevel \
470 self.textwidth - self.indentlevel - listindent \
# Definition terms: out-dented, suffixed with "::".
474 elif tag_thats_done == u'dt':
475 definition = unicode(" ".join( \
476 self.curdata.encode("utf-8").strip().split()), \
478 if len(self.text) > 0 and self.text[-1] != u'\n':
479 self.text = self.text + u'\n\n'
480 elif len(self.text) > 1 and self.text[-2] != u'\n':
481 self.text = self.text + u'\n'
482 definition = u' ' * (self.indentlevel - 4) + definition + "::"
483 indentstring = u'\n' + u' ' * (self.indentlevel - 3)
484 self.text = self.text \
486 textwrap.wrap(definition, \
487 self.textwidth - self.indentlevel - 4))
# Definition bodies: wrapped at the current indent, skipped when empty.
489 elif tag_thats_done == u'dd':
490 definition = unicode(" ".join( \
491 self.curdata.encode("utf-8").strip().split()),
493 if len(definition) > 0:
494 if len(self.text) > 0 and self.text[-1] != u'\n':
495 self.text = self.text + u'\n'
496 indentstring = u'\n' + u' ' * self.indentlevel
497 self.text = self.text \
499 + indentstring.join( \
502 self.textwidth - self.indentlevel \
# Close the `...`__ anonymous-reference markup opened in
# handle_starttag for <a href=...>.
506 elif tag_thats_done == u'a':
507 self.curdata = self.curdata + u'`__'
509 elif tag_thats_done in self.liststarttags:
512 if tag_thats_done in self.blockleveltags:
515 self.ignorenodata = False
# Close-tag callback: undo the indentation added for list containers,
# flush pending data for anything opened after this tag, then pop the
# tag (and everything above it) off the opentags stack.
517 def handle_endtag(self, tag):
518 self.ignorenodata = False
# May raise ValueError if the tag was never opened; the elided lines
# around it presumably catch that — confirm upstream.
523 tagindex = self.opentags.index(tag)
528 if tag in [u'br', u'img']:
532 self.indentlevel = self.indentlevel - 4
# Mirror of handle_starttag: shrink indent by 4 for dl/ol parents,
# 3 for ul, and drop the finished ol's item counter.
534 if tag in self.liststarttags:
535 if tag in [u'ol', u'dl', u'ul', u'dd']:
536 self.handle_curdata()
537 # find if there was a previous list level
538 smalllist = self.opentags[:-1]
540 for prev_listtag in smalllist:
541 if prev_listtag in [u'ol', u'dl']:
542 self.indentlevel = self.indentlevel - 4
544 elif prev_listtag == u'ul':
545 self.indentlevel = self.indentlevel - 3
549 self.listcount = self.listcount[:-1]
# If the same tag was opened more than once, close the innermost
# occurrence: walk tagindex forward to the last match.
551 while tagindex < len(self.opentags) \
552 and tag in self.opentags[tagindex+1:]:
554 tagindex = self.opentags.index(tag, tagindex+1)
556 # well, we don't want to do that then
558 if tagindex != len(self.opentags) - 1:
559 # Assuming the data was for the last opened tag first
560 self.handle_curdata()
561 # Now kill the list to be a slice before this tag was opened
562 self.opentags = self.opentags[:tagindex + 1]
564 self.handle_curdata()
565 if self.opentags[-1] == tag:
def handle_data(self, data):
    """Buffer freshly-parsed character data for the innermost open element.

    Text that arrives while no element is open is treated as paragraph
    content, so an implicit ``p`` is pushed onto the tag stack first.
    """
    if not self.opentags:
        # Stray text outside any element: pretend we're in a paragraph.
        self.opentags.append(u'p')
    self.curdata = "%s%s" %(self.curdata, data)
# Numeric character reference (&#NNN; or &#xNN;): convert to the
# corresponding unicode character and append it to curdata.
573 def handle_charref(self, name):
# NOTE(review): the control flow tying these assignments together is
# elided — presumably nested try/except ValueError: decimal first,
# then hex, then fall back to emitting the raw '#name' text. Confirm.
575 entity = unichr(int(name))
579 entity = unichr(int('0%s' %(name,), 16))
581 entity = u'#%s' %(name,)
583 entity = u'#%s' %(name,)
584 self.curdata = self.curdata + unicode(entity.encode('utf-8'), \
# Named entity reference (&amp; etc.): resolve via the class-level
# HTML2Text.entities table (defined on elided lines), otherwise
# re-emit the reference verbatim as "&name;".
587 def handle_entityref(self, name):
589 if HTML2Text.entities.has_key(name):
590 entity = HTML2Text.entities[name]
592 entity = "&" + name + ";"
594 self.curdata = self.curdata + unicode(entity.encode('utf-8'), \
# NOTE(review): the def line is elided — this is the body of what is
# presumably gettext(self), the accessor that finalises and returns
# the rendered text. Confirm the signature upstream.
# Flush any remaining buffered data, normalise the tail to exactly
# one trailing newline, then append the collected reST link targets
# ("__ url") and image substitution definitions (".. |alt| image::").
598 self.handle_curdata()
599 if len(self.text) == 0 or self.text[-1] != u'\n':
600 self.text = self.text + u'\n'
602 if len(self.text) > 0:
603 while len(self.text) > 1 and self.text[-1] == u'\n':
604 self.text = self.text[:-1]
605 self.text = self.text + u'\n'
606 if len(self.urls) > 0:
607 self.text = self.text + u'\n__ ' + u'\n__ '.join(self.urls) + u'\n'
609 if len(self.images.keys()) > 0:
610 self.text = self.text + u'\n.. ' \
612 ["|%s| image:: %s" %(a, self.images[a]["url"]) \
613 for a in self.images.keys()]) + u'\n'
# Fetch `url` with the given HTTP method, following up to 3 redirects
# (301/302/303/307). Uses the Python 2 urllib.split* helpers and
# httplib connection classes; appears to return the live response
# object on status 200 (return statements are on elided lines).
617 def open_url(method, url):
# NOTE(review): redirectcount initialisation (line 618) and the
# default-port / http-vs-https dispatch lines are elided.
619 while redirectcount < 3:
620 (type, rest) = urllib.splittype(url)
621 (host, path) = urllib.splithost(rest)
622 (host, port) = urllib.splitport(host)
631 conn = httplib.HTTPConnection("%s:%s" %(host, port))
633 conn = httplib.HTTPSConnection("%s:%s" %(host, port))
634 conn.request(method, path)
635 response = conn.getresponse()
636 if response.status in [301, 302, 303, 307]:
637 headers = response.getheaders()
638 for header in headers:
# On redirect, the Location header presumably becomes the new url
# for the next loop iteration (assignment elided).
639 if header[0] == "location":
641 elif response.status == 200:
645 redirectcount = redirectcount + 1
# Fetch one feed and deliver each unseen item into `maildir`.
# Flow: (1) if the feed is in the per-state "feeds" dbm, issue a HEAD
# request and compare cache-validator headers — unchanged feeds are
# skipped; (2) otherwise GET the feed, parse with feedparser,
# de-duplicate items by md5-of-content keyed on guid/link in the
# "seen" dbm, and write each new item as a multipart/alternative
# (plain + html) message using maildir tmp->new delivery; (3) record
# item state and the feed's validator headers (urlencoded) back into
# the dbms. Python 2 throughout (print statement, unicode, has_key).
648 def parse_and_deliver(maildir, url, statedir):
651 # first check if we know about this feed already
652 feeddb = dbm.open(os.path.join(statedir, "feeds"), "c")
653 if feeddb.has_key(url):
# Cached state is stored urlencoded; parse_qs yields lists of values.
655 data = cgi.parse_qs(data)
656 response = open_url("HEAD", url)
659 headers = response.getheaders()
# Any mismatch in these validators marks the feed as changed
# (the flag assignment lines are elided).
662 for header in headers:
663 if header[0] == "content-length":
664 if header[1] != data["content-length"][0]:
666 elif header[0] == "etag":
667 if header[1] != data["etag"][0]:
669 elif header[0] == "last-modified":
670 if header[1] != data["last-modified"][0]:
672 elif header[0] == "content-md5":
673 if header[1] != data["content-md5"][0]:
678 response = open_url("GET", url)
680 headers = response.getheaders()
681 feedhandle = response
683 sys.stderr.write("Failed to fetch feed: %s\n" %(url))
686 return # don't need to do anything, nothings changed.
# Unknown feed: plain GET, no conditional check.
688 response = open_url("GET", url)
690 headers = response.getheaders()
691 feedhandle = response
693 sys.stderr.write("Failed to fetch feed: %s\n" %(url))
696 fp = feedparser.parse(feedhandle)
697 db = dbm.open(os.path.join(statedir, "seen"), "c")
698 for item in fp["items"]:
699 # have we seen it before?
700 # need to work out what the content is first...
702 if item.has_key("content"):
703 content = item["content"][0]["value"]
705 if item.has_key("description"):
706 content = item["description"]
# Content hash is the de-duplication key for this item.
710 md5sum = md5.md5(content.encode("utf-8")).hexdigest()
712 # make sure content is unicode encoded
713 if not isinstance(content, unicode):
714 cd_res = chardet.detect(content)
715 chrset = cd_res['encoding']
716 print "detected charset %s for item %s" %(chrset, item["link"])
717 content = content.decode(chrset)
# Items without a link get a synthetic one from the content hash.
722 if not item.has_key("link"):
723 item["link"] = u'#' + md5sum
724 db_link_key = (url + u'|' + item["link"]).encode("utf-8")
726 # check if there's a guid too - if that exists and we match the md5,
728 if item.has_key("guid"):
729 db_guid_key = (url + u'|' + item["guid"]).encode("utf-8")
730 if db.has_key(db_guid_key):
731 data = db[db_guid_key]
732 data = cgi.parse_qs(data)
733 if data["contentmd5"][0] == md5sum:
# Seen under the link key: keep the previous Message-ID so the
# updated item can be threaded via References.
736 if db.has_key(db_link_key):
737 data = db[db_link_key]
738 data = cgi.parse_qs(data)
739 if data.has_key("message-id"):
740 prevmessageid = data["message-id"][0]
741 if data["contentmd5"][0] == md5sum:
745 author = item["author"]
749 # create a basic email message
750 msg = MIMEMultipart("alternative")
# Message-ID: timestamp plus random letters/digits at this host.
752 + datetime.datetime.now().strftime("%Y%m%d%H%M") \
756 string.ascii_letters + string.digits \
757 ) for a in range(0,6) \
758 ]) + "@" + socket.gethostname() + ">"
759 msg.add_header("Message-ID", messageid)
760 msg.set_unixfrom("\"%s\" <rss2maildir@localhost>" %(url))
761 msg.add_header("From", "\"%s\" <rss2maildir@localhost>" %(author.encode("utf-8")))
762 msg.add_header("To", "\"%s\" <rss2maildir@localhost>" %(url.encode("utf-8")))
# Thread updates of the same item under the original message.
764 msg.add_header("References", prevmessageid)
765 createddate = datetime.datetime.now() \
766 .strftime("%a, %e %b %Y %T -0000")
# Prefer the feed's own updated timestamp when it parsed.
768 createddate = datetime.datetime(*item["updated_parsed"][0:6]) \
769 .strftime("%a, %e %b %Y %T -0000")
772 msg.add_header("Date", createddate)
773 msg.add_header("X-rss2maildir-rundate", datetime.datetime.now() \
774 .strftime("%a, %e %b %Y %T -0000"))
775 subj_gen = HTML2Text()
776 title = item["title"]
# NOTE(review): these two substitutions look garbled by extraction —
# pattern and replacement are identical here; upstream presumably
# unescapes HTML entities for < and > before feeding the parser.
777 title = re.sub(u'<', u'<', title)
778 title = re.sub(u'>', u'>', title)
780 msg.add_header("Subject", subj_gen.gettext())
781 msg.set_default_type("text/plain")
# HTML part: original content plus an "Item URL" footer link.
783 htmlcontent = content.encode("utf-8")
784 htmlcontent = "%s\n\n<p>Item URL: <a href='%s'>%s</a></p>" %( \
788 htmlpart = MIMEText(htmlcontent.encode("utf-8"), "html", "utf-8")
# Plain-text part: the same content rendered through HTML2Text.
789 textparser = HTML2Text()
790 textparser.feed(content)
791 textcontent = textparser.gettext()
792 textcontent = "%s\n\nItem URL: %s" %( \
795 textpart = MIMEText(textcontent.encode("utf-8"), "plain", "utf-8")
799 # start by working out the filename we should be writting to, we do
800 # this following the normal maildir style rules
801 fname = str(os.getpid()) \
802 + "." + socket.gethostname() \
805 string.ascii_letters + string.digits \
806 ) for a in range(0,10) \
808 + datetime.datetime.now().strftime('%s')
809 fn = os.path.join(maildir, "tmp", fname)
811 fh.write(msg.as_string())
813 # now move it in to the new directory
814 newfn = os.path.join(maildir, "new", fname)
818 # now add to the database about the item
# Chain Message-IDs so later References headers carry the history.
820 messageid = prevmessageid + " " + messageid
821 if item.has_key("guid") and item["guid"] != item["link"]:
822 data = urllib.urlencode(( \
823 ("message-id", messageid), \
824 ("created", createddate), \
825 ("contentmd5", md5sum) \
827 db[db_guid_key] = data
# Refresh the link-keyed record but keep its original created /
# contentmd5 values.
829 data = db[db_link_key]
830 data = cgi.parse_qs(data)
831 newdata = urllib.urlencode(( \
832 ("message-id", messageid), \
833 ("created", data["created"][0]), \
834 ("contentmd5", data["contentmd5"][0]) \
836 db[db_link_key] = newdata
838 db[db_link_key] = data
840 data = urllib.urlencode(( \
841 ("message-id", messageid), \
842 ("created", createddate), \
843 ("contentmd5", md5sum) \
845 db[db_link_key] = data
# Persist the feed's cache-validator headers for the next HEAD check.
849 for header in headers:
851 ["content-md5", "etag", "last-modified", "content-length"]:
852 data.append((header[0], header[1]))
854 data = urllib.urlencode(data)
# Script entry point: parse -c/--conf and -s/--statedir options,
# locate the config file (explicit path, then ~/.rss2maildir.conf,
# then /etc/rss2maildir.conf), ensure the state directory and the
# maildir tree (new/cur/tmp) exist, then run parse_and_deliver() for
# every non-"general" config section (section name == feed URL).
860 if __name__ == "__main__":
861 # This only gets executed if we really called the program
862 # first off, parse the command line arguments
864 oparser = OptionParser()
866 "-c", "--conf", dest="conf",
867 help="location of config file"
870 "-s", "--statedir", dest="statedir",
871 help="location of directory to store state in"
874 (options, args) = oparser.parse_args()
876 # check for the configfile
880 if options.conf != None:
881 # does the file exist?
# os.stat is used purely as an existence probe; OSError handling
# is on elided lines.
883 os.stat(options.conf)
884 configfile = options.conf
886 # should exit here as the specified file doesn't exist
888 "Config file %s does not exist. Exiting.\n" %(options.conf,))
891 # check through the default locations
893 os.stat("%s/.rss2maildir.conf" %(os.environ["HOME"],))
894 configfile = "%s/.rss2maildir.conf" %(os.environ["HOME"],)
897 os.stat("/etc/rss2maildir.conf")
898 configfile = "/etc/rss2maildir.conf"
900 sys.stderr.write("No config file found. Exiting.\n")
903 # Right - if we've got this far, we've got a config file, now for the hard
906 scp = SafeConfigParser()
909 maildir_root = "RSSMaildir"
# State directory resolution: CLI option wins, then the [general]
# state_dir setting, then (on elided lines) a default.
912 if options.statedir != None:
913 state_dir = options.statedir
915 mode = os.stat(state_dir)[stat.ST_MODE]
916 if not stat.S_ISDIR(mode):
918 "State directory (%s) is not a directory\n" %(state_dir))
921 # try to make the directory
925 sys.stderr.write("Couldn't create statedir %s" %(state_dir))
927 elif scp.has_option("general", "state_dir"):
928 new_state_dir = scp.get("general", "state_dir")
930 mode = os.stat(new_state_dir)[stat.ST_MODE]
931 if not stat.S_ISDIR(mode):
# NOTE(review): this error reports state_dir rather than
# new_state_dir, which is the path actually being checked —
# looks like an upstream message bug; confirm before changing.
933 "State directory (%s) is not a directory\n" %(state_dir))
936 state_dir = new_state_dir
940 os.mkdir(new_state_dir)
941 state_dir = new_state_dir
944 "Couldn't create state directory %s\n" %(new_state_dir))
948 mode = os.stat(state_dir)[stat.ST_MODE]
949 if not stat.S_ISDIR(mode):
951 "State directory %s is not a directory\n" %(state_dir))
958 "State directory %s could not be created\n" %(state_dir))
961 if scp.has_option("general", "maildir_root"):
962 maildir_root = scp.get("general", "maildir_root")
965 mode = os.stat(maildir_root)[stat.ST_MODE]
966 if not stat.S_ISDIR(mode):
968 "Maildir Root %s is not a directory\n" \
973 os.mkdir(maildir_root)
975 sys.stderr.write("Couldn't create Maildir Root %s\n" \
979 feeds = scp.sections()
981 feeds.remove("general")
985 for section in feeds:
986 # check if the directory exists
989 maildir = scp.get(section, "maildir")
# urlencode the section/maildir pair so the directory name is
# filesystem-safe, then root it under maildir_root.
993 maildir = urllib.urlencode(((section, maildir),)).split("=")[1]
994 maildir = os.path.join(maildir_root, maildir)
997 exists = os.stat(maildir)
998 if stat.S_ISDIR(exists[stat.ST_MODE]):
999 # check if there's a new, cur and tmp directory
1001 mode = os.stat(os.path.join(maildir, "cur"))[stat.ST_MODE]
1003 os.mkdir(os.path.join(maildir, "cur"))
1004 if not stat.S_ISDIR(mode):
1005 sys.stderr.write("Broken maildir: %s\n" %(maildir))
1007 mode = os.stat(os.path.join(maildir, "tmp"))[stat.ST_MODE]
1009 os.mkdir(os.path.join(maildir, "tmp"))
1010 if not stat.S_ISDIR(mode):
1011 sys.stderr.write("Broken maildir: %s\n" %(maildir))
1013 mode = os.stat(os.path.join(maildir, "new"))[stat.ST_MODE]
1014 if not stat.S_ISDIR(mode):
1015 sys.stderr.write("Broken maildir: %s\n" %(maildir))
1017 os.mkdir(os.path.join(maildir, "new"))
1019 sys.stderr.write("Broken maildir: %s\n" %(maildir))
# Maildir missing entirely: create the whole new/cur/tmp tree.
1024 sys.stderr.write("Couldn't create root maildir %s\n" \
1028 os.mkdir(os.path.join(maildir, "new"))
1029 os.mkdir(os.path.join(maildir, "cur"))
1030 os.mkdir(os.path.join(maildir, "tmp"))
1033 "Couldn't create required maildir directories for %s\n" \
1037 # right - we've got the directories, we've got the section, we know the
1040 parse_and_deliver(maildir, section, state_dir)