4 # rss2maildir.py - RSS feeds to Maildir 1 email per item
5 # Copyright (C) 2007 Brett Parker <iDunno@sommitrealweird.co.uk>
7 # This program is free software: you can redistribute it and/or modify
8 # it under the terms of the GNU General Public License as published by
9 # the Free Software Foundation, either version 3 of the License, or
10 # (at your option) any later version.
12 # This program is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 # GNU General Public License for more details.
17 # You should have received a copy of the GNU General Public License
18 # along with this program. If not, see <http://www.gnu.org/licenses/>.
28 from email.MIMEMultipart import MIMEMultipart
29 from email.MIMEText import MIMEText
38 from optparse import OptionParser
39 from ConfigParser import SafeConfigParser
41 from base64 import b64encode
43 if sys.version_info[0] == 2 and sys.version_info[1] >= 6:
53 from HTMLParser import HTMLParser
55 class HTML2Text(HTMLParser):
# Constructor.  textwidth (default 70) is kept on the instance; the
# formatting code in handle_curdata() subtracts the current indent from it
# to get the wrap width.
213 def __init__(self,textwidth=70):
216 self.textwidth = textwidth
# ignorenodata is set by handle_curdata() after a <br> and cleared again by
# handle_curdata()/handle_endtag().
219 self.ignorenodata = False
# Initialise the HTMLParser base class explicitly (old-style call).
223 HTMLParser.__init__(self)
# Parser callback for an opening tag.
#   tag   -- tag name as reported by the parser (lower-cased here)
#   attrs -- sequence of (name, value) pairs
# Block-level tags, list starts, <a>, <img> and <br> each get their own
# handling below; pending text is flushed through handle_curdata() before
# the open-tag stack (self.opentags) changes.
# NOTE(review): several statements of this method are not visible in this
# listing, so the exact nesting of the branches below should be confirmed.
225 def handle_starttag(self, tag, attrs):
226 tag_name = tag.lower()
227 if tag_name in self.blockleveltags:
228 # handle starting a new block - unless we're in a block element
229 # that can contain other blocks, we'll assume that we want to close
# Text collected inside an open <li> is flushed before the nested block.
231 if len(self.opentags) > 1 and self.opentags[-1] == u'li':
232 self.handle_curdata()
# A new <ol> gets its own counter starting at 1; listlevel is the index of
# the innermost counter in self.listcount.
234 if tag_name == u'ol':
235 self.handle_curdata()
236 self.listcount.append(1)
237 self.listlevel = len(self.listcount) - 1
# Definition lists indent by 4 columns.
239 if tag_name == u'dl':
240 self.indentlevel = self.indentlevel + 4
# For list-start tags, inspect the two entries before the top of the stack
# (opentags[-3:-1]): an enclosing dl/ol adds 4 to the indent, a ul adds 3.
242 if tag_name in self.liststarttags:
243 smallist = self.opentags[-3:-1]
245 for prev_listtag in smallist:
246 if prev_listtag in [u'dl', u'ol']:
247 self.indentlevel = self.indentlevel + 4
249 elif prev_listtag == u'ul':
250 self.indentlevel = self.indentlevel + 3
# Flush whatever text belongs to the currently open tag.
253 if len(self.opentags) > 0:
254 self.handle_curdata()
255 if tag_name not in self.cancontainflow:
257 self.opentags.append(tag_name)
259 if tag_name == "span":
263 listcount = self.listcount[-1]
# A <dd> directly after an open <dt> (or a <dt> after an open <dd>) flushes
# the text of the previous definition-list element.
267 if tag_name == u'dd' and len(self.opentags) > 1 \
268 and self.opentags[-1] == u'dt':
269 self.handle_curdata()
271 elif tag_name == u'dt' and len(self.opentags) > 1 \
272 and self.opentags[-1] == u'dd':
273 self.handle_curdata()
# Links: the href value is decoded from UTF-8 and remembered in self.urls,
# and a backtick opens the link text; handle_curdata() later appends the
# closing u'`__'.
275 elif tag_name == u'a':
277 if attr[0].lower() == u'href':
278 self.urls.append(attr[1].decode('utf-8'))
279 self.curdata = self.curdata + u'`'
280 self.opentags.append(tag_name)
# Images are delegated to handle_image().
282 elif tag_name == u'img':
283 self.handle_image(attrs)
285 elif tag_name == u'br':
289 # we don't know the tag, so lets avoid handling it!
# Parser callback for self-closing tags (e.g. <br/>, <img/>).  Only br and
# img are examined here; img is delegated to handle_image().
292 def handle_startendtag(self, tag, attrs):
293 if tag.lower() == u'br':
295 elif tag.lower() == u'img':
296 self.handle_image(attrs)
# NOTE(review): the lines between the statement above and the ones below are
# not visible in this listing; the following may belong to a separate
# line-break helper -- confirm.  They flush pending text, push u'br' on the
# open-tag stack and flush again so handle_curdata() sees the break.
300 self.handle_curdata()
301 self.opentags.append(u'br')
302 self.handle_curdata()
# Record an image described by attrs and append a reference to it to
# self.curdata.
#   attrs -- sequence of (name, value) pairs; the src value (and what is
#            presumably the alt value -- its test is not visible here) is
#            decoded from UTF-8 when it is a byte string.
# self.images maps a key to {"url": ...}.  The first group of branches keys
# on the alt text, the last group keys on the URL itself (presumably used
# when no alt text is available -- TODO confirm, the governing condition is
# not visible in this listing).
305 def handle_image(self, attrs):
310 if isinstance(attr[1], str):
311 alt = u'%s' %(attr[1].decode("utf-8"))
314 elif attr[0] == 'src':
315 if isinstance(attr[1], str):
316 url = u'%s' %(attr[1].decode("utf-8"))
# Alt text already registered: the same URL is handled separately from a
# different one; for a clash the loop below runs while the key is taken
# (the statement that changes alt inside it is not visible here).
321 if self.images.has_key(alt):
322 if self.images[alt]["url"] == url:
323 self.curdata = self.curdata \
326 while self.images.has_key(alt):
328 self.images[alt] = {"url": url}
329 self.curdata = self.curdata \
332 self.images[alt] = {"url": url}
333 self.curdata = self.curdata \
336 if self.images.has_key(url):
337 self.curdata = self.curdata \
340 self.images[url] = {}
341 self.images[url]["url"] =url
342 self.curdata = self.curdata \
# Flush the pending text buffer (self.curdata) into the output (self.text),
# formatted according to the tag on top of the open-tag stack
# (self.opentags[-1]): headings are underlined, paragraphs/blockquotes/list
# items/definitions are wrapped to self.textwidth minus the current indent,
# <pre> text is copied as-is and link text is closed with u'`__'.
# NOTE(review): many continuation lines and early exits of this method are
# not visible in this listing; comments describe only the visible
# statements.
345 def handle_curdata(self):
347 if len(self.opentags) == 0:
350 tag_thats_done = self.opentags[-1]
352 if len(self.curdata) == 0:
# A <br> makes sure the output ends with a newline and sets ignorenodata.
355 if tag_thats_done == u'br':
356 if len(self.text) == 0 or self.text[-1] != '\n':
357 self.text = self.text + '\n'
358 self.ignorenodata = True
361 if len(self.curdata.strip()) == 0:
# Vertical spacing before a block: list/definition items only need the
# output to end in one newline, other blocks are separated by a blank line.
364 if tag_thats_done in self.blockleveltags:
365 newlinerequired = self.text != u''
366 if self.ignorenodata:
367 newlinerequired = False
368 self.ignorenodata = False
370 if tag_thats_done in [u'dt', u'dd', u'li'] \
371 and len(self.text) > 1 \
372 and self.text[-1] != u'\n':
373 self.text = self.text + u'\n'
374 elif len(self.text) > 2 \
375 and self.text[-1] != u'\n' \
376 and self.text[-2] != u'\n':
377 self.text = self.text + u'\n\n'
# Headings: whitespace is collapsed, the text is wrapped, and an underline
# built from underlinechar is added.  underlinechar depends on the heading
# level (its assignments are not visible here).
379 if tag_thats_done in ["h1", "h2", "h3", "h4", "h5", "h6"]:
382 headingtext = " ".join(self.curdata.split())
383 seperator = u'\n' + u' '*self.indentlevel
384 headingtext = seperator.join( \
387 self.textwidth - self.indentlevel \
391 if tag_thats_done == u'h2':
393 elif tag_thats_done != u'h1':
# A heading that wrapped onto several lines gets a full-width underline;
# the other visible assignment sizes the underline to the heading text.
396 if u'\n' in headingtext:
397 underline = u' ' * self.indentlevel \
398 + underlinechar * (self.textwidth - self.indentlevel)
400 underline = u' ' * self.indentlevel \
401 + underlinechar * len(headingtext)
402 self.text = self.text \
403 + headingtext + u'\n' \
405 elif tag_thats_done in [u'p', u'div']:
# Paragraphs: collapse runs of whitespace, then wrap at the current indent.
406 paragraph = unicode( \
407 " ".join(self.curdata.strip().encode("utf-8").split()), \
409 seperator = u'\n' + u' ' * self.indentlevel
410 self.text = self.text \
411 + u' ' * self.indentlevel \
414 paragraph, self.textwidth - self.indentlevel))
415 elif tag_thats_done == "pre":
# Preformatted text is appended without re-wrapping.
416 self.text = self.text + unicode( \
417 self.curdata.encode("utf-8"), "utf-8")
418 elif tag_thats_done == u'blockquote':
420 " ".join(self.curdata.encode("utf-8").strip().split()), \
422 seperator = u'\n' + u' ' * self.indentlevel + u' '
423 if len(self.text) > 0 and self.text[-1] != u'\n':
424 self.text = self.text + u'\n'
425 self.text = self.text \
430 self.textwidth - self.indentlevel - 2 \
434 elif tag_thats_done == "li":
# List items: start on a fresh line, then pick a marker depending on the
# enclosing list type found among the last four open tags.
435 item = unicode(self.curdata.encode("utf-8").strip(), "utf-8")
436 if len(self.text) > 0 and self.text[-1] != u'\n':
437 self.text = self.text + u'\n'
438 # work out if we're in an ol rather than a ul
439 latesttags = self.opentags[-4:]
442 for thing in latesttags:
# Ordered items are numbered from the innermost counter, which is then
# advanced for the next item.
456 listmarker = u' %2d. ' %(self.listcount[-1])
457 self.listcount[-1] = self.listcount[-1] + 1
460 + u' ' * self.indentlevel \
462 self.text = self.text \
463 + u' ' * self.indentlevel \
468 self.textwidth - self.indentlevel - listindent \
472 elif tag_thats_done == u'dt':
# Definition terms: whitespace collapsed, indented 4 columns less than the
# current indent and suffixed with "::".
473 definition = unicode(" ".join( \
474 self.curdata.encode("utf-8").strip().split()), \
476 if len(self.text) > 0 and self.text[-1] != u'\n':
477 self.text = self.text + u'\n\n'
478 elif len(self.text) > 1 and self.text[-2] != u'\n':
479 self.text = self.text + u'\n'
480 definition = u' ' * (self.indentlevel - 4) + definition + "::"
481 indentstring = u'\n' + u' ' * (self.indentlevel - 3)
482 self.text = self.text \
484 textwrap.wrap(definition, \
485 self.textwidth - self.indentlevel - 4))
487 elif tag_thats_done == u'dd':
# Definition descriptions: only emitted when non-empty, wrapped at the
# current indent.
488 definition = unicode(" ".join( \
489 self.curdata.encode("utf-8").strip().split()),
491 if len(definition) > 0:
492 if len(self.text) > 0 and self.text[-1] != u'\n':
493 self.text = self.text + u'\n'
494 indentstring = u'\n' + u' ' * self.indentlevel
495 self.text = self.text \
497 + indentstring.join( \
500 self.textwidth - self.indentlevel \
504 elif tag_thats_done == u'a':
# Close the link text opened with a backtick in handle_starttag(); the
# text stays in curdata rather than being written out here.
505 self.curdata = self.curdata + u'`__'
507 elif tag_thats_done in self.liststarttags:
510 if tag_thats_done in self.blockleveltags:
513 self.ignorenodata = False
# Parser callback for a closing tag: clears ignorenodata, undoes the
# indentation/list-counter changes made when a list was opened, flushes
# pending text and trims the open-tag stack.
# NOTE(review): the error handling around the index() lookup and several
# other statements are not visible in this listing.
515 def handle_endtag(self, tag):
516 self.ignorenodata = False
# Position of the first occurrence of tag on the open-tag stack.
521 tagindex = self.opentags.index(tag)
526 if tag in [u'br', u'img']:
530 self.indentlevel = self.indentlevel - 4
# Closing a list: flush its text, then reverse the indent that
# handle_starttag() added for an enclosing list (4 for ol/dl, 3 for ul).
532 if tag in self.liststarttags:
533 if tag in [u'ol', u'dl', u'ul', u'dd']:
534 self.handle_curdata()
535 # find if there was a previous list level
536 smalllist = self.opentags[:-1]
538 for prev_listtag in smalllist:
539 if prev_listtag in [u'ol', u'dl']:
540 self.indentlevel = self.indentlevel - 4
542 elif prev_listtag == u'ul':
543 self.indentlevel = self.indentlevel - 3
# Drop the innermost list counter.
547 self.listcount = self.listcount[:-1]
# Advance tagindex to the last (innermost) occurrence of tag on the stack.
549 while tagindex < len(self.opentags) \
550 and tag in self.opentags[tagindex+1:]:
552 tagindex = self.opentags.index(tag, tagindex+1)
554 # well, we don't want to do that then
# If tag is not on top of the stack, inner tags were left unclosed.
556 if tagindex != len(self.opentags) - 1:
557 # Assuming the data was for the last opened tag first
558 self.handle_curdata()
559 # Now kill the list to be a slice before this tag was opened
560 self.opentags = self.opentags[:tagindex + 1]
562 self.handle_curdata()
563 if self.opentags[-1] == tag:
def handle_data(self, data):
    """Append a run of character data to the pending text buffer.

    Text that arrives while no tag is open is treated as the start of an
    implicit paragraph, so a u'p' entry is pushed on the open-tag stack
    first.

    data may be a UTF-8 encoded byte string or text that is already
    unicode.  Only byte strings are decoded, mirroring the isinstance
    guards in handle_image(); decoding unicode text unconditionally fails
    for non-ASCII input.
    """
    if len(self.opentags) == 0:
        self.opentags.append(u'p')
    # bytes is an alias of str on Python 2.6+, so this decodes exactly the
    # byte strings the parser hands over and leaves unicode untouched.
    if isinstance(data, bytes):
        data = data.decode("utf-8")
    self.curdata = self.curdata + data
# Parser callback for a numeric character reference (the part between
# "&#" and ";").  The visible statements try name as a decimal code point,
# then as hexadecimal by prefixing "0" (so "x41" is parsed as "0x41" in
# base 16), and otherwise keep the literal text u'#<name>'.  The result is
# appended to self.curdata.
# NOTE(review): the try/except lines that select between these statements
# are not visible in this listing.
571 def handle_charref(self, name):
573 entity = unichr(int(name))
577 entity = unichr(int('0%s' %(name,), 16))
579 entity = u'#%s' %(name,)
581 entity = u'#%s' %(name,)
582 self.curdata = self.curdata + unicode(entity.encode('utf-8'), \
# Parser callback for a named entity reference.  Names found in the
# class-level HTML2Text.entities mapping are replaced by the mapped value;
# the other visible assignment re-emits the reference literally as
# "&<name>;".  The result is appended to self.curdata.
585 def handle_entityref(self, name):
587 if HTML2Text.entities.has_key(name):
588 entity = HTML2Text.entities[name]
590 entity = "&" + name + ";"
592 self.curdata = self.curdata + unicode(entity.encode('utf-8'), \
# NOTE(review): the def line of this method is not visible in this listing;
# presumably this is the body of gettext(), which parse_and_deliver() calls
# on HTML2Text instances -- confirm.
# Flush any pending text and make sure the output ends with a newline.
596 self.handle_curdata()
597 if len(self.text) == 0 or self.text[-1] != u'\n':
598 self.text = self.text + u'\n'
# Collapse trailing newlines down to exactly one.
600 if len(self.text) > 0:
601 while len(self.text) > 1 and self.text[-1] == u'\n':
602 self.text = self.text[:-1]
603 self.text = self.text + u'\n'
# Append one u'__ <url>' line per collected link, in the order collected.
604 if len(self.urls) > 0:
605 self.text = self.text + u'\n__ ' + u'\n__ '.join(self.urls) + u'\n'
# Append one "|key| image:: url" entry per recorded image.
607 if len(self.images.keys()) > 0:
608 self.text = self.text + u'\n.. ' \
610 ["|%s| image:: %s" %(a, self.images[a]["url"]) \
611 for a in self.images.keys()]) + u'\n'
# Issue an HTTP(S) request for url and follow redirects.
#   method -- HTTP method string passed to conn.request() (callers in this
#             file use "HEAD" and "GET")
#   url    -- absolute URL; split into scheme, host, port and path with the
#             urllib.split* helpers
# The loop runs while fewer than 3 redirects have been counted.  Statuses
# 301/302/303/307 trigger a search for the "location" response header;
# status 200 is handled separately.
# NOTE(review): the scheme test that chooses between HTTPConnection and
# HTTPSConnection, the default-port handling and the return statements are
# not visible in this listing.
615 def open_url(method, url):
617 while redirectcount < 3:
618 (type, rest) = urllib.splittype(url)
619 (host, path) = urllib.splithost(rest)
620 (host, port) = urllib.splitport(host)
629 conn = httplib.HTTPConnection("%s:%s" %(host, port))
631 conn = httplib.HTTPSConnection("%s:%s" %(host, port))
632 conn.request(method, path)
633 response = conn.getresponse()
634 if response.status in [301, 302, 303, 307]:
635 headers = response.getheaders()
636 for header in headers:
637 if header[0] == "location":
639 elif response.status == 200:
643 redirectcount = redirectcount + 1
# Fetch the feed at url and write one message file per new or changed item
# below maildir.
#   maildir  -- maildir directory; files are created under "tmp" and a path
#               under "new" is computed for the same file name
#   url      -- feed URL; also used as part of the per-item database keys
#   statedir -- directory holding the dbm databases "feeds" (per-feed HTTP
#               header values) and "seen" (per-item records)
# Records are stored url-encoded and read back with cgi.parse_qs(), so
# every field is accessed as data[<name>][0].
# NOTE(review): many lines (try/except, else branches, continue/return
# statements) are not visible in this listing; comments describe only the
# visible statements.
646 def parse_and_deliver(maildir, url, statedir):
649 # first check if we know about this feed already
650 feeddb = dbm.open(os.path.join(statedir, "feeds"), "c")
651 if feeddb.has_key(url):
653 data = cgi.parse_qs(data)
# Known feed: a HEAD request is compared against the stored content-length,
# etag, last-modified and content-md5 values.
654 response = open_url("HEAD", url)
657 headers = response.getheaders()
660 for header in headers:
661 if header[0] == "content-length":
662 if header[1] != data["content-length"][0]:
664 elif header[0] == "etag":
665 if header[1] != data["etag"][0]:
667 elif header[0] == "last-modified":
668 if header[1] != data["last-modified"][0]:
670 elif header[0] == "content-md5":
671 if header[1] != data["content-md5"][0]:
676 response = open_url("GET", url)
678 headers = response.getheaders()
679 feedhandle = response
681 sys.stderr.write("Failed to fetch feed: %s\n" %(url))
684 return # don't need to do anything, nothings changed.
# Second visible fetch path: plain GET (presumably for a feed with no
# stored state -- the governing else is not visible).
686 response = open_url("GET", url)
688 headers = response.getheaders()
689 feedhandle = response
691 sys.stderr.write("Failed to fetch feed: %s\n" %(url))
# Parse the HTTP response object directly with feedparser.
694 fp = feedparser.parse(feedhandle)
695 db = dbm.open(os.path.join(statedir, "seen"), "c")
696 for item in fp["items"]:
697 # have we seen it before?
698 # need to work out what the content is first...
# Item body: the first "content" entry, or the "description" field.
700 if item.has_key("content"):
701 content = item["content"][0]["value"]
703 if item.has_key("description"):
704 content = item["description"]
# The MD5 of the UTF-8 encoded body identifies this version of the item.
708 md5sum = md5.md5(content.encode("utf-8")).hexdigest()
# Database key by link: "<feed url>|<item link>".
713 db_link_key = (url + u'|' + item["link"]).encode("utf-8")
715 # check if there's a guid too - if that exists and we match the md5,
# Database key by guid: "<feed url>|<item guid>".
717 if item.has_key("guid"):
718 db_guid_key = (url + u'|' + item["guid"]).encode("utf-8")
719 if db.has_key(db_guid_key):
720 data = db[db_guid_key]
721 data = cgi.parse_qs(data)
722 if data["contentmd5"][0] == md5sum:
# Lookup by link: remember the stored message-id so the new message can
# reference it.
725 if db.has_key(db_link_key):
726 data = db[db_link_key]
727 data = cgi.parse_qs(data)
728 if data.has_key("message-id"):
729 prevmessageid = data["message-id"][0]
730 if data["contentmd5"][0] == md5sum:
734 author = item["author"]
738 # create a basic email message
739 msg = MIMEMultipart("alternative")
# Message-ID parts visible here: a minute-resolution timestamp, six
# characters drawn from letters and digits, and the local host name.
741 + datetime.datetime.now().strftime("%Y%m%d%H%M") \
745 string.ascii_letters + string.digits \
746 ) for a in range(0,6) \
747 ]) + "@" + socket.gethostname() + ">"
748 msg.add_header("Message-ID", messageid)
749 msg.set_unixfrom("\"%s\" <rss2maildir@localhost>" %(url))
750 msg.add_header("From", "\"%s\" <rss2maildir@localhost>" %(author.encode("utf-8")))
751 msg.add_header("To", "\"%s\" <rss2maildir@localhost>" %(url.encode("utf-8")))
753 msg.add_header("References", prevmessageid)
# Date header: current time in one branch, the item's "updated_parsed"
# time tuple in the other; both are formatted with a fixed "-0000" offset.
754 createddate = datetime.datetime.now() \
755 .strftime("%a, %e %b %Y %T -0000")
757 createddate = datetime.datetime(*item["updated_parsed"][0:6]) \
758 .strftime("%a, %e %b %Y %T -0000")
761 msg.add_header("Date", createddate)
762 msg.add_header("X-rss2maildir-rundate", datetime.datetime.now() \
763 .strftime("%a, %e %b %Y %T -0000"))
# Subject: the item title run through HTML2Text to strip markup.
764 subj_gen = HTML2Text()
765 title = item["title"]
# NOTE(review): as shown, both substitutions replace a character with
# itself; presumably the patterns were HTML entities in the real file --
# confirm.
766 title = re.sub(u'<', u'<', title)
767 title = re.sub(u'>', u'>', title)
768 subj_gen.feed(title.encode("utf-8"))
769 msg.add_header("Subject", subj_gen.gettext())
770 msg.set_default_type("text/plain")
# HTML alternative: the item body followed by a link back to the item.
772 htmlcontent = content.encode("utf-8")
773 htmlcontent = "%s\n\n<p>Item URL: <a href='%s'>%s</a></p>" %( \
# NOTE(review): htmlcontent was built from already-encoded bytes above;
# calling encode() on a byte string in Python 2 implies an ASCII decode
# first and can raise for non-ASCII content -- confirm.
777 htmlpart = MIMEText(htmlcontent.encode("utf-8"), "html", "utf-8")
# Plain-text alternative: the same body converted by HTML2Text.
778 textparser = HTML2Text()
779 textparser.feed(content.encode("utf-8"))
780 textcontent = textparser.gettext()
781 textcontent = "%s\n\nItem URL: %s" %( \
784 textpart = MIMEText(textcontent.encode("utf-8"), "plain", "utf-8")
788 # start by working out the filename we should be writting to, we do
789 # this following the normal maildir style rules
# File name parts visible here: pid, host name, ten characters drawn from
# letters and digits, and a strftime('%s') timestamp.
# NOTE(review): '%s' is not a documented Python strftime directive; it
# looks like this relies on the platform C library -- confirm.
790 fname = str(os.getpid()) \
791 + "." + socket.gethostname() \
794 string.ascii_letters + string.digits \
795 ) for a in range(0,10) \
797 + datetime.datetime.now().strftime('%s')
798 fn = os.path.join(maildir, "tmp", fname)
800 fh.write(msg.as_string())
802 # now move it in to the new directory
803 newfn = os.path.join(maildir, "new", fname)
807 # now add to the database about the item
# The stored message-id accumulates earlier ids, space separated.
809 messageid = prevmessageid + " " + messageid
# Items whose guid differs from their link get a record under the guid key
# as well as the link key handling below.
810 if item.has_key("guid") and item["guid"] != item["link"]:
811 data = urllib.urlencode(( \
812 ("message-id", messageid), \
813 ("created", createddate), \
814 ("contentmd5", md5sum) \
816 db[db_guid_key] = data
818 data = db[db_link_key]
819 data = cgi.parse_qs(data)
820 newdata = urllib.urlencode(( \
821 ("message-id", messageid), \
822 ("created", data["created"][0]), \
823 ("contentmd5", data["contentmd5"][0]) \
825 db[db_link_key] = newdata
827 db[db_link_key] = data
829 data = urllib.urlencode(( \
830 ("message-id", messageid), \
831 ("created", createddate), \
832 ("contentmd5", md5sum) \
834 db[db_link_key] = data
# Collect the response headers used for change detection and url-encode
# them (presumably stored in feeddb under url -- the store is not visible).
838 for header in headers:
840 ["content-md5", "etag", "last-modified", "content-length"]:
841 data.append((header[0], header[1]))
843 data = urllib.urlencode(data)
# Script entry point: parse the command line, locate and read the config
# file, make sure the state directory and maildir root exist, then call
# parse_and_deliver() once per feed section.
# NOTE(review): the try/except/else lines and the sys.exit() calls of this
# script are not visible in this listing; comments describe only the
# visible statements.
849 if __name__ == "__main__":
850 # This only gets executed if we really called the program
851 # first off, parse the command line arguments
853 oparser = OptionParser()
855 "-c", "--conf", dest="conf",
856 help="location of config file"
859 "-s", "--statedir", dest="statedir",
860 help="location of directory to store state in"
863 (options, args) = oparser.parse_args()
865 # check for the configfile
# An explicit --conf path is probed with os.stat().
869 if options.conf != None:
870 # does the file exist?
872 os.stat(options.conf)
873 configfile = options.conf
875 # should exit here as the specified file doesn't exist
877 "Config file %s does not exist. Exiting.\n" %(options.conf,))
880 # check through the default locations
# Default locations: $HOME/.rss2maildir.conf, then /etc/rss2maildir.conf.
882 os.stat("%s/.rss2maildir.conf" %(os.environ["HOME"],))
883 configfile = "%s/.rss2maildir.conf" %(os.environ["HOME"],)
886 os.stat("/etc/rss2maildir.conf")
887 configfile = "/etc/rss2maildir.conf"
889 sys.stderr.write("No config file found. Exiting.\n")
892 # Right - if we've got this far, we've got a config file, now for the hard
895 scp = SafeConfigParser()
# Default maildir root; the "general" section may override it below.
898 maildir_root = "RSSMaildir"
# State directory: the --statedir option is checked before the
# [general] state_dir setting.
901 if options.statedir != None:
902 state_dir = options.statedir
904 mode = os.stat(state_dir)[stat.ST_MODE]
905 if not stat.S_ISDIR(mode):
907 "State directory (%s) is not a directory\n" %(state_dir))
910 # try to make the directory
# NOTE(review): unlike the other messages, this one has no trailing newline.
914 sys.stderr.write("Couldn't create statedir %s" %(state_dir))
916 elif scp.has_option("general", "state_dir"):
917 new_state_dir = scp.get("general", "state_dir")
919 mode = os.stat(new_state_dir)[stat.ST_MODE]
920 if not stat.S_ISDIR(mode):
# NOTE(review): the directory tested above is new_state_dir, but the message
# below reports state_dir -- looks like the wrong variable; confirm.
922 "State directory (%s) is not a directory\n" %(state_dir))
925 state_dir = new_state_dir
929 os.mkdir(new_state_dir)
930 state_dir = new_state_dir
933 "Couldn't create state directory %s\n" %(new_state_dir))
937 mode = os.stat(state_dir)[stat.ST_MODE]
938 if not stat.S_ISDIR(mode):
940 "State directory %s is not a directory\n" %(state_dir))
947 "State directory %s could not be created\n" %(state_dir))
# Maildir root: taken from [general] maildir_root when set, then checked
# with os.stat() and created with os.mkdir().
950 if scp.has_option("general", "maildir_root"):
951 maildir_root = scp.get("general", "maildir_root")
954 mode = os.stat(maildir_root)[stat.ST_MODE]
955 if not stat.S_ISDIR(mode):
957 "Maildir Root %s is not a directory\n" \
962 os.mkdir(maildir_root)
964 sys.stderr.write("Couldn't create Maildir Root %s\n" \
# Every section except "general" is a feed; the section name is passed to
# parse_and_deliver() as the feed URL.
968 feeds = scp.sections()
970 feeds.remove("general")
974 for section in feeds:
975 # check if the directory exists
978 maildir = scp.get(section, "maildir")
# URL-encode the maildir name (the text after "=" in the encoded pair)
# before joining it onto the maildir root.
982 maildir = urllib.urlencode(((section, maildir),)).split("=")[1]
983 maildir = os.path.join(maildir_root, maildir)
986 exists = os.stat(maildir)
987 if stat.S_ISDIR(exists[stat.ST_MODE]):
988 # check if there's a new, cur and tmp directory
# Each of cur, tmp and new is probed with os.stat(); os.mkdir() calls and
# "Broken maildir" reports for non-directories are visible for each.
990 mode = os.stat(os.path.join(maildir, "cur"))[stat.ST_MODE]
992 os.mkdir(os.path.join(maildir, "cur"))
993 if not stat.S_ISDIR(mode):
994 sys.stderr.write("Broken maildir: %s\n" %(maildir))
996 mode = os.stat(os.path.join(maildir, "tmp"))[stat.ST_MODE]
998 os.mkdir(os.path.join(maildir, "tmp"))
999 if not stat.S_ISDIR(mode):
1000 sys.stderr.write("Broken maildir: %s\n" %(maildir))
1002 mode = os.stat(os.path.join(maildir, "new"))[stat.ST_MODE]
1003 if not stat.S_ISDIR(mode):
1004 sys.stderr.write("Broken maildir: %s\n" %(maildir))
1006 os.mkdir(os.path.join(maildir, "new"))
1008 sys.stderr.write("Broken maildir: %s\n" %(maildir))
1013 sys.stderr.write("Couldn't create root maildir %s\n" \
# Creation of the new/cur/tmp sub-directories.
1017 os.mkdir(os.path.join(maildir, "new"))
1018 os.mkdir(os.path.join(maildir, "cur"))
1019 os.mkdir(os.path.join(maildir, "tmp"))
1022 "Couldn't create required maildir directories for %s\n" \
1026 # right - we've got the directories, we've got the section, we know the
1029 parse_and_deliver(maildir, section, state_dir)