# rss2maildir.py - RSS feeds to Maildir 1 email per item
# Copyright (C) 2007 Brett Parker <iDunno@sommitrealweird.co.uk>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
# standard library modules used below
import sys, os, stat, re, cgi, dbm
import socket, string, random, datetime, textwrap
import urllib, httplib

import feedparser

from email.MIMEMultipart import MIMEMultipart
from email.MIMEText import MIMEText

from optparse import OptionParser
from ConfigParser import SafeConfigParser

from base64 import b64encode

# the md5 module is deprecated from python 2.6 onwards - use hashlib there
if sys.version_info[0] == 2 and sys.version_info[1] >= 6:
    import hashlib as md5
else:
    import md5

from HTMLParser import HTMLParser
class HTML2Text(HTMLParser):

    def __init__(self, textwidth=70):
        self.text = u''
        self.curdata = u''
        self.textwidth = textwidth
        self.opentags = []
        self.urls = []
        self.images = {}
        self.listcount = []
        self.indentlevel = 0
        self.ignorenodata = False
        HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        tag_name = tag.lower()
        if tag_name in self.blockleveltags:
            # handle starting a new block - unless we're in a block element
            # that can contain other blocks, we'll assume that we want to close
            # the previous block
            if len(self.opentags) > 1 and self.opentags[-1] == u'li':
                self.handle_curdata()

            if tag_name == u'ol':
                self.handle_curdata()
                self.listcount.append(1)
                self.listlevel = len(self.listcount) - 1

            if tag_name == u'dl':
                self.indentlevel = self.indentlevel + 4

            if tag_name in self.liststarttags:
                smallist = self.opentags[-3:-1]
                for prev_listtag in smallist:
                    if prev_listtag in [u'dl', u'ol']:
                        self.indentlevel = self.indentlevel + 4
                    elif prev_listtag == u'ul':
                        self.indentlevel = self.indentlevel + 3

            if len(self.opentags) > 0:
                self.handle_curdata()
                if tag_name not in self.cancontainflow:
                    self.opentags.pop()
            self.opentags.append(tag_name)
        else:
            if tag_name == "span":
                # spans are purely presentational - ignore them
                return

            listcount = 0
            if len(self.listcount) > 0:
                listcount = self.listcount[-1]

            if tag_name == u'dd' and len(self.opentags) > 1 \
                and self.opentags[-1] == u'dt':
                self.handle_curdata()
                self.opentags.pop()
            elif tag_name == u'dt' and len(self.opentags) > 1 \
                and self.opentags[-1] == u'dd':
                self.handle_curdata()
                self.opentags.pop()
            elif tag_name == u'a':
                for attr in attrs:
                    if attr[0].lower() == u'href':
                        self.urls.append(attr[1].decode('utf-8'))
                self.curdata = self.curdata + u'`'
                self.opentags.append(tag_name)
            elif tag_name == u'img':
                self.handle_image(attrs)
            elif tag_name == u'br':
                self.handle_br()
            else:
                # we don't know the tag, so let's avoid handling it!
                return

    def handle_startendtag(self, tag, attrs):
        if tag.lower() == u'br':
            self.handle_br()
        elif tag.lower() == u'img':
            self.handle_image(attrs)

    def handle_br(self):
        self.handle_curdata()
        self.opentags.append(u'br')
        self.handle_curdata()
        self.opentags.pop()

    def handle_image(self, attrs):
        alt = u''
        url = u''
        for attr in attrs:
            if attr[0] == 'alt':
                alt = attr[1].decode('utf-8')
            elif attr[0] == 'src':
                url = attr[1].decode('utf-8')

        if alt:
            if self.images.has_key(alt):
                if self.images[alt]["url"] == url:
                    self.curdata = self.curdata \
                        + u'|%s|' %(alt,)
                else:
                    while self.images.has_key(alt):
                        alt = alt + "_"
                    self.images[alt] = {"url": url}
                    self.curdata = self.curdata \
                        + u'|%s|' %(alt,)
            else:
                self.images[alt] = {"url": url}
                self.curdata = self.curdata \
                    + u'|%s|' %(alt,)
        else:
            if self.images.has_key(url):
                self.curdata = self.curdata \
                    + u'|%s|' %(url,)
            else:
                self.images[url] = {}
                self.images[url]["url"] = url
                self.curdata = self.curdata \
                    + u'|%s|' %(url,)
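
    # For example (values illustrative): <img src="http://example.com/pic.png"
    # alt="logo"> adds the inline reference "|logo|" to curdata, and gettext()
    # later emits the matching ".. |logo| image:: http://example.com/pic.png"
    # substitution definition.
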
    def handle_curdata(self):

        if len(self.opentags) == 0:
            return

        tag_thats_done = self.opentags[-1]

        if len(self.curdata) == 0:
            return

        if tag_thats_done == u'br':
            if len(self.text) == 0 or self.text[-1] != '\n':
                self.text = self.text + '\n'
                self.ignorenodata = True
            return

        if len(self.curdata.strip()) == 0:
            return

        if tag_thats_done in self.blockleveltags:
            newlinerequired = self.text != u''
            if self.ignorenodata:
                newlinerequired = False
            self.ignorenodata = False
            if newlinerequired:
                if tag_thats_done in [u'dt', u'dd', u'li'] \
                    and len(self.text) > 1 \
                    and self.text[-1] != u'\n':
                    self.text = self.text + u'\n'
                elif len(self.text) > 2 \
                    and self.text[-1] != u'\n' \
                    and self.text[-2] != u'\n':
                    self.text = self.text + u'\n\n'

        if tag_thats_done in ["h1", "h2", "h3", "h4", "h5", "h6"]:
            underlinechar = u'='
            headingtext = " ".join(self.curdata.split())
            seperator = u'\n' + u' '*self.indentlevel
            headingtext = seperator.join( \
                textwrap.wrap( \
                    headingtext, \
                    self.textwidth - self.indentlevel \
                    ) \
                )

            if tag_thats_done == u'h2':
                underlinechar = u'-'
            elif tag_thats_done != u'h1':
                underlinechar = u'~'

            if u'\n' in headingtext:
                underline = u' ' * self.indentlevel \
                    + underlinechar * (self.textwidth - self.indentlevel)
            else:
                underline = u' ' * self.indentlevel \
                    + underlinechar * len(headingtext)
            self.text = self.text \
                + headingtext + u'\n' \
                + underline + u'\n'
        elif tag_thats_done in [u'p', u'div']:
            paragraph = unicode( \
                " ".join(self.curdata.strip().encode("utf-8").split()), \
                "utf-8")
            seperator = u'\n' + u' ' * self.indentlevel
            self.text = self.text \
                + u' ' * self.indentlevel \
                + seperator.join( \
                    textwrap.wrap( \
                        paragraph, self.textwidth - self.indentlevel))
        elif tag_thats_done == "pre":
            self.text = self.text + unicode( \
                self.curdata.encode("utf-8"), "utf-8")
        elif tag_thats_done == u'blockquote':
            quote = unicode( \
                " ".join(self.curdata.encode("utf-8").strip().split()), \
                "utf-8")
            seperator = u'\n' + u' ' * self.indentlevel + u'  '
            if len(self.text) > 0 and self.text[-1] != u'\n':
                self.text = self.text + u'\n'
            self.text = self.text \
                + u' ' * self.indentlevel \
                + u'  ' \
                + seperator.join( \
                    textwrap.wrap( \
                        quote, \
                        self.textwidth - self.indentlevel - 2 \
                        ) \
                    ) \
                + u'\n'
        elif tag_thats_done == "li":
            item = unicode(self.curdata.encode("utf-8").strip(), "utf-8")
            if len(self.text) > 0 and self.text[-1] != u'\n':
                self.text = self.text + u'\n'
            # work out if we're in an ol rather than a ul
            isolatest = False
            latesttags = self.opentags[-4:]
            latesttags.reverse()
            for thing in latesttags:
                if thing == u'ol':
                    isolatest = True
                    break
                elif thing == u'ul':
                    break

            if isolatest:
                listmarker = u' %2d. ' %(self.listcount[-1])
                self.listcount[-1] = self.listcount[-1] + 1
            else:
                listmarker = u' * '
            listindent = len(listmarker)

            seperator = u'\n' \
                + u' ' * self.indentlevel \
                + u' ' * listindent
            self.text = self.text \
                + u' ' * self.indentlevel \
                + listmarker \
                + seperator.join( \
                    textwrap.wrap( \
                        item, \
                        self.textwidth - self.indentlevel - listindent \
                        ) \
                    ) \
                + u'\n'
        elif tag_thats_done == u'dt':
            definition = unicode(" ".join( \
                self.curdata.encode("utf-8").strip().split()), \
                "utf-8")
            if len(self.text) > 0 and self.text[-1] != u'\n':
                self.text = self.text + u'\n\n'
            elif len(self.text) > 1 and self.text[-2] != u'\n':
                self.text = self.text + u'\n'
            definition = u' ' * (self.indentlevel - 4) + definition + "::"
            indentstring = u'\n' + u' ' * (self.indentlevel - 3)
            self.text = self.text \
                + indentstring.join( \
                    textwrap.wrap(definition, \
                        self.textwidth - self.indentlevel - 4))
        elif tag_thats_done == u'dd':
            definition = unicode(" ".join( \
                self.curdata.encode("utf-8").strip().split()),
                "utf-8")
            if len(definition) > 0:
                if len(self.text) > 0 and self.text[-1] != u'\n':
                    self.text = self.text + u'\n'
                indentstring = u'\n' + u' ' * self.indentlevel
                self.text = self.text \
                    + u' ' * self.indentlevel \
                    + indentstring.join( \
                        textwrap.wrap( \
                            definition, \
                            self.textwidth - self.indentlevel \
                            ) \
                        ) \
                    + u'\n'
        elif tag_thats_done == u'a':
            self.curdata = self.curdata + u'`__'
        elif tag_thats_done in self.liststarttags:
            pass

        if tag_thats_done in self.blockleveltags:
            self.curdata = u''

        self.ignorenodata = False
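
    # Roughly, block-level markup becomes reStructuredText-style plain text;
    # for example (example input only) "<h2>News</h2><ul><li>first item</li></ul>"
    # comes out as:
    #
    #   News
    #   ----
    #    * first item
    #
    # with wrapping and indentation controlled by textwidth and indentlevel.
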
    def handle_endtag(self, tag):
        self.ignorenodata = False
        if tag not in self.opentags:
            # we never opened this tag (spans and unknown tags are skipped
            # by handle_starttag), so there's nothing to flush
            return

        tagindex = self.opentags.index(tag)

        if tag in [u'br', u'img']:
            return

        if tag == u'dl':
            self.indentlevel = self.indentlevel - 4

        if tag in self.liststarttags:
            if tag in [u'ol', u'dl', u'ul', u'dd']:
                self.handle_curdata()
                # find if there was a previous list level
                smalllist = self.opentags[:-1]
                smalllist.reverse()
                for prev_listtag in smalllist:
                    if prev_listtag in [u'ol', u'dl']:
                        self.indentlevel = self.indentlevel - 4
                        break
                    elif prev_listtag == u'ul':
                        self.indentlevel = self.indentlevel - 3
                        break

        if tag == u'ol':
            self.listcount = self.listcount[:-1]

        while tagindex < len(self.opentags) \
            and tag in self.opentags[tagindex+1:]:
            try:
                tagindex = self.opentags.index(tag, tagindex+1)
            except:
                # well, we don't want to do that then
                break
        if tagindex != len(self.opentags) - 1:
            # Assuming the data was for the last opened tag first
            self.handle_curdata()
            # Now kill the list to be a slice before this tag was opened
            self.opentags = self.opentags[:tagindex + 1]
        else:
            self.handle_curdata()
            if self.opentags[-1] == tag:
                self.opentags.pop()

    def handle_data(self, data):
        if len(self.opentags) == 0:
            self.opentags.append(u'p')
        self.curdata = self.curdata + data.decode("utf-8")

    def handle_charref(self, name):
        try:
            entity = unichr(int(name))
        except ValueError:
            if name[0].lower() == 'x':
                try:
                    entity = unichr(int('0%s' %(name,), 16))
                except ValueError:
                    entity = u'#%s' %(name,)
            else:
                entity = u'#%s' %(name,)
        self.curdata = self.curdata + unicode(entity.encode('utf-8'), \
            "utf-8")

    def handle_entityref(self, name):
        if HTML2Text.entities.has_key(name):
            entity = HTML2Text.entities[name]
        else:
            entity = "&" + name + ";"

        self.curdata = self.curdata + unicode(entity.encode('utf-8'), \
            "utf-8")

    def gettext(self):
        self.handle_curdata()
        if len(self.text) == 0 or self.text[-1] != u'\n':
            self.text = self.text + u'\n'

        if len(self.text) > 0:
            while len(self.text) > 1 and self.text[-1] == u'\n':
                self.text = self.text[:-1]
            self.text = self.text + u'\n'
        if len(self.urls) > 0:
            self.text = self.text + u'\n__ ' + u'\n__ '.join(self.urls) + u'\n'
        if len(self.images.keys()) > 0:
            self.text = self.text + u'\n.. ' \
                + u'\n.. '.join( \
                    ["|%s| image:: %s" %(a, self.images[a]["url"]) \
                    for a in self.images.keys()]) + u'\n'
        return self.text
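
# A minimal usage sketch of HTML2Text (markup and URL below are examples
# only; the conversion relies on the class-level tag/entity tables such as
# blockleveltags, liststarttags, cancontainflow and entities):
#
#   converter = HTML2Text(textwidth=70)
#   converter.feed("<h1>Title</h1><p>Hello <a href='http://example.com/'>world</a></p>")
#   print converter.gettext()
#
# which yields an underlined heading, the paragraph with a `world`__ link
# reference, and a trailing "__ http://example.com/" target line.
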
def open_url(method, url):
    redirectcount = 0
    while redirectcount < 3:
        (type, rest) = urllib.splittype(url)
        (host, path) = urllib.splithost(rest)
        (host, port) = urllib.splitport(host)

        if port == None:
            if type == "https":
                port = 443
            else:
                port = 80

        if type == "http":
            conn = httplib.HTTPConnection("%s:%s" %(host, port))
        else:
            conn = httplib.HTTPSConnection("%s:%s" %(host, port))
        conn.request(method, path)
        response = conn.getresponse()
        if response.status in [301, 302, 303, 307]:
            headers = response.getheaders()
            for header in headers:
                if header[0] == "location":
                    url = header[1]
        elif response.status == 200:
            return response
        redirectcount = redirectcount + 1
    return None
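
# open_url follows HTTP redirects (up to three attempts) and returns the
# httplib response once it gets a 200; anything else ends up as None.
# A sketch of typical use (the URL is just a placeholder):
#
#   response = open_url("HEAD", "http://example.com/feed.xml")
#   if response:
#       headers = response.getheaders()
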
def parse_and_deliver(maildir, url, statedir):
    feedhandle = None
    headers = None
    # first check if we know about this feed already
    feeddb = dbm.open(os.path.join(statedir, "feeds"), "c")
    if feeddb.has_key(url):
        data = feeddb[url]
        data = cgi.parse_qs(data)
        response = open_url("HEAD", url)
        headers = None
        if response:
            headers = response.getheaders()
        ischanged = False
        try:
            for header in headers:
                if header[0] == "content-length":
                    if header[1] != data["content-length"][0]:
                        ischanged = True
                elif header[0] == "etag":
                    if header[1] != data["etag"][0]:
                        ischanged = True
                elif header[0] == "last-modified":
                    if header[1] != data["last-modified"][0]:
                        ischanged = True
                elif header[0] == "content-md5":
                    if header[1] != data["content-md5"][0]:
                        ischanged = True
        except:
            ischanged = True
        if ischanged:
            response = open_url("GET", url)
            if response != None:
                headers = response.getheaders()
                feedhandle = response
            else:
                sys.stderr.write("Failed to fetch feed: %s\n" %(url))
                return
        else:
            return # don't need to do anything, nothing's changed.
    else:
        response = open_url("GET", url)
        if response != None:
            headers = response.getheaders()
            feedhandle = response
        else:
            sys.stderr.write("Failed to fetch feed: %s\n" %(url))
            return

    fp = feedparser.parse(feedhandle)
    db = dbm.open(os.path.join(statedir, "seen"), "c")
    for item in fp["items"]:
        # have we seen it before?
        # need to work out what the content is first...
        if item.has_key("content"):
            content = item["content"][0]["value"]
        else:
            if item.has_key("description"):
                content = item["description"]
            else:
                content = u''

        md5sum = md5.md5(content.encode("utf-8")).hexdigest()

        prevmessageid = None

        db_link_key = (url + u'|' + item["link"]).encode("utf-8")

        # check if there's a guid too - if that exists and we match the md5,
        # we've already seen and delivered this item
        if item.has_key("guid"):
            db_guid_key = (url + u'|' + item["guid"]).encode("utf-8")
            if db.has_key(db_guid_key):
                data = db[db_guid_key]
                data = cgi.parse_qs(data)
                if data["contentmd5"][0] == md5sum:
                    continue

        if db.has_key(db_link_key):
            data = db[db_link_key]
            data = cgi.parse_qs(data)
            if data.has_key("message-id"):
                prevmessageid = data["message-id"][0]
            if data["contentmd5"][0] == md5sum:
                continue

        try:
            author = item["author"]
        except:
            author = u''

        # create a basic email message
        msg = MIMEMultipart("alternative")
        messageid = "<" \
            + datetime.datetime.now().strftime("%Y%m%d%H%M") \
            + "." \
            + "".join( \
                [random.choice( \
                    string.ascii_letters + string.digits \
                    ) for a in range(0,6) \
                ]) + "@" + socket.gethostname() + ">"
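        # e.g. "<200712312359.k3J9xQ@feedhost>" - timestamp, six random
        # characters and the local hostname (example values only)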
        msg.add_header("Message-ID", messageid)
        msg.set_unixfrom("\"%s\" <rss2maildir@localhost>" %(url))
        msg.add_header("From", "\"%s\" <rss2maildir@localhost>" %(author))
        msg.add_header("To", "\"%s\" <rss2maildir@localhost>" %(url))
        if prevmessageid:
            msg.add_header("References", prevmessageid)
        createddate = datetime.datetime.now() \
            .strftime("%a, %e %b %Y %T -0000")
        try:
            createddate = datetime.datetime(*item["updated_parsed"][0:6]) \
                .strftime("%a, %e %b %Y %T -0000")
        except:
            pass
        msg.add_header("Date", createddate)
        msg.add_header("X-rss2maildir-rundate", datetime.datetime.now() \
            .strftime("%a, %e %b %Y %T -0000"))
        subj_gen = HTML2Text()
        title = item["title"]
        title = re.sub(u'<', u'&lt;', title)
        title = re.sub(u'>', u'&gt;', title)
        subj_gen.feed(title.encode("utf-8"))
        msg.add_header("Subject", subj_gen.gettext())
        msg.set_default_type("text/plain")

        htmlcontent = content.encode("utf-8")
        htmlcontent = "%s\n\n<p>Item URL: <a href='%s'>%s</a></p>" %( \
            htmlcontent, \
            item["link"], \
            item["link"])
        htmlpart = MIMEText(htmlcontent.encode("utf-8"), "html", "utf-8")
        textparser = HTML2Text()
        textparser.feed(content.encode("utf-8"))
        textcontent = textparser.gettext()
        textcontent = "%s\n\nItem URL: %s" %( \
            textcontent, \
            item["link"])
        textpart = MIMEText(textcontent.encode("utf-8"), "plain", "utf-8")
        msg.attach(textpart)
        msg.attach(htmlpart)

        # start by working out the filename we should be writing to, we do
        # this following the normal maildir style rules
        fname = str(os.getpid()) \
            + "." + socket.gethostname() \
            + "." + "".join( \
                [random.choice( \
                    string.ascii_letters + string.digits \
                    ) for a in range(0,10) \
                ]) + "." \
            + datetime.datetime.now().strftime('%s')
        fn = os.path.join(maildir, "tmp", fname)
        fh = open(fn, "w")
        fh.write(msg.as_string())
        fh.close()
        # now move it into the new directory
        newfn = os.path.join(maildir, "new", fname)
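        # the message was written out under tmp/; linking it into new/ and
        # removing the tmp copy completes a standard maildir delivery, so
        # readers never see a partially written mail.  The filename
        # (pid.hostname.random.timestamp) keeps it unique, e.g.
        # "12345.feedhost.aBcDeFgHiJ.1199145600" (example values only)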
        os.link(fn, newfn)
        os.unlink(fn)

        # now add to the database about the item
        if prevmessageid:
            messageid = prevmessageid + " " + messageid
        if item.has_key("guid") and item["guid"] != item["link"]:
            data = urllib.urlencode(( \
                ("message-id", messageid), \
                ("created", createddate), \
                ("contentmd5", md5sum) \
                ))
            db[db_guid_key] = data
            try:
                data = db[db_link_key]
                data = cgi.parse_qs(data)
                newdata = urllib.urlencode(( \
                    ("message-id", messageid), \
                    ("created", data["created"][0]), \
                    ("contentmd5", data["contentmd5"][0]) \
                    ))
                db[db_link_key] = newdata
            except:
                db[db_link_key] = data
        else:
            data = urllib.urlencode(( \
                ("message-id", messageid), \
                ("created", createddate), \
                ("contentmd5", md5sum) \
                ))
            db[db_link_key] = data

    # remember the validators from the response headers so the next run can
    # skip feeds that haven't changed
    data = []
    for header in headers:
        if header[0] in \
            ["content-md5", "etag", "last-modified", "content-length"]:
            data.append((header[0], header[1]))

    data = urllib.urlencode(data)
    feeddb[url] = data
    db.close()
    feeddb.close()
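
    # Both the "feeds" and "seen" databases store flat urlencoded strings that
    # round-trip through cgi.parse_qs, for example (illustrative values only):
    #
    #   urllib.urlencode((("etag", '"abc123"'), ("content-length", "1234")))
    #       -> 'etag=%22abc123%22&content-length=1234'
    #   cgi.parse_qs('etag=%22abc123%22&content-length=1234')
    #       -> {'etag': ['"abc123"'], 'content-length': ['1234']}
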
if __name__ == "__main__":
    # This only gets executed if we really called the program
    # first off, parse the command line arguments
    oparser = OptionParser()
    oparser.add_option(
        "-c", "--conf", dest="conf",
        help="location of config file"
        )
    oparser.add_option(
        "-s", "--statedir", dest="statedir",
        help="location of directory to store state in"
        )
    (options, args) = oparser.parse_args()

    # check for the configfile
    configfile = None
    if options.conf != None:
        # does the file exist?
        try:
            os.stat(options.conf)
            configfile = options.conf
        except:
            # should exit here as the specified file doesn't exist
            sys.stderr.write( \
                "Config file %s does not exist. Exiting.\n" %(options.conf,))
            sys.exit(2)
    else:
        # check through the default locations
        try:
            os.stat("%s/.rss2maildir.conf" %(os.environ["HOME"],))
            configfile = "%s/.rss2maildir.conf" %(os.environ["HOME"],)
        except:
            try:
                os.stat("/etc/rss2maildir.conf")
                configfile = "/etc/rss2maildir.conf"
            except:
                sys.stderr.write("No config file found. Exiting.\n")
                sys.exit(2)

    # Right - if we've got this far, we've got a config file, now for the hard
    # bit - reading it and setting everything up
    scp = SafeConfigParser()
    scp.read(configfile)

    maildir_root = "RSSMaildir"
    state_dir = "state"

    if options.statedir != None:
        state_dir = options.statedir
        try:
            mode = os.stat(state_dir)[stat.ST_MODE]
            if not stat.S_ISDIR(mode):
                sys.stderr.write( \
                    "State directory (%s) is not a directory\n" %(state_dir))
                sys.exit(1)
        except:
            # try to make the directory
            try:
                os.mkdir(state_dir)
            except:
                sys.stderr.write("Couldn't create statedir %s\n" %(state_dir))
                sys.exit(1)
    elif scp.has_option("general", "state_dir"):
        new_state_dir = scp.get("general", "state_dir")
        try:
            mode = os.stat(new_state_dir)[stat.ST_MODE]
            if not stat.S_ISDIR(mode):
                sys.stderr.write( \
                    "State directory (%s) is not a directory\n" %(state_dir))
                sys.exit(1)
            else:
                state_dir = new_state_dir
        except:
            # try to create it
            try:
                os.mkdir(new_state_dir)
                state_dir = new_state_dir
            except:
                sys.stderr.write( \
                    "Couldn't create state directory %s\n" %(new_state_dir))
                sys.exit(1)
    else:
        try:
            mode = os.stat(state_dir)[stat.ST_MODE]
            if not stat.S_ISDIR(mode):
                sys.stderr.write( \
                    "State directory %s is not a directory\n" %(state_dir))
                sys.exit(1)
        except:
            try:
                os.mkdir(state_dir)
            except:
                sys.stderr.write( \
                    "State directory %s could not be created\n" %(state_dir))
                sys.exit(1)

    if scp.has_option("general", "maildir_root"):
        maildir_root = scp.get("general", "maildir_root")

    try:
        mode = os.stat(maildir_root)[stat.ST_MODE]
        if not stat.S_ISDIR(mode):
            sys.stderr.write( \
                "Maildir Root %s is not a directory\n" \
                %(maildir_root,))
            sys.exit(1)
    except:
        try:
            os.mkdir(maildir_root)
        except:
            sys.stderr.write("Couldn't create Maildir Root %s\n" \
                %(maildir_root,))
            sys.exit(1)

    feeds = scp.sections()
    try:
        feeds.remove("general")
    except:
        pass
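
    # A minimal config file looks something like this - the section name is
    # the feed URL and the optional "maildir" value overrides the mailbox
    # name (paths and URL below are examples only):
    #
    #   [general]
    #   state_dir = /home/user/.rss2maildir/state
    #   maildir_root = /home/user/RSSMaildir
    #
    #   [http://example.com/feed.xml]
    #   maildir = example-feed
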
    for section in feeds:
        # check if the directory exists
        maildir = None
        try:
            maildir = scp.get(section, "maildir")
        except:
            maildir = section
        maildir = urllib.urlencode(((section, maildir),)).split("=")[1]
        maildir = os.path.join(maildir_root, maildir)

        try:
            exists = os.stat(maildir)
            if stat.S_ISDIR(exists[stat.ST_MODE]):
                # check if there's a new, cur and tmp directory
                try:
                    mode = os.stat(os.path.join(maildir, "cur"))[stat.ST_MODE]
                except:
                    os.mkdir(os.path.join(maildir, "cur"))
                    mode = os.stat(os.path.join(maildir, "cur"))[stat.ST_MODE]
                if not stat.S_ISDIR(mode):
                    sys.stderr.write("Broken maildir: %s\n" %(maildir))
                try:
                    mode = os.stat(os.path.join(maildir, "tmp"))[stat.ST_MODE]
                except:
                    os.mkdir(os.path.join(maildir, "tmp"))
                    mode = os.stat(os.path.join(maildir, "tmp"))[stat.ST_MODE]
                if not stat.S_ISDIR(mode):
                    sys.stderr.write("Broken maildir: %s\n" %(maildir))
                try:
                    mode = os.stat(os.path.join(maildir, "new"))[stat.ST_MODE]
                    if not stat.S_ISDIR(mode):
                        sys.stderr.write("Broken maildir: %s\n" %(maildir))
                except:
                    os.mkdir(os.path.join(maildir, "new"))
            else:
                sys.stderr.write("Broken maildir: %s\n" %(maildir))
        except:
            try:
                os.mkdir(maildir)
            except:
                sys.stderr.write("Couldn't create root maildir %s\n" \
                    %(maildir,))
                continue
            try:
                os.mkdir(os.path.join(maildir, "new"))
                os.mkdir(os.path.join(maildir, "cur"))
                os.mkdir(os.path.join(maildir, "tmp"))
            except:
                sys.stderr.write( \
                    "Couldn't create required maildir directories for %s\n" \
                    %(section,))
                continue

        # right - we've got the directories, we've got the section, we know the
        # URL (the section name) - so let's fetch and deliver it
        parse_and_deliver(maildir, section, state_dir)