# rss2maildir.py - RSS feeds to Maildir - 1 email per item
# Copyright (C) 2007 Brett Parker <iDunno@sommitrealweird.co.uk>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

# standard library modules used throughout the script
import sys, os, stat, re, socket, string, random
import urllib, httplib, cgi, dbm, datetime, textwrap

import feedparser

from email.MIMEMultipart import MIMEMultipart
from email.MIMEText import MIMEText
from optparse import OptionParser
from ConfigParser import SafeConfigParser
from base64 import b64encode

if sys.version_info[0] == 2 and sys.version_info[1] >= 6:
    import hashlib as md5
else:
    import md5

from HTMLParser import HTMLParser


class HTML2Text(HTMLParser):
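    """Convert an HTML fragment into wrapped plain text.

    The output loosely follows reStructuredText conventions: headings are
    underlined, list items get bullet or number markers, link targets are
    collected in self.urls and emitted by gettext() as anonymous ``__ <url>``
    targets, and images are collected in self.images and emitted as
    ``.. |name| image:: <url>`` substitution definitions.

    Typical use (illustrative):

        parser = HTML2Text(textwidth=70)
        parser.feed(somehtml.encode("utf-8"))
        plaintext = parser.gettext()
    """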

    def __init__(self, textwidth=70):
        self.text = u''
        self.curdata = u''
        self.textwidth = textwidth
        self.opentags = []
        self.indentlevel = 0
        self.ignorenodata = False
        self.listcount = []
        self.urls = []
        self.images = {}
        HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
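        """Track an opening tag.

        Block-level tags flush any pending text via handle_curdata() and
        adjust indentation and list counters; <a>, <img> and <br> get special
        inline handling; unknown tags are ignored.
        """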
        tag_name = tag.lower()
        if tag_name in self.blockleveltags:
            # handle starting a new block - unless we're in a block element
            # that can contain other blocks, we'll assume that we want to close
            # the previous block
            if len(self.opentags) > 1 and self.opentags[-1] == u'li':
                self.handle_curdata()

            if tag_name == u'ol':
                self.handle_curdata()
                self.listcount.append(1)
                self.listlevel = len(self.listcount) - 1

            if tag_name == u'dl':
                self.indentlevel = self.indentlevel + 4

            if tag_name in self.liststarttags:
                smallist = self.opentags[-3:-1]
                smallist.reverse()
                for prev_listtag in smallist:
                    if prev_listtag in [u'dl', u'ol']:
                        self.indentlevel = self.indentlevel + 4
                        break
                    elif prev_listtag == u'ul':
                        self.indentlevel = self.indentlevel + 3
                        break

            if len(self.opentags) > 0:
                self.handle_curdata()
                if tag_name not in self.cancontainflow:
                    self.opentags.pop()
            self.opentags.append(tag_name)

        if tag_name == "span":
            # inline spans don't affect the layout
            return

        if tag_name == u'li' and len(self.listcount) > 0:
            # where the innermost numbered list is currently up to
            listcount = self.listcount[-1]

        if tag_name == u'dd' and len(self.opentags) > 1 \
                and self.opentags[-1] == u'dt':
            self.handle_curdata()
        elif tag_name == u'dt' and len(self.opentags) > 1 \
                and self.opentags[-1] == u'dd':
            self.handle_curdata()
        elif tag_name == u'a':
            for attr in attrs:
                if attr[0].lower() == u'href':
                    self.urls.append(attr[1].decode('utf-8'))
            self.curdata = self.curdata + u'`'
            self.opentags.append(tag_name)
        elif tag_name == u'img':
            self.handle_image(attrs)
        elif tag_name == u'br':
            self.handle_br()
        else:
            # we don't know the tag, so lets avoid handling it!
            return

    def handle_startendtag(self, tag, attrs):
        if tag.lower() == u'br':
            self.handle_br()
        elif tag.lower() == u'img':
            self.handle_image(attrs)

    def handle_br(self):
        self.handle_curdata()
        self.opentags.append(u'br')
        self.handle_curdata()
        self.opentags.pop()

    def handle_image(self, attrs):
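        """Note an <img> tag as a reST-style substitution reference.

        The alt text (or the URL when there is no alt text) becomes a |name|
        reference in the running text, and the URL is remembered in
        self.images so that gettext() can emit the matching
        ``.. |name| image:: url`` definition at the end of the document.
        """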
        alt = u''
        url = u''
        for attr in attrs:
            if attr[0] == 'alt':
                alt = attr[1].decode('utf-8')
            elif attr[0] == 'src':
                url = attr[1].decode('utf-8')

        if alt:
            if self.images.has_key(alt):
                if self.images[alt]["url"] == url:
                    self.curdata = self.curdata \
                        + u'|%s|' %(alt,)
                else:
                    while self.images.has_key(alt):
                        alt = alt + "_"
                    self.images[alt] = {"url": url}
                    self.curdata = self.curdata \
                        + u'|%s|' %(alt,)
            else:
                self.images[alt] = {"url": url}
                self.curdata = self.curdata \
                    + u'|%s|' %(alt,)
        else:
            if self.images.has_key(url):
                self.curdata = self.curdata \
                    + u'|%s|' %(url,)
            else:
                self.images[url] = {}
                self.images[url]["url"] = url
                self.curdata = self.curdata \
                    + u'|%s|' %(url,)

    def handle_curdata(self):
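        """Flush self.curdata as formatted text for the innermost open tag.

        This is where the actual layout happens: headings are underlined,
        paragraphs and blockquotes are re-wrapped to self.textwidth, list
        items get their markers and definition lists are indented.  The
        formatted result is appended to self.text.
        """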
        if len(self.opentags) == 0:
            return

        tag_thats_done = self.opentags[-1]

        if len(self.curdata) == 0:
            return

        if tag_thats_done == u'br':
            if len(self.text) == 0 or self.text[-1] != '\n':
                self.text = self.text + '\n'
            self.ignorenodata = True
            return

        if len(self.curdata.strip()) == 0:
            return

        if tag_thats_done in self.blockleveltags:
            newlinerequired = self.text != u''
            if self.ignorenodata:
                newlinerequired = False
            self.ignorenodata = False
            if newlinerequired:
                if tag_thats_done in [u'dt', u'dd', u'li'] \
                        and len(self.text) > 1 \
                        and self.text[-1] != u'\n':
                    self.text = self.text + u'\n'
                elif len(self.text) > 2 \
                        and self.text[-1] != u'\n' \
                        and self.text[-2] != u'\n':
                    self.text = self.text + u'\n\n'

        if tag_thats_done in ["h1", "h2", "h3", "h4", "h5", "h6"]:
            underlinechar = u'='
            headingtext = " ".join(self.curdata.split())
            seperator = u'\n' + u' ' * self.indentlevel
            headingtext = seperator.join( \
                textwrap.wrap( \
                    headingtext, \
                    self.textwidth - self.indentlevel \
                    ) \
                )

            if tag_thats_done == u'h2':
                underlinechar = u'-'
            elif tag_thats_done != u'h1':
                underlinechar = u'~'

            if u'\n' in headingtext:
                underline = u' ' * self.indentlevel \
                    + underlinechar * (self.textwidth - self.indentlevel)
            else:
                underline = u' ' * self.indentlevel \
                    + underlinechar * len(headingtext)
            self.text = self.text \
                + headingtext + u'\n' \
                + underline
        elif tag_thats_done in [u'p', u'div']:
            paragraph = unicode( \
                " ".join(self.curdata.strip().encode("utf-8").split()), \
                "utf-8")
            seperator = u'\n' + u' ' * self.indentlevel
            self.text = self.text \
                + u' ' * self.indentlevel \
                + seperator.join( \
                    textwrap.wrap( \
                        paragraph, self.textwidth - self.indentlevel))
        elif tag_thats_done == "pre":
            self.text = self.text + unicode( \
                self.curdata.encode("utf-8"), "utf-8")
        elif tag_thats_done == u'blockquote':
            quote = unicode( \
                " ".join(self.curdata.encode("utf-8").strip().split()), \
                "utf-8")
            seperator = u'\n' + u' ' * self.indentlevel + u'  '
            if len(self.text) > 0 and self.text[-1] != u'\n':
                self.text = self.text + u'\n'
            self.text = self.text \
                + u' ' * self.indentlevel + u'  ' \
                + seperator.join( \
                    textwrap.wrap(quote, \
                        self.textwidth - self.indentlevel - 2)) \
                + u'\n'
        elif tag_thats_done == "li":
            item = unicode(self.curdata.encode("utf-8").strip(), "utf-8")
            if len(self.text) > 0 and self.text[-1] != u'\n':
                self.text = self.text + u'\n'
            # work out if we're in an ol rather than a ul
            latesttags = self.opentags[-4:]
            latesttags.reverse()
            isul = True
            for thing in latesttags:
                if thing == u'ol':
                    isul = False
                    break
                elif thing == u'ul':
                    break

            if isul:
                listmarker = u' * '
            else:
                listmarker = u' %2d. ' %(self.listcount[-1])
                self.listcount[-1] = self.listcount[-1] + 1
            listindent = len(listmarker)

            seperator = u'\n' + u' ' * (self.indentlevel + listindent)
            self.text = self.text \
                + u' ' * self.indentlevel \
                + listmarker \
                + seperator.join( \
                    textwrap.wrap(item, \
                        self.textwidth - self.indentlevel - listindent)) \
                + u'\n'
        elif tag_thats_done == u'dt':
            definition = unicode(" ".join( \
                self.curdata.encode("utf-8").strip().split()), \
                "utf-8")
            if len(self.text) > 0 and self.text[-1] != u'\n':
                self.text = self.text + u'\n\n'
            elif len(self.text) > 1 and self.text[-2] != u'\n':
                self.text = self.text + u'\n'
            definition = u' ' * (self.indentlevel - 4) + definition + "::"
            indentstring = u'\n' + u' ' * (self.indentlevel - 3)
            self.text = self.text \
                + indentstring.join( \
                    textwrap.wrap(definition, \
                        self.textwidth - self.indentlevel - 4))
        elif tag_thats_done == u'dd':
            definition = unicode(" ".join( \
                self.curdata.encode("utf-8").strip().split()),
                "utf-8")
            if len(definition) > 0:
                if len(self.text) > 0 and self.text[-1] != u'\n':
                    self.text = self.text + u'\n'
                indentstring = u'\n' + u' ' * self.indentlevel
                self.text = self.text \
                    + u' ' * self.indentlevel \
                    + indentstring.join( \
                        textwrap.wrap( \
                            definition, \
                            self.textwidth - self.indentlevel \
                            )) \
                    + u'\n'
        elif tag_thats_done == u'a':
            self.curdata = self.curdata + u'`__'
        elif tag_thats_done in self.liststarttags:
            pass

        if tag_thats_done in self.blockleveltags:
            self.curdata = u''

        self.ignorenodata = False

    def handle_endtag(self, tag):
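        """Close a tag: flush pending data and unwind self.opentags.

        Closing a list tag also undoes the indentation added when it was
        opened.  Because feeds often contain unbalanced HTML, the tag being
        closed is searched for in opentags rather than assumed to be the most
        recently opened one.
        """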
        self.ignorenodata = False

        if tag not in self.opentags:
            return

        tagindex = self.opentags.index(tag)

        if tag in [u'br', u'img']:
            return

        if tag == u'dl':
            self.indentlevel = self.indentlevel - 4

        if tag in self.liststarttags:
            if tag in [u'ol', u'dl', u'ul', u'dd']:
                self.handle_curdata()
                # find if there was a previous list level
                smalllist = self.opentags[:-1]
                smalllist.reverse()
                for prev_listtag in smalllist:
                    if prev_listtag in [u'ol', u'dl']:
                        self.indentlevel = self.indentlevel - 4
                        break
                    elif prev_listtag == u'ul':
                        self.indentlevel = self.indentlevel - 3
                        break

            if tag == u'ol':
                self.listcount = self.listcount[:-1]

        while tagindex < len(self.opentags) \
                and tag in self.opentags[tagindex+1:]:
            try:
                tagindex = self.opentags.index(tag, tagindex+1)
            except:
                # well, we don't want to do that then
                break

        if tagindex != len(self.opentags) - 1:
            # Assuming the data was for the last opened tag first
            self.handle_curdata()
            # Now kill the list to be a slice before this tag was opened
            self.opentags = self.opentags[:tagindex + 1]
        else:
            self.handle_curdata()
            if self.opentags[-1] == tag:
                self.opentags.pop()

    def handle_data(self, data):
        if len(self.opentags) == 0:
            self.opentags.append(u'p')
        self.curdata = self.curdata + data.decode("utf-8")

    def handle_charref(self, name):
        try:
            entity = unichr(int(name))
        except ValueError:
            # not a decimal reference - try it as hex (e.g. &#x2019;)
            try:
                entity = unichr(int('0%s' %(name,), 16))
            except ValueError:
                entity = u'#%s' %(name,)
            except OverflowError:
                entity = u'#%s' %(name,)
        self.curdata = self.curdata + unicode(entity.encode('utf-8'), \
            'utf-8')

    def handle_entityref(self, name):
        if HTML2Text.entities.has_key(name):
            entity = HTML2Text.entities[name]
        else:
            entity = "&" + name + ";"
        self.curdata = self.curdata + unicode(entity.encode('utf-8'), \
            'utf-8')

    def gettext(self):
        self.handle_curdata()
        if len(self.text) == 0 or self.text[-1] != u'\n':
            self.text = self.text + u'\n'
        if len(self.text) > 0:
            while len(self.text) > 1 and self.text[-1] == u'\n':
                self.text = self.text[:-1]
            self.text = self.text + u'\n'
        if len(self.urls) > 0:
            self.text = self.text + u'\n__ ' + u'\n__ '.join(self.urls) + u'\n'
        if len(self.images.keys()) > 0:
            self.text = self.text + u'\n.. ' \
                + u'\n.. '.join( \
                    ["|%s| image:: %s" %(a, self.images[a]["url"]) \
                        for a in self.images.keys()]) + u'\n'
        return self.text


def open_url(method, url):
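    """Request `url` with the given HTTP method, following up to 3 redirects.

    Returns the httplib response for a successful (200) request; callers
    treat a false return value as a failed or unavailable fetch.
    """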
    redirectcount = 0
    while redirectcount < 3:
        (type, rest) = urllib.splittype(url)
        (host, path) = urllib.splithost(rest)
        (host, port) = urllib.splitport(host)
        if port == None:
            port = 80
        conn = httplib.HTTPConnection("%s:%s" %(host, port))
        conn.request(method, path)
        response = conn.getresponse()
        if response.status in [301, 302, 303, 307]:
            headers = response.getheaders()
            for header in headers:
                if header[0] == "location":
                    url = header[1]
        elif response.status == 200:
            return response
        redirectcount = redirectcount + 1
    return None


def parse_and_deliver(maildir, url, statedir):
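    """Fetch one feed and deliver any unseen items to `maildir`.

    State is kept in two dbm files under `statedir`: "feeds" caches the HTTP
    headers of the last successful fetch (so an unchanged feed can be skipped
    after a cheap HEAD request), and "seen" records every delivered item,
    keyed on the feed URL plus the item's guid and link, so items are only
    mailed once unless their content changes.
    """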
    # first check if we know about this feed already
    feeddb = dbm.open(os.path.join(statedir, "feeds"), "c")
    if feeddb.has_key(url):
        data = feeddb[url]
        data = cgi.parse_qs(data)
        response = open_url("HEAD", url)
        if not response:
            return
        headers = response.getheaders()

        # compare the cached headers with the current ones - only refetch
        # the feed if something looks different
        ischanged = False
        for header in headers:
            if header[0] == "content-length":
                if header[1] != data["content-length"][0]:
                    ischanged = True
            elif header[0] == "etag":
                if header[1] != data["etag"][0]:
                    ischanged = True
            elif header[0] == "last-modified":
                if header[1] != data["last-modified"][0]:
                    ischanged = True
            elif header[0] == "content-md5":
                if header[1] != data["content-md5"][0]:
                    ischanged = True

        if ischanged:
            response = open_url("GET", url)
            if response:
                headers = response.getheaders()
                feedhandle = response
            else:
                sys.stderr.write("Failed to fetch feed: %s\n" %(url))
                return
        else:
            return # don't need to do anything, nothing's changed.
    else:
        response = open_url("GET", url)
        if response:
            headers = response.getheaders()
            feedhandle = response
        else:
            sys.stderr.write("Failed to fetch feed: %s\n" %(url))
            return

    fp = feedparser.parse(feedhandle)
    db = dbm.open(os.path.join(statedir, "seen"), "c")
    for item in fp["items"]:
        # have we seen it before?
        # need to work out what the content is first...
        if item.has_key("content"):
            content = item["content"][0]["value"]
        else:
            if item.has_key("description"):
                content = item["description"]
            else:
                content = u''

        md5sum = md5.md5(content.encode("utf-8")).hexdigest()

        prevmessageid = None

        db_link_key = (url + u'|' + item["link"]).encode("utf-8")
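
        # Items are deduplicated against the "seen" database: each record is
        # keyed on "<feed url>|<guid>" and "<feed url>|<link>" and stores a
        # urlencoded message-id, creation date and md5 of the content, so an
        # unchanged item is skipped and a changed one can be threaded onto
        # the mail we sent previously.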
        # check if there's a guid too - if that exists and we match the md5,
        # we're done
        if item.has_key("guid"):
            db_guid_key = (url + u'|' + item["guid"]).encode("utf-8")
            if db.has_key(db_guid_key):
                data = db[db_guid_key]
                data = cgi.parse_qs(data)
                if data["contentmd5"][0] == md5sum:
                    continue

        if db.has_key(db_link_key):
            data = db[db_link_key]
            data = cgi.parse_qs(data)
            if data.has_key("message-id"):
                prevmessageid = data["message-id"][0]
            if data["contentmd5"][0] == md5sum:
                continue

        if item.has_key("author"):
            author = item["author"]
        else:
            author = url

        # create a basic email message
        msg = MIMEMultipart("alternative")
        messageid = "<" \
            + datetime.datetime.now().strftime("%Y%m%d%H%M") \
            + "." \
            + "".join( \
                [random.choice( \
                    string.ascii_letters + string.digits \
                    ) for a in range(0,6) \
                ]) + "@" + socket.gethostname() + ">"
        msg.add_header("Message-ID", messageid)
        msg.set_unixfrom("\"%s\" <rss2maildir@localhost>" %(url))
        msg.add_header("From", "\"%s\" <rss2maildir@localhost>" %(author))
        msg.add_header("To", "\"%s\" <rss2maildir@localhost>" %(url))
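
        # If we've mailed an earlier version of this item, thread the new
        # mail onto it via the References header so mail clients group the
        # update with the original.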
        if prevmessageid:
            msg.add_header("References", prevmessageid)

        createddate = datetime.datetime.now() \
            .strftime("%a, %e %b %Y %T -0000")
        try:
            createddate = datetime.datetime(*item["updated_parsed"][0:6]) \
                .strftime("%a, %e %b %Y %T -0000")
        except:
            pass
        msg.add_header("Date", createddate)
        msg.add_header("X-rss2maildir-rundate", datetime.datetime.now() \
            .strftime("%a, %e %b %Y %T -0000"))

        subj_gen = HTML2Text()
        title = item["title"]
        title = re.sub(u'<', u'&lt;', title)
        title = re.sub(u'>', u'&gt;', title)
        subj_gen.feed(title.encode("utf-8"))
        msg.add_header("Subject", subj_gen.gettext())
        msg.set_default_type("text/plain")

        htmlcontent = content
        htmlcontent = "%s\n\n<p>Item URL: <a href='%s'>%s</a></p>" %( \
            htmlcontent, \
            item["link"], \
            item["link"] )
        htmlpart = MIMEText(htmlcontent.encode("utf-8"), "html", "utf-8")

        textparser = HTML2Text()
        textparser.feed(content.encode("utf-8"))
        textcontent = textparser.gettext()
        textcontent = "%s\n\nItem URL: %s" %( \
            textcontent, \
            item["link"] )
        textpart = MIMEText(textcontent.encode("utf-8"), "plain", "utf-8")

        msg.attach(textpart)
        msg.attach(htmlpart)

        # start by working out the filename we should be writing to, we do
        # this following the normal maildir style rules
        fname = str(os.getpid()) \
            + "." + socket.gethostname() \
            + "." + "".join( \
                [random.choice( \
                    string.ascii_letters + string.digits \
                    ) for a in range(0,10) \
                ]) + "." \
            + datetime.datetime.now().strftime('%s')
        fn = os.path.join(maildir, "tmp", fname)
        fh = open(fn, "w")
        fh.write(msg.as_string())
        fh.close()

        # now move it in to the new directory
        newfn = os.path.join(maildir, "new", fname)
        os.link(fn, newfn)
        os.unlink(fn)

        # now add to the database about the item
        if prevmessageid:
            messageid = prevmessageid + " " + messageid
        if item.has_key("guid") and item["guid"] != item["link"]:
            data = urllib.urlencode(( \
                ("message-id", messageid), \
                ("created", createddate), \
                ("contentmd5", md5sum) \
                ))
            db[db_guid_key] = data
            try:
                data = db[db_link_key]
                data = cgi.parse_qs(data)
                newdata = urllib.urlencode(( \
                    ("message-id", messageid), \
                    ("created", data["created"][0]), \
                    ("contentmd5", data["contentmd5"][0]) \
                    ))
                db[db_link_key] = newdata
            except:
                db[db_link_key] = data
        else:
            data = urllib.urlencode(( \
                ("message-id", messageid), \
                ("created", createddate), \
                ("contentmd5", md5sum) \
                ))
            db[db_link_key] = data

    # remember the headers of this fetch so the next run can skip the feed
    # if nothing has changed
    data = []
    for header in headers:
        if header[0] in \
            ["content-md5", "etag", "last-modified", "content-length"]:
            data.append((header[0], header[1]))
    data = urllib.urlencode(data)
    feeddb[url] = data

    db.close()
    feeddb.close()


if __name__ == "__main__":
    # This only gets executed if we really called the program
    # first off, parse the command line arguments

    oparser = OptionParser()
    oparser.add_option(
        "-c", "--conf", dest="conf",
        help="location of config file"
        )
    oparser.add_option(
        "-s", "--statedir", dest="statedir",
        help="location of directory to store state in"
        )
    (options, args) = oparser.parse_args()

    # check for the configfile
    configfile = None

    if options.conf != None:
        # does the file exist?
        try:
            os.stat(options.conf)
            configfile = options.conf
        except:
            # should exit here as the specified file doesn't exist
            sys.stderr.write( \
                "Config file %s does not exist. Exiting.\n" %(options.conf,))
            sys.exit(2)
    else:
        # check through the default locations
        try:
            os.stat("%s/.rss2maildir.conf" %(os.environ["HOME"],))
            configfile = "%s/.rss2maildir.conf" %(os.environ["HOME"],)
        except:
            try:
                os.stat("/etc/rss2maildir.conf")
                configfile = "/etc/rss2maildir.conf"
            except:
                sys.stderr.write("No config file found. Exiting.\n")
                sys.exit(2)

    # Right - if we've got this far, we've got a config file, now for the hard
    # bits
    scp = SafeConfigParser()
    scp.read(configfile)

    maildir_root = "RSSMaildir"
    state_dir = "state"

    if options.statedir != None:
        state_dir = options.statedir
        try:
            mode = os.stat(state_dir)[stat.ST_MODE]
            if not stat.S_ISDIR(mode):
                sys.stderr.write( \
                    "State directory (%s) is not a directory\n" %(state_dir))
                sys.exit(1)
        except:
            # try to make the directory
            try:
                os.mkdir(state_dir)
            except:
                sys.stderr.write("Couldn't create statedir %s\n" %(state_dir))
                sys.exit(1)
    elif scp.has_option("general", "state_dir"):
        new_state_dir = scp.get("general", "state_dir")
        try:
            mode = os.stat(new_state_dir)[stat.ST_MODE]
            if not stat.S_ISDIR(mode):
                sys.stderr.write( \
                    "State directory (%s) is not a directory\n" \
                    %(new_state_dir))
                sys.exit(1)
            else:
                state_dir = new_state_dir
        except:
            # try to create it
            try:
                os.mkdir(new_state_dir)
                state_dir = new_state_dir
            except:
                sys.stderr.write( \
                    "Couldn't create state directory %s\n" %(new_state_dir))
                sys.exit(1)
    else:
        try:
            mode = os.stat(state_dir)[stat.ST_MODE]
            if not stat.S_ISDIR(mode):
                sys.stderr.write( \
                    "State directory %s is not a directory\n" %(state_dir))
                sys.exit(1)
        except:
            try:
                os.mkdir(state_dir)
            except:
                sys.stderr.write( \
                    "State directory %s could not be created\n" %(state_dir))
                sys.exit(1)

    if scp.has_option("general", "maildir_root"):
        maildir_root = scp.get("general", "maildir_root")

    try:
        mode = os.stat(maildir_root)[stat.ST_MODE]
        if not stat.S_ISDIR(mode):
            sys.stderr.write("Maildir Root %s is not a directory\n" \
                %(maildir_root))
            sys.exit(1)
    except:
        try:
            os.mkdir(maildir_root)
        except:
            sys.stderr.write("Couldn't create Maildir Root %s\n" \
                %(maildir_root))
            sys.exit(1)

    feeds = scp.sections()
    try:
        feeds.remove("general")
    except:
        pass
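
    # An illustrative config file (~/.rss2maildir.conf or
    # /etc/rss2maildir.conf) - every section other than [general] is treated
    # as a feed URL, optionally with a "maildir" option naming the folder to
    # deliver into under maildir_root:
    #
    #   [general]
    #   state_dir = /home/user/.rss2maildir/state
    #   maildir_root = /home/user/RSSMaildir
    #
    #   [http://example.org/feed.rss]
    #   maildir = example-feed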

    for section in feeds:
        # check if the directory exists
        maildir = None
        try:
            maildir = scp.get(section, "maildir")
        except:
            maildir = section

        maildir = urllib.urlencode(((section, maildir),)).split("=")[1]
        maildir = os.path.join(maildir_root, maildir)

        try:
            exists = os.stat(maildir)
            if stat.S_ISDIR(exists[stat.ST_MODE]):
                # check if there's a new, cur and tmp directory
                try:
                    mode = os.stat(os.path.join(maildir, "cur"))[stat.ST_MODE]
                    if not stat.S_ISDIR(mode):
                        sys.stderr.write("Broken maildir: %s\n" %(maildir))
                except:
                    os.mkdir(os.path.join(maildir, "cur"))
                try:
                    mode = os.stat(os.path.join(maildir, "tmp"))[stat.ST_MODE]
                    if not stat.S_ISDIR(mode):
                        sys.stderr.write("Broken maildir: %s\n" %(maildir))
                except:
                    os.mkdir(os.path.join(maildir, "tmp"))
                try:
                    mode = os.stat(os.path.join(maildir, "new"))[stat.ST_MODE]
                    if not stat.S_ISDIR(mode):
                        sys.stderr.write("Broken maildir: %s\n" %(maildir))
                except:
                    os.mkdir(os.path.join(maildir, "new"))
            else:
                sys.stderr.write("Broken maildir: %s\n" %(maildir))
        except:
            try:
                os.mkdir(maildir)
            except:
                sys.stderr.write("Couldn't create root maildir %s\n" \
                    %(maildir))
                continue
            try:
                os.mkdir(os.path.join(maildir, "new"))
                os.mkdir(os.path.join(maildir, "cur"))
                os.mkdir(os.path.join(maildir, "tmp"))
            except:
                sys.stderr.write( \
                    "Couldn't create required maildir directories for %s\n" \
                    %(section,))
                continue

        # right - we've got the directories, we've got the section, we know the
        # url... lets play!
        parse_and_deliver(maildir, section, state_dir)