4 # rss2maildir.py - RSS feeds to Maildir, one email per item
5 # Copyright (C) 2007 Brett Parker <iDunno@sommitrealweird.co.uk>
7 # This program is free software: you can redistribute it and/or modify
8 # it under the terms of the GNU General Public License as published by
9 # the Free Software Foundation, either version 3 of the License, or
10 # (at your option) any later version.
12 # This program is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 # GNU General Public License for more details.
17 # You should have received a copy of the GNU General Public License
18 # along with this program. If not, see <http://www.gnu.org/licenses/>.
28 from email.MIMEMultipart import MIMEMultipart
29 from email.MIMEText import MIMEText
38 from optparse import OptionParser
39 from ConfigParser import SafeConfigParser
41 from base64 import b64encode
47 from HTMLParser import HTMLParser
49 class HTML2Text(HTMLParser):
207 def __init__(self,textwidth=70):
210 self.textwidth = textwidth
213 self.ignorenodata = False
217 HTMLParser.__init__(self)
219 def handle_starttag(self, tag, attrs):
220 tag_name = tag.lower()
221 if tag_name in self.blockleveltags:
222 # handle starting a new block - unless we're in a block element
223 # that can contain other blocks, we'll assume that we want to close
225 if len(self.opentags) > 1 and self.opentags[-1] == u'li':
226 self.handle_curdata()
228 if tag_name == u'ol':
229 self.handle_curdata()
230 self.listcount.append(1)
231 self.listlevel = len(self.listcount) - 1
233 if tag_name == u'dl':
234 self.indentlevel = self.indentlevel + 4
236 if tag_name in self.liststarttags:
237 smallist = self.opentags[-3:-1]
239 for prev_listtag in smallist:
240 if prev_listtag in [u'dl', u'ol']:
241 self.indentlevel = self.indentlevel + 4
243 elif prev_listtag == u'ul':
244 self.indentlevel = self.indentlevel + 3
247 if len(self.opentags) > 0:
248 self.handle_curdata()
249 if tag_name not in self.cancontainflow:
251 self.opentags.append(tag_name)
253 if tag_name == "span":
257 listcount = self.listcount[-1]
261 if tag_name == u'dd' and len(self.opentags) > 1 \
262 and self.opentags[-1] == u'dt':
263 self.handle_curdata()
265 elif tag_name == u'dt' and len(self.opentags) > 1 \
266 and self.opentags[-1] == u'dd':
267 self.handle_curdata()
269 elif tag_name == u'a':
271 if attr[0].lower() == u'href':
272 self.urls.append(attr[1].decode('utf-8'))
273 self.curdata = self.curdata + u'`'
274 self.opentags.append(tag_name)
276 elif tag_name == u'img':
277 self.handle_image(attrs)
279 elif tag_name == u'br':
283 # we don't know the tag, so lets avoid handling it!
286 def handle_startendtag(self, tag, attrs):
287 if tag.lower() == u'br':
289 elif tag.lower() == u'img':
290 self.handle_image(attrs)
294 self.handle_curdata()
295 self.opentags.append(u'br')
296 self.handle_curdata()
299 def handle_image(self, attrs):
304 alt = attr[1].decode('utf-8')
305 elif attr[0] == 'src':
306 url = attr[1].decode('utf-8')
309 if self.images.has_key(alt):
310 if self.images[alt]["url"] == url:
311 self.curdata = self.curdata \
314 while self.images.has_key(alt):
316 self.images[alt] = {"url": url}
317 self.curdata = self.curdata \
320 self.images[alt] = {"url": url}
321 self.curdata = self.curdata \
324 if self.images.has_key(url):
325 self.curdata = self.curdata \
328 self.images[url] = {}
329 self.images[url]["url"] =url
330 self.curdata = self.curdata \
333 def handle_curdata(self):
335 if len(self.opentags) == 0:
338 tag_thats_done = self.opentags[-1]
340 if len(self.curdata) == 0:
343 if tag_thats_done == u'br':
344 if len(self.text) == 0 or self.text[-1] != '\n':
345 self.text = self.text + '\n'
346 self.ignorenodata = True
349 if len(self.curdata.strip()) == 0:
352 if tag_thats_done in self.blockleveltags:
353 newlinerequired = self.text != u''
354 if self.ignorenodata:
355 newlinerequired = False
356 self.ignorenodata = False
358 if tag_thats_done in [u'dt', u'dd', u'li'] \
359 and len(self.text) > 1 \
360 and self.text[-1] != u'\n':
361 self.text = self.text + u'\n'
362 elif len(self.text) > 2 \
363 and self.text[-1] != u'\n' \
364 and self.text[-2] != u'\n':
365 self.text = self.text + u'\n\n'
367 if tag_thats_done in ["h1", "h2", "h3", "h4", "h5", "h6"]:
370 headingtext = " ".join(self.curdata.split())
371 seperator = u'\n' + u' '*self.indentlevel
372 headingtext = seperator.join( \
375 self.textwidth - self.indentlevel \
379 if tag_thats_done == u'h2':
381 elif tag_thats_done != u'h1':
384 if u'\n' in headingtext:
385 underline = u' ' * self.indentlevel \
386 + underlinechar * (self.textwidth - self.indentlevel)
388 underline = u' ' * self.indentlevel \
389 + underlinechar * len(headingtext)
390 self.text = self.text \
391 + headingtext + u'\n' \
393 elif tag_thats_done in [u'p', u'div']:
394 paragraph = unicode( \
395 " ".join(self.curdata.strip().encode("utf-8").split()), \
397 seperator = u'\n' + u' ' * self.indentlevel
398 self.text = self.text \
399 + u' ' * self.indentlevel \
402 paragraph, self.textwidth - self.indentlevel))
403 elif tag_thats_done == "pre":
404 self.text = self.text + unicode( \
405 self.curdata.encode("utf-8"), "utf-8")
406 elif tag_thats_done == u'blockquote':
408 " ".join(self.curdata.encode("utf-8").strip().split()), \
410 seperator = u'\n' + u' ' * self.indentlevel + u' '
411 if len(self.text) > 0 and self.text[-1] != u'\n':
412 self.text = self.text + u'\n'
413 self.text = self.text \
418 self.textwidth - self.indentlevel - 2 \
422 elif tag_thats_done == "li":
423 item = unicode(self.curdata.encode("utf-8").strip(), "utf-8")
424 if len(self.text) > 0 and self.text[-1] != u'\n':
425 self.text = self.text + u'\n'
426 # work out if we're in an ol rather than a ul
427 latesttags = self.opentags[-4:]
430 for thing in latesttags:
444 listmarker = u' %2d. ' %(self.listcount[-1])
445 self.listcount[-1] = self.listcount[-1] + 1
448 + u' ' * self.indentlevel \
450 self.text = self.text \
451 + u' ' * self.indentlevel \
456 self.textwidth - self.indentlevel - listindent \
460 elif tag_thats_done == u'dt':
461 definition = unicode(" ".join( \
462 self.curdata.encode("utf-8").strip().split()), \
464 if len(self.text) > 0 and self.text[-1] != u'\n':
465 self.text = self.text + u'\n\n'
466 elif len(self.text) > 1 and self.text[-2] != u'\n':
467 self.text = self.text + u'\n'
468 definition = u' ' * (self.indentlevel - 4) + definition + "::"
469 indentstring = u'\n' + u' ' * (self.indentlevel - 3)
470 self.text = self.text \
472 textwrap.wrap(definition, \
473 self.textwidth - self.indentlevel - 4))
475 elif tag_thats_done == u'dd':
476 definition = unicode(" ".join( \
477 self.curdata.encode("utf-8").strip().split()),
479 if len(definition) > 0:
480 if len(self.text) > 0 and self.text[-1] != u'\n':
481 self.text = self.text + u'\n'
482 indentstring = u'\n' + u' ' * self.indentlevel
483 self.text = self.text \
485 + indentstring.join( \
488 self.textwidth - self.indentlevel \
492 elif tag_thats_done == u'a':
493 self.curdata = self.curdata + u'`__'
495 elif tag_thats_done in self.liststarttags:
498 if tag_thats_done in self.blockleveltags:
501 self.ignorenodata = False
503 def handle_endtag(self, tag):
504 self.ignorenodata = False
509 tagindex = self.opentags.index(tag)
514 if tag in [u'br', u'img']:
518 self.indentlevel = self.indentlevel - 4
520 if tag in self.liststarttags:
521 if tag in [u'ol', u'dl', u'ul', u'dd']:
522 self.handle_curdata()
523 # find if there was a previous list level
524 smalllist = self.opentags[:-1]
526 for prev_listtag in smalllist:
527 if prev_listtag in [u'ol', u'dl']:
528 self.indentlevel = self.indentlevel - 4
530 elif prev_listtag == u'ul':
531 self.indentlevel = self.indentlevel - 3
535 self.listcount = self.listcount[:-1]
537 while tagindex < len(self.opentags) \
538 and tag in self.opentags[tagindex+1:]:
540 tagindex = self.opentags.index(tag, tagindex+1)
542 # well, we don't want to do that then
544 if tagindex != len(self.opentags) - 1:
545 # Assuming the data was for the last opened tag first
546 self.handle_curdata()
547 # Now kill the list to be a slice before this tag was opened
548 self.opentags = self.opentags[:tagindex + 1]
550 self.handle_curdata()
551 if self.opentags[-1] == tag:
554 def handle_data(self, data):
555 if len(self.opentags) == 0:
556 self.opentags.append(u'p')
557 self.curdata = self.curdata + data.decode("utf-8")
559 def handle_entityref(self, name):
561 if HTML2Text.entities.has_key(name):
562 entity = HTML2Text.entities[name]
564 entity = unichr(int(name[1:]))
566 entity = "&" + name + ";"
568 self.curdata = self.curdata + unicode(entity.encode('utf-8'), \
572 self.handle_curdata()
573 if len(self.text) == 0 or self.text[-1] != u'\n':
574 self.text = self.text + u'\n'
576 if len(self.text) > 0:
577 while len(self.text) > 1 and self.text[-1] == u'\n':
578 self.text = self.text[:-1]
579 self.text = self.text + u'\n'
580 if len(self.urls) > 0:
581 self.text = self.text + u'\n__ ' + u'\n__ '.join(self.urls) + u'\n'
583 if len(self.images.keys()) > 0:
584 self.text = self.text + u'\n.. ' \
586 ["|%s| image:: %s" %(a, self.images[a]["url"]) \
587 for a in self.images.keys()]) + u'\n'
591 def open_url(method, url):
593 while redirectcount < 3:
594 (type, rest) = urllib.splittype(url)
595 (host, path) = urllib.splithost(rest)
596 (host, port) = urllib.splitport(host)
600 conn = httplib.HTTPConnection("%s:%s" %(host, port))
601 conn.request(method, path)
602 response = conn.getresponse()
603 if response.status in [301, 302, 303, 307]:
604 headers = response.getheaders()
605 for header in headers:
606 if header[0] == "location":
608 elif response.status == 200:
612 redirectcount = redirectcount + 1
615 def parse_and_deliver(maildir, url, statedir):
618 # first check if we know about this feed already
619 feeddb = dbm.open(os.path.join(statedir, "feeds"), "c")
620 if feeddb.has_key(url):
622 data = cgi.parse_qs(data)
623 response = open_url("HEAD", url)
626 headers = response.getheaders()
629 for header in headers:
630 if header[0] == "content-length":
631 if header[1] != data["content-length"][0]:
633 elif header[0] == "etag":
634 if header[1] != data["etag"][0]:
636 elif header[0] == "last-modified":
637 if header[1] != data["last-modified"][0]:
639 elif header[0] == "content-md5":
640 if header[1] != data["content-md5"][0]:
645 response = open_url("GET", url)
647 headers = response.getheaders()
648 feedhandle = response
650 sys.stderr.write("Failed to fetch feed: %s\n" %(url))
653 return # don't need to do anything, nothings changed.
655 response = open_url("GET", url)
657 headers = response.getheaders()
658 feedhandle = response
660 sys.stderr.write("Failed to fetch feed: %s\n" %(url))
663 fp = feedparser.parse(feedhandle)
664 db = dbm.open(os.path.join(statedir, "seen"), "c")
665 for item in fp["items"]:
666 # have we seen it before?
667 # need to work out what the content is first...
669 if item.has_key("content"):
670 content = item["content"][0]["value"]
672 content = item["summary"]
674 md5sum = md5.md5(content.encode("utf-8")).hexdigest()
678 # check if there's a guid too - if that exists and we match the md5,
680 if item.has_key("guid"):
681 if db.has_key(url + "|" + item["guid"]):
682 data = db[url + "|" + item["guid"]]
683 data = cgi.parse_qs(data)
684 if data["contentmd5"][0] == md5sum:
687 if db.has_key(url + "|" + item["link"]):
688 data = db[url + "|" + item["link"]]
689 data = cgi.parse_qs(data)
690 if data.has_key("message-id"):
691 prevmessageid = data["message-id"][0]
692 if data["contentmd5"][0] == md5sum:
696 author = item["author"]
700 # create a basic email message
701 msg = MIMEMultipart("alternative")
703 + datetime.datetime.now().strftime("%Y%m%d%H%M") \
707 string.ascii_letters + string.digits \
708 ) for a in range(0,6) \
709 ]) + "@" + socket.gethostname() + ">"
710 msg.add_header("Message-ID", messageid)
711 msg.set_unixfrom("\"%s\" <rss2maildir@localhost>" %(url))
712 msg.add_header("From", "\"%s\" <rss2maildir@localhost>" %(author))
713 msg.add_header("To", "\"%s\" <rss2maildir@localhost>" %(url))
715 msg.add_header("References", prevmessageid)
716 createddate = datetime.datetime.now() \
717 .strftime("%a, %e %b %Y %T -0000")
719 createddate = datetime.datetime(*item["updated_parsed"][0:6]) \
720 .strftime("%a, %e %b %Y %T -0000")
723 msg.add_header("Date", createddate)
724 subj_gen = HTML2Text()
725 subj_gen.feed(item["title"].encode("utf-8"))
726 msg.add_header("Subject", subj_gen.gettext())
727 msg.set_default_type("text/plain")
729 htmlcontent = content.encode("utf-8")
730 htmlcontent = "%s\n\n<p>Item URL: <a href='%s'>%s</a></p>" %( \
734 htmlpart = MIMEText(htmlcontent.encode("utf-8"), "html", "utf-8")
735 textparser = HTML2Text()
736 textparser.feed(content.encode("utf-8"))
737 textcontent = textparser.gettext()
738 textcontent = "%s\n\nItem URL: %s" %( \
741 textpart = MIMEText(textcontent.encode("utf-8"), "plain", "utf-8")
745 # start by working out the filename we should be writting to, we do
746 # this following the normal maildir style rules
747 fname = str(os.getpid()) \
748 + "." + socket.gethostname() \
751 string.ascii_letters + string.digits \
752 ) for a in range(0,10) \
754 + datetime.datetime.now().strftime('%s')
755 fn = os.path.join(maildir, "tmp", fname)
757 fh.write(msg.as_string())
759 # now move it in to the new directory
760 newfn = os.path.join(maildir, "new", fname)
764 # now add to the database about the item
766 messageid = prevmessageid + " " + messageid
767 if item.has_key("guid") and item["guid"] != item["link"]:
768 data = urllib.urlencode(( \
769 ("message-id", messageid), \
770 ("created", createddate), \
771 ("contentmd5", md5sum) \
773 db[url + "|" + item["guid"]] = data
775 data = db[url + "|" + item["link"]]
776 data = cgi.parse_qs(data)
777 newdata = urllib.urlencode(( \
778 ("message-id", messageid), \
779 ("created", data["created"][0]), \
780 ("contentmd5", data["contentmd5"][0]) \
782 db[url + "|" + item["link"]] = newdata
784 db[url + "|" + item["link"]] = data
786 data = urllib.urlencode(( \
787 ("message-id", messageid), \
788 ("created", createddate), \
789 ("contentmd5", md5sum) \
791 db[url + "|" + item["link"]] = data
795 for header in headers:
797 ["content-md5", "etag", "last-modified", "content-length"]:
798 data.append((header[0], header[1]))
800 data = urllib.urlencode(data)
806 if __name__ == "__main__":
807 # This only gets executed if we really called the program
808 # first off, parse the command line arguments
810 oparser = OptionParser()
812 "-c", "--conf", dest="conf",
813 help="location of config file"
816 "-s", "--statedir", dest="statedir",
817 help="location of directory to store state in"
820 (options, args) = oparser.parse_args()
822 # check for the configfile
826 if options.conf != None:
827 # does the file exist?
829 os.stat(options.conf)
830 configfile = options.conf
832 # should exit here as the specified file doesn't exist
834 "Config file %s does not exist. Exiting.\n" %(options.conf,))
837 # check through the default locations
839 os.stat("%s/.rss2maildir.conf" %(os.environ["HOME"],))
840 configfile = "%s/.rss2maildir.conf" %(os.environ["HOME"],)
843 os.stat("/etc/rss2maildir.conf")
844 configfile = "/etc/rss2maildir.conf"
846 sys.stderr.write("No config file found. Exiting.\n")
849 # Right - if we've got this far, we've got a config file, now for the hard
852 scp = SafeConfigParser()
855 maildir_root = "RSSMaildir"
858 if options.statedir != None:
859 state_dir = options.statedir
861 mode = os.stat(state_dir)[stat.ST_MODE]
862 if not stat.S_ISDIR(mode):
864 "State directory (%s) is not a directory\n" %(state_dir))
867 # try to make the directory
871 sys.stderr.write("Couldn't create statedir %s" %(state_dir))
873 elif scp.has_option("general", "state_dir"):
874 new_state_dir = scp.get("general", "state_dir")
876 mode = os.stat(new_state_dir)[stat.ST_MODE]
877 if not stat.S_ISDIR(mode):
879 "State directory (%s) is not a directory\n" %(state_dir))
882 state_dir = new_state_dir
886 os.mkdir(new_state_dir)
887 state_dir = new_state_dir
890 "Couldn't create state directory %s\n" %(new_state_dir))
894 mode = os.stat(state_dir)[stat.ST_MODE]
895 if not stat.S_ISDIR(mode):
897 "State directory %s is not a directory\n" %(state_dir))
904 "State directory %s could not be created\n" %(state_dir))
907 if scp.has_option("general", "maildir_root"):
908 maildir_root = scp.get("general", "maildir_root")
911 mode = os.stat(maildir_root)[stat.ST_MODE]
912 if not stat.S_ISDIR(mode):
914 "Maildir Root %s is not a directory\n" \
919 os.mkdir(maildir_root)
921 sys.stderr.write("Couldn't create Maildir Root %s\n" \
925 feeds = scp.sections()
927 feeds.remove("general")
931 for section in feeds:
932 # check if the directory exists
935 maildir = scp.get(section, "maildir")
939 maildir = urllib.urlencode(((section, maildir),)).split("=")[1]
940 maildir = os.path.join(maildir_root, maildir)
943 exists = os.stat(maildir)
944 if stat.S_ISDIR(exists[stat.ST_MODE]):
945 # check if there's a new, cur and tmp directory
947 mode = os.stat(os.path.join(maildir, "cur"))[stat.ST_MODE]
949 os.mkdir(os.path.join(maildir, "cur"))
950 if not stat.S_ISDIR(mode):
951 sys.stderr.write("Broken maildir: %s\n" %(maildir))
953 mode = os.stat(os.path.join(maildir, "tmp"))[stat.ST_MODE]
955 os.mkdir(os.path.join(maildir, "tmp"))
956 if not stat.S_ISDIR(mode):
957 sys.stderr.write("Broken maildir: %s\n" %(maildir))
959 mode = os.stat(os.path.join(maildir, "new"))[stat.ST_MODE]
960 if not stat.S_ISDIR(mode):
961 sys.stderr.write("Broken maildir: %s\n" %(maildir))
963 os.mkdir(os.path.join(maildir, "new"))
965 sys.stderr.write("Broken maildir: %s\n" %(maildir))
970 sys.stderr.write("Couldn't create root maildir %s\n" \
974 os.mkdir(os.path.join(maildir, "new"))
975 os.mkdir(os.path.join(maildir, "cur"))
976 os.mkdir(os.path.join(maildir, "tmp"))
979 "Couldn't create required maildir directories for %s\n" \
983 # right - we've got the directories, we've got the section, we know the
986 parse_and_deliver(maildir, section, state_dir)