4 # rss2maildir.py - RSS feeds to Maildir 1 email per item
5 # Copyright (C) 2007 Brett Parker <iDunno@sommitrealweird.co.uk>
7 # This program is free software: you can redistribute it and/or modify
8 # it under the terms of the GNU General Public License as published by
9 # the Free Software Foundation, either version 3 of the License, or
10 # (at your option) any later version.
12 # This program is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 # GNU General Public License for more details.
17 # You should have received a copy of the GNU General Public License
18 # along with this program. If not, see <http://www.gnu.org/licenses/>.
28 from email.MIMEMultipart import MIMEMultipart
29 from email.MIMEText import MIMEText
38 from optparse import OptionParser
39 from ConfigParser import SafeConfigParser
41 from base64 import b64encode
47 from HTMLParser import HTMLParser
class HTML2Text(HTMLParser):
    """Convert an HTML fragment to wrapped plain text.

    The output is reStructuredText-flavoured: headings are underlined,
    <a href=...> targets are collected in self.urls and rendered as
    anonymous `link`__ references, and <img> tags become |substitution|
    references collected in self.images; the link targets and image
    directives are appended to the text when it is finalised.

    NOTE(review): a number of interior lines of this class are missing
    from the view under review.  The code is reproduced exactly as
    seen, with comments added only; "(elided)" marks a known gap, so
    some constructs below are visibly incomplete.
    """

    def __init__(self,textwidth=70):
        # textwidth: column at which output paragraphs are wrapped.
        self.textwidth = textwidth
        # (elided: initialisation of the remaining parser state -
        # presumably opentags/curdata/text/urls/images/indentlevel/
        # listcount, which the methods below rely on - TODO confirm)
        # When True, the next block flush skips the usual blank-line
        # separation (set after <br/> already emitted its newline).
        self.ignorenodata = False
        HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        """Open *tag*: flush pending text and update list/indent state."""
        tag_name = tag.lower()
        if tag_name in self.blockleveltags:
            # handle starting a new block - unless we're in a block element
            # that can contain other blocks, we'll assume that we want to close
            if len(self.opentags) > 1 and self.opentags[-1] == u'li':
                self.handle_curdata()
            # (elided)
            if tag_name == u'ol':
                self.handle_curdata()
                # each nested <ol> gets its own item counter
                self.listcount.append(1)
                self.listlevel = len(self.listcount) - 1
            # (elided)
            if tag_name == u'dl':
                self.indentlevel = self.indentlevel + 4
            # (elided)
            if tag_name in self.liststarttags:
                smallist = self.opentags[-3:-1]
                # (elided)
                for prev_listtag in smallist:
                    if prev_listtag in [u'dl', u'ol']:
                        # ol/dl indent by 4 columns (" 1. " marker width)
                        self.indentlevel = self.indentlevel + 4
                        # (elided)
                    elif prev_listtag == u'ul':
                        # ul indents by 3 columns ("* " marker width)
                        self.indentlevel = self.indentlevel + 3
            # (elided)
            if len(self.opentags) > 0:
                self.handle_curdata()
                if tag_name not in self.cancontainflow:
                    # (elided)
            self.opentags.append(tag_name)
        # (elided)
        if tag_name == "span":
            # (elided)
        listcount = self.listcount[-1]
        # (elided)
        if tag_name == u'dd' and len(self.opentags) > 1 \
            and self.opentags[-1] == u'dt':
            self.handle_curdata()
            # (elided)
        elif tag_name == u'dt' and len(self.opentags) > 1 \
            and self.opentags[-1] == u'dd':
            self.handle_curdata()
            # (elided)
        elif tag_name == u'a':
            # (elided: iteration over attrs)
            if attr[0].lower() == u'href':
                self.urls.append(attr[1].decode('utf-8'))
            # open the reST `link`__ form; closed when the tag is flushed
            self.curdata = self.curdata + u'`'
            self.opentags.append(tag_name)
            # (elided)
        elif tag_name == u'img':
            self.handle_image(attrs)
            # (elided)
        elif tag_name == u'br':
            # (elided)
            # we don't know the tag, so lets avoid handling it!

    def handle_startendtag(self, tag, attrs):
        """Handle self-closing tags (<br/>, <img .../>)."""
        if tag.lower() == u'br':
            # (elided)
        elif tag.lower() == u'img':
            self.handle_image(attrs)
            # (elided)

    # NOTE(review): the 'def' line owning the next three statements is
    # missing from this view (presumably a <br> helper - TODO confirm):
        self.handle_curdata()
        self.opentags.append(u'br')
        self.handle_curdata()
        # (elided)

    def handle_image(self, attrs):
        """Record an <img> as a |substitution| reference in curdata.

        The substitution name is the alt text when present (made unique
        on collision), otherwise the URL itself; the matching
        ".. |name| image:: url" directives are emitted later.
        """
        # (elided: extraction of alt/src from attrs)
        alt = attr[1].decode('utf-8')
        elif attr[0] == 'src':
            url = attr[1].decode('utf-8')
        # (elided)
        if self.images.has_key(alt):
            if self.images[alt]["url"] == url:
                self.curdata = self.curdata \
                # (elided: + the |alt| reference)
            while self.images.has_key(alt):
                # (elided: suffix alt until unique)
            self.images[alt] = {"url": url}
            self.curdata = self.curdata \
            # (elided)
        self.images[alt] = {"url": url}
        self.curdata = self.curdata \
        # (elided)
        if self.images.has_key(url):
            self.curdata = self.curdata \
            # (elided)
        self.images[url] = {}
        self.images[url]["url"] =url
        self.curdata = self.curdata \
        # (elided)

    def handle_curdata(self):
        """Flush self.curdata into self.text, formatted according to the
        innermost open tag (wrapping, heading underlines, list markers,
        blockquote indents, definition-list layout)."""
        if len(self.opentags) == 0:
            # (elided: nothing open - nothing to flush)
        tag_thats_done = self.opentags[-1]
        if len(self.curdata) == 0:
            # (elided)
        if tag_thats_done == u'br':
            # an explicit <br/> forces exactly one newline
            if len(self.text) == 0 or self.text[-1] != '\n':
                self.text = self.text + '\n'
            self.ignorenodata = True
            # (elided)
        if len(self.curdata.strip()) == 0:
            # (elided)
        if tag_thats_done in self.blockleveltags:
            # block elements get a blank-line separator, unless a <br/>
            # already provided the break (ignorenodata)
            newlinerequired = self.text != u''
            if self.ignorenodata:
                newlinerequired = False
            self.ignorenodata = False
            # (elided)
            if tag_thats_done in [u'dt', u'dd', u'li'] \
                and len(self.text) > 1 \
                and self.text[-1] != u'\n':
                self.text = self.text + u'\n'
            elif len(self.text) > 2 \
                and self.text[-1] != u'\n' \
                and self.text[-2] != u'\n':
                self.text = self.text + u'\n\n'
        if tag_thats_done in ["h1", "h2", "h3", "h4", "h5", "h6"]:
            # headings: collapse whitespace, wrap, then underline
            headingtext = " ".join(self.curdata.split())
            seperator = u'\n' + u' '*self.indentlevel
            headingtext = seperator.join( \
                # (elided: textwrap.wrap call over headingtext)
                self.textwidth - self.indentlevel \
                # (elided)
            if tag_thats_done == u'h2':
                # (elided: underline character for h2)
            elif tag_thats_done != u'h1':
                # (elided: underline character for h3..h6)
            if u'\n' in headingtext:
                # wrapped heading: underline the full text width
                underline = u' ' * self.indentlevel \
                    + underlinechar * (self.textwidth - self.indentlevel)
            # (elided: else - underline only as wide as the heading)
                underline = u' ' * self.indentlevel \
                    + underlinechar * len(headingtext)
            self.text = self.text \
                + headingtext + u'\n' \
                # (elided: + underline)
        elif tag_thats_done in [u'p', u'div']:
            # paragraphs: collapse whitespace, wrap to textwidth
            paragraph = unicode( \
                " ".join(self.curdata.strip().encode("utf-8").split()), \
                # (elided: "utf-8" codec argument)
            seperator = u'\n' + u' ' * self.indentlevel
            self.text = self.text \
                + u' ' * self.indentlevel \
                # (elided: + seperator.join(textwrap.wrap()
                paragraph, self.textwidth - self.indentlevel))
        elif tag_thats_done == "pre":
            # preformatted content is copied through without rewrapping
            self.text = self.text + unicode( \
                self.curdata.encode("utf-8"), "utf-8")
        elif tag_thats_done == u'blockquote':
            # (elided)
                " ".join(self.curdata.encode("utf-8").strip().split()), \
            # (elided)
            seperator = u'\n' + u' ' * self.indentlevel + u' '
            if len(self.text) > 0 and self.text[-1] != u'\n':
                self.text = self.text + u'\n'
            self.text = self.text \
                # (elided: indented, wrapped quote body)
                self.textwidth - self.indentlevel - 2 \
                # (elided)
        elif tag_thats_done == "li":
            item = unicode(self.curdata.encode("utf-8").strip(), "utf-8")
            if len(self.text) > 0 and self.text[-1] != u'\n':
                self.text = self.text + u'\n'
            # work out if we're in an ol rather than a ul
            latesttags = self.opentags[-4:]
            # (elided)
            for thing in latesttags:
                # (elided)
            # numbered marker uses (and advances) the current counter
            listmarker = u' %2d. ' %(self.listcount[-1])
            self.listcount[-1] = self.listcount[-1] + 1
            # (elided)
                + u' ' * self.indentlevel \
            # (elided)
            self.text = self.text \
                + u' ' * self.indentlevel \
                # (elided: + marker + wrapped item text)
                self.textwidth - self.indentlevel - listindent \
                # (elided)
        elif tag_thats_done == u'dt':
            # definition term: rendered as "term::" with hanging indent
            definition = unicode(" ".join( \
                self.curdata.encode("utf-8").strip().split()), \
                # (elided)
            if len(self.text) > 0 and self.text[-1] != u'\n':
                self.text = self.text + u'\n\n'
            elif len(self.text) > 1 and self.text[-2] != u'\n':
                self.text = self.text + u'\n'
            definition = u' ' * (self.indentlevel - 4) + definition + "::"
            indentstring = u'\n' + u' ' * (self.indentlevel - 3)
            self.text = self.text \
                # (elided: + indentstring.join()
                textwrap.wrap(definition, \
                    self.textwidth - self.indentlevel - 4))
        elif tag_thats_done == u'dd':
            definition = unicode(" ".join( \
                self.curdata.encode("utf-8").strip().split()),
                # (elided)
            if len(definition) > 0:
                if len(self.text) > 0 and self.text[-1] != u'\n':
                    self.text = self.text + u'\n'
                indentstring = u'\n' + u' ' * self.indentlevel
                self.text = self.text \
                    + indentstring.join( \
                    # (elided: textwrap.wrap(definition,)
                    self.textwidth - self.indentlevel \
                    # (elided)
        elif tag_thats_done == u'a':
            # close the anonymous reST reference opened in handle_starttag
            self.curdata = self.curdata + u'`__'
            # (elided)
        elif tag_thats_done in self.liststarttags:
            # (elided)
        if tag_thats_done in self.blockleveltags:
            # (elided: curdata has been consumed for this block)
        self.ignorenodata = False
        # (elided)

    def handle_endtag(self, tag):
        """Close *tag*: flush text attributed to it and unwind
        self.opentags (tolerating mis-nested input HTML)."""
        self.ignorenodata = False
        # (elided)
        tagindex = self.opentags.index(tag)
        # (elided)
        if tag in [u'br', u'img']:
            # (elided: these are fully handled at start-tag time)
        # (elided: presumably "if tag == u'dl':" - TODO confirm)
        self.indentlevel = self.indentlevel - 4
        # (elided)
        if tag in self.liststarttags:
            if tag in [u'ol', u'dl', u'ul', u'dd']:
                self.handle_curdata()
                # find if there was a previous list level
                smalllist = self.opentags[:-1]
                # (elided)
                for prev_listtag in smalllist:
                    if prev_listtag in [u'ol', u'dl']:
                        self.indentlevel = self.indentlevel - 4
                        # (elided)
                    elif prev_listtag == u'ul':
                        self.indentlevel = self.indentlevel - 3
                        # (elided)
        # (elided: drop the finished list's item counter)
        self.listcount = self.listcount[:-1]
        # (elided)
        while tagindex < len(self.opentags) \
            and tag in self.opentags[tagindex+1:]:
            # prefer the innermost matching open tag
            tagindex = self.opentags.index(tag, tagindex+1)
        # (elided)
        # well, we don't want to do that then
        # (elided)
        if tagindex != len(self.opentags) - 1:
            # Assuming the data was for the last opened tag first
            self.handle_curdata()
            # Now kill the list to be a slice before this tag was opened
            self.opentags = self.opentags[:tagindex + 1]
        # (elided)
        self.handle_curdata()
        if self.opentags[-1] == tag:
            # (elided: pop the matched tag)

    def handle_data(self, data):
        """Accumulate character data, opening an implicit <p> when no
        tag is open yet."""
        if len(self.opentags) == 0:
            self.opentags.append(u'p')
        self.curdata = self.curdata + data.decode("utf-8")

    def handle_charref(self, name):
        """Append a numeric character reference (&#NNN;) as text."""
        entity = unichr(int(name))
        self.curdata = self.curdata + unicode(entity.encode('utf-8'), \
        # (elided: codec argument)

    def handle_entityref(self, name):
        """Append a named entity; unknown names are kept literally as
        "&name;"."""
        # (elided)
        if HTML2Text.entities.has_key(name):
            entity = HTML2Text.entities[name]
        # (elided: else)
            entity = "&" + name + ";"
        # (elided)
        self.curdata = self.curdata + unicode(entity.encode('utf-8'), \
        # (elided: codec argument)

    # NOTE(review): the 'def' line owning the remaining statements is
    # missing from this view (presumably gettext(), which finalises and
    # returns self.text - TODO confirm):
        self.handle_curdata()
        if len(self.text) == 0 or self.text[-1] != u'\n':
            self.text = self.text + u'\n'
        # (elided)
        if len(self.text) > 0:
            # trim trailing blank lines down to one final newline
            while len(self.text) > 1 and self.text[-1] == u'\n':
                self.text = self.text[:-1]
            self.text = self.text + u'\n'
        if len(self.urls) > 0:
            # emit anonymous link targets for all `...`__ references
            self.text = self.text + u'\n__ ' + u'\n__ '.join(self.urls) + u'\n'
            # (elided)
        if len(self.images.keys()) > 0:
            # emit ".. |name| image:: url" directives for all images
            self.text = self.text + u'\n.. ' \
                # (elided: + u'\n.. '.join()
                ["|%s| image:: %s" %(a, self.images[a]["url"]) \
                    for a in self.images.keys()]) + u'\n'
            # (elided)
def open_url(method, url):
    """Perform an HTTP request for *url* with the given *method*
    ("GET" or "HEAD"), following up to three redirects.

    Returns the httplib response object on a 200 reply, or None when
    the request fails, keeps redirecting, or returns any other status.
    """
    redirectcount = 0
    while redirectcount < 3:
        (type, rest) = urllib.splittype(url)
        (host, path) = urllib.splithost(rest)
        (host, port) = urllib.splitport(host)
        if port == None:
            # splitport() yields None when the URL has no explicit
            # port - default to the standard HTTP port so the
            # "host:port" connection string below is always valid
            port = 80
        try:
            conn = httplib.HTTPConnection("%s:%s" %(host, port))
            conn.request(method, path)
            response = conn.getresponse()
        except socket.error:
            # connection-level failure: treat feed as unfetchable
            return None
        if response.status in [301, 302, 303, 307]:
            # follow the redirect target and retry
            headers = response.getheaders()
            for header in headers:
                if header[0] == "location":
                    url = header[1]
                    break
        elif response.status == 200:
            return response
        else:
            # any other status (404, 500, ...) is a hard failure
            return None
        redirectcount = redirectcount + 1
    # too many redirects
    return None
def parse_and_deliver(maildir, url, statedir):
    """Fetch the feed at *url* and deliver each new or changed item as
    one email into the maildir at *maildir*.

    State is kept in two dbm databases under *statedir*: "feeds"
    (per-feed HTTP validators - ETag, Last-Modified, Content-Length,
    Content-MD5 - so unchanged feeds are skipped after a cheap HEAD
    request) and "seen" (per-item message-id / content-md5 records so
    items are not redelivered, and updates thread via References).

    NOTE(review): interior lines are missing from the view under
    review; the code is reproduced exactly as seen, with comments
    added only.  "(elided)" marks known gaps.
    """
    # first check if we know about this feed already
    feeddb = dbm.open(os.path.join(statedir, "feeds"), "c")
    if feeddb.has_key(url):
        # (elided)
        # stored validators are urlencoded - decode to a dict of lists
        data = cgi.parse_qs(data)
        # cheap HEAD request to see whether the feed changed at all
        response = open_url("HEAD", url)
        # (elided)
        headers = response.getheaders()
        # (elided)
        for header in headers:
            # compare each returned validator with the stored one
            if header[0] == "content-length":
                if header[1] != data["content-length"][0]:
                    # (elided: mark feed as changed)
            elif header[0] == "etag":
                if header[1] != data["etag"][0]:
                    # (elided)
            elif header[0] == "last-modified":
                if header[1] != data["last-modified"][0]:
                    # (elided)
            elif header[0] == "content-md5":
                if header[1] != data["content-md5"][0]:
                    # (elided)
        # (elided: if the feed changed, fetch it in full)
        response = open_url("GET", url)
        # (elided)
        headers = response.getheaders()
        feedhandle = response
        # (elided)
        sys.stderr.write("Failed to fetch feed: %s\n" %(url))
        # (elided)
        return # don't need to do anything, nothings changed.
    # (elided: unknown feed - fetch it unconditionally)
    response = open_url("GET", url)
    # (elided)
    headers = response.getheaders()
    feedhandle = response
    # (elided)
    sys.stderr.write("Failed to fetch feed: %s\n" %(url))
    # (elided)
    fp = feedparser.parse(feedhandle)
    db = dbm.open(os.path.join(statedir, "seen"), "c")
    for item in fp["items"]:
        # have we seen it before?
        # need to work out what the content is first...
        # (elided)
        if item.has_key("content"):
            content = item["content"][0]["value"]
        # (elided: else - fall back to the summary)
            content = item["summary"]
        # hash the content so changed items get redelivered
        md5sum = md5.md5(content.encode("utf-8")).hexdigest()
        # (elided)
        # check if there's a guid too - if that exists and we match the md5,
        # (elided)
        if item.has_key("guid"):
            if db.has_key(url + "|" + item["guid"]):
                data = db[url + "|" + item["guid"]]
                data = cgi.parse_qs(data)
                if data["contentmd5"][0] == md5sum:
                    # (elided: unchanged - skip this item)
        if db.has_key(url + "|" + item["link"]):
            data = db[url + "|" + item["link"]]
            data = cgi.parse_qs(data)
            if data.has_key("message-id"):
                # keep the prior message id so updates thread together
                prevmessageid = data["message-id"][0]
            if data["contentmd5"][0] == md5sum:
                # (elided: unchanged - skip this item)
        # (elided)
        author = item["author"]
        # (elided)
        # create a basic email message
        msg = MIMEMultipart("alternative")
        # (elided: messageid built from timestamp + random suffix)
            + datetime.datetime.now().strftime("%Y%m%d%H%M") \
        # (elided)
                string.ascii_letters + string.digits \
                ) for a in range(0,6) \
            ]) + "@" + socket.gethostname() + ">"
        msg.add_header("Message-ID", messageid)
        msg.set_unixfrom("\"%s\" <rss2maildir@localhost>" %(url))
        msg.add_header("From", "\"%s\" <rss2maildir@localhost>" %(author))
        msg.add_header("To", "\"%s\" <rss2maildir@localhost>" %(url))
        # (elided: only when the item was seen before)
        msg.add_header("References", prevmessageid)
        # default Date: now; overridden below by the item's own
        # updated timestamp when the feed provides one
        createddate = datetime.datetime.now() \
            .strftime("%a, %e %b %Y %T -0000")
        # (elided: try:)
        createddate = datetime.datetime(*item["updated_parsed"][0:6]) \
            .strftime("%a, %e %b %Y %T -0000")
        # (elided: except: pass)
        msg.add_header("Date", createddate)
        # render the item title to plain text for the Subject header
        subj_gen = HTML2Text()
        subj_gen.feed(item["title"].encode("utf-8"))
        msg.add_header("Subject", subj_gen.gettext())
        msg.set_default_type("text/plain")
        # (elided)
        htmlcontent = content.encode("utf-8")
        htmlcontent = "%s\n\n<p>Item URL: <a href='%s'>%s</a></p>" %( \
        # (elided: content and link arguments)
        htmlpart = MIMEText(htmlcontent.encode("utf-8"), "html", "utf-8")
        # plain-text alternative generated from the same HTML
        textparser = HTML2Text()
        textparser.feed(content.encode("utf-8"))
        textcontent = textparser.gettext()
        textcontent = "%s\n\nItem URL: %s" %( \
        # (elided: textcontent and link arguments)
        textpart = MIMEText(textcontent.encode("utf-8"), "plain", "utf-8")
        # (elided: attach both parts)
        # start by working out the filename we should be writting to, we do
        # this following the normal maildir style rules
        fname = str(os.getpid()) \
            + "." + socket.gethostname() \
        # (elided: random component)
                string.ascii_letters + string.digits \
                ) for a in range(0,10) \
        # (elided)
            + datetime.datetime.now().strftime('%s')
        fn = os.path.join(maildir, "tmp", fname)
        # (elided: open fn for writing)
        fh.write(msg.as_string())
        # (elided: close the file)
        # now move it in to the new directory
        newfn = os.path.join(maildir, "new", fname)
        # (elided: link/unlink per maildir delivery rules)
        # now add to the database about the item
        # (elided: only when the item was seen before)
        messageid = prevmessageid + " " + messageid
        if item.has_key("guid") and item["guid"] != item["link"]:
            # store state under the guid key, and keep the link key
            # pointing at the same thread/content information
            data = urllib.urlencode(( \
                ("message-id", messageid), \
                ("created", createddate), \
                ("contentmd5", md5sum) \
                # (elided: closing parens)
            db[url + "|" + item["guid"]] = data
            # (elided: try:)
            data = db[url + "|" + item["link"]]
            data = cgi.parse_qs(data)
            newdata = urllib.urlencode(( \
                ("message-id", messageid), \
                ("created", data["created"][0]), \
                ("contentmd5", data["contentmd5"][0]) \
                # (elided)
            db[url + "|" + item["link"]] = newdata
            # (elided: except - no prior link record)
            db[url + "|" + item["link"]] = data
        # (elided: else - no distinct guid, key by link only)
            data = urllib.urlencode(( \
                ("message-id", messageid), \
                ("created", createddate), \
                ("contentmd5", md5sum) \
                # (elided)
            db[url + "|" + item["link"]] = data
    # (elided)
    for header in headers:
        # remember the validators this feed sent, for the HEAD check
        # on the next run
        # (elided: if header[0] in \)
            ["content-md5", "etag", "last-modified", "content-length"]:
            data.append((header[0], header[1]))
    # (elided)
    data = urllib.urlencode(data)
    # (elided: store in feeddb and close both databases)
if __name__ == "__main__":
    # This only gets executed if we really called the program
    # first off, parse the command line arguments
    #
    # NOTE(review): interior lines are missing from the view under
    # review; code is reproduced exactly as seen, comments added only.
    # "(elided)" marks known gaps.
    oparser = OptionParser()
    # (elided: oparser.add_option( call wrapping the next two lines)
        "-c", "--conf", dest="conf",
        help="location of config file"
        # (elided)
    # (elided: oparser.add_option( call wrapping the next two lines)
        "-s", "--statedir", dest="statedir",
        help="location of directory to store state in"
        # (elided)
    (options, args) = oparser.parse_args()

    # check for the configfile
    # (elided)
    if options.conf != None:
        # does the file exist?
        # (elided: try:)
        os.stat(options.conf)
        configfile = options.conf
        # (elided: except OSError:)
        # should exit here as the specified file doesn't exist
        # (elided: sys.stderr.write( wrapping the next line)
            "Config file %s does not exist. Exiting.\n" %(options.conf,))
        # (elided: exit)
    # (elided: else:)
        # check through the default locations
        # (elided: try:)
        os.stat("%s/.rss2maildir.conf" %(os.environ["HOME"],))
        configfile = "%s/.rss2maildir.conf" %(os.environ["HOME"],)
        # (elided: except - fall back to the system-wide config)
        os.stat("/etc/rss2maildir.conf")
        configfile = "/etc/rss2maildir.conf"
        # (elided: except:)
        sys.stderr.write("No config file found. Exiting.\n")
        # (elided: exit)

    # Right - if we've got this far, we've got a config file, now for the hard
    # (elided: rest of comment; reading of configfile)
    scp = SafeConfigParser()
    # (elided)
    # default maildir root, relative to HOME, unless overridden below
    maildir_root = "RSSMaildir"
    # (elided)
    if options.statedir != None:
        # command-line state dir takes precedence over the config file
        state_dir = options.statedir
        # (elided: try:)
        mode = os.stat(state_dir)[stat.ST_MODE]
        if not stat.S_ISDIR(mode):
            # (elided: sys.stderr.write( wrapping the next line)
            "State directory (%s) is not a directory\n" %(state_dir))
            # (elided: exit)
        # (elided: except:)
        # try to make the directory
        # (elided: try: mkdir / except OSError:)
        sys.stderr.write("Couldn't create statedir %s" %(state_dir))
        # (elided: exit)
    elif scp.has_option("general", "state_dir"):
        new_state_dir = scp.get("general", "state_dir")
        # (elided: try:)
        mode = os.stat(new_state_dir)[stat.ST_MODE]
        if not stat.S_ISDIR(mode):
            # (elided: sys.stderr.write( wrapping the next line)
            "State directory (%s) is not a directory\n" %(state_dir))
            # (elided: exit)
        # (elided: else:)
        state_dir = new_state_dir
        # (elided: except - directory missing, create it)
        os.mkdir(new_state_dir)
        state_dir = new_state_dir
        # (elided: except OSError: sys.stderr.write( wrapping next line)
        "Couldn't create state directory %s\n" %(new_state_dir))
        # (elided: exit)
    # (elided: else - fall back to the default state dir; try:)
    mode = os.stat(state_dir)[stat.ST_MODE]
    if not stat.S_ISDIR(mode):
        # (elided: sys.stderr.write( wrapping the next line)
        "State directory %s is not a directory\n" %(state_dir))
        # (elided)
        "State directory %s could not be created\n" %(state_dir))
    # (elided)
    if scp.has_option("general", "maildir_root"):
        maildir_root = scp.get("general", "maildir_root")
    # (elided: try:)
    mode = os.stat(maildir_root)[stat.ST_MODE]
    if not stat.S_ISDIR(mode):
        # (elided: sys.stderr.write( wrapping the next line)
        "Maildir Root %s is not a directory\n" \
        # (elided: argument, exit; except - create it)
    os.mkdir(maildir_root)
    # (elided: except OSError:)
    sys.stderr.write("Couldn't create Maildir Root %s\n" \
    # (elided: argument, exit)
    # every config section other than [general] is a feed URL
    feeds = scp.sections()
    # (elided: try:)
    feeds.remove("general")
    # (elided)
    for section in feeds:
        # check if the directory exists
        # (elided: try:)
        maildir = scp.get(section, "maildir")
        # (elided: except - default the maildir name to the section)
        # urlencode the name so it is safe as a single path component
        maildir = urllib.urlencode(((section, maildir),)).split("=")[1]
        maildir = os.path.join(maildir_root, maildir)
        # (elided: try:)
        exists = os.stat(maildir)
        if stat.S_ISDIR(exists[stat.ST_MODE]):
            # check if there's a new, cur and tmp directory
            # (elided: try:)
            mode = os.stat(os.path.join(maildir, "cur"))[stat.ST_MODE]
            # (elided: except - create missing subdir)
            os.mkdir(os.path.join(maildir, "cur"))
            if not stat.S_ISDIR(mode):
                sys.stderr.write("Broken maildir: %s\n" %(maildir))
            # (elided: try:)
            mode = os.stat(os.path.join(maildir, "tmp"))[stat.ST_MODE]
            # (elided: except - create missing subdir)
            os.mkdir(os.path.join(maildir, "tmp"))
            if not stat.S_ISDIR(mode):
                sys.stderr.write("Broken maildir: %s\n" %(maildir))
            # (elided: try:)
            mode = os.stat(os.path.join(maildir, "new"))[stat.ST_MODE]
            if not stat.S_ISDIR(mode):
                sys.stderr.write("Broken maildir: %s\n" %(maildir))
            # (elided: except - create missing subdir)
            os.mkdir(os.path.join(maildir, "new"))
            # (elided)
            sys.stderr.write("Broken maildir: %s\n" %(maildir))
        # (elided: except - maildir missing entirely, create it)
        sys.stderr.write("Couldn't create root maildir %s\n" \
        # (elided: argument, exit; then create the three subdirs)
        os.mkdir(os.path.join(maildir, "new"))
        os.mkdir(os.path.join(maildir, "cur"))
        os.mkdir(os.path.join(maildir, "tmp"))
        # (elided: except OSError: sys.stderr.write( wrapping next line)
        "Couldn't create required maildir directories for %s\n" \
        # (elided: argument, exit)

        # right - we've got the directories, we've got the section, we know the
        # (elided: rest of comment)
        parse_and_deliver(maildir, section, state_dir)