4 # rss2maildir.py - RSS feeds to Maildir 1 email per item
5 # Copyright (C) 2007 Brett Parker <iDunno@sommitrealweird.co.uk>
7 # This program is free software: you can redistribute it and/or modify
8 # it under the terms of the GNU General Public License as published by
9 # the Free Software Foundation, either version 3 of the License, or
10 # (at your option) any later version.
12 # This program is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 # GNU General Public License for more details.
17 # You should have received a copy of the GNU General Public License
18 # along with this program. If not, see <http://www.gnu.org/licenses/>.
28 from email.MIMEMultipart import MIMEMultipart
29 from email.MIMEText import MIMEText
38 from optparse import OptionParser
39 from ConfigParser import SafeConfigParser
41 from base64 import b64encode
47 from HTMLParser import HTMLParser
49 class HTML2Text(HTMLParser):
# Converts an HTML fragment to plain text in a reStructuredText-like style:
# underlined headings, indented/bulleted lists, `anchor text`__ link
# references collected in self.urls, and |alt| image substitutions collected
# in self.images.  Python 2 code (u'' literals, has_key, unichr).
# NOTE(review): this chunk is elided — the original line numbers fused into
# each line skip values, so state initialisers (self.opentags, self.curdata,
# self.text, self.urls, self.images, self.listcount, self.indentlevel,
# self.blockleveltags, self.liststarttags, self.cancontainflow, the entities
# table) and several statements are not visible here.  Comments below only
# describe what the visible code shows; confirm against the full file.
207     def __init__(self,textwidth=70):
# textwidth: column to wrap output text at (default 70).
210         self.textwidth = textwidth
# ignorenodata suppresses the "blank line before a new block" logic once,
# e.g. right after a <br> already emitted a newline.
213         self.ignorenodata = False
# remaining parser state is initialised on lines elided from this view
217         HTMLParser.__init__(self)
219     def handle_starttag(self, tag, attrs):
# Opening tag: flush any pending text for the enclosing element, adjust
# list/definition indentation, and push the tag onto self.opentags.
220         tag_name = tag.lower()
221         if tag_name in self.blockleveltags:
222             # handle starting a new block - unless we're in a block element
223             # that can contain other blocks, we'll assume that we want to close
# (continuation of the comment above is elided in this view)
225             if len(self.opentags) > 1 and self.opentags[-1] == u'li':
226                 self.handle_curdata()
228             if tag_name == u'ol':
# ordered list: start a fresh item counter; nesting depth tracked by
# the length of self.listcount
229                 self.handle_curdata()
230                 self.listcount.append(1)
231                 self.listlevel = len(self.listcount) - 1
233             if tag_name == u'dl':
234                 self.indentlevel = self.indentlevel + 4
236             if tag_name in self.liststarttags:
# a new list nested inside an existing one indents further:
# 4 columns under dl/ol, 3 under ul
237                 smallist = self.opentags[-3:-1]
239                 for prev_listtag in smallist:
240                     if prev_listtag in [u'dl', u'ol']:
241                         self.indentlevel = self.indentlevel + 4
# a break statement here appears to be elided
243                     elif prev_listtag == u'ul':
244                         self.indentlevel = self.indentlevel + 3
247             if len(self.opentags) > 0:
248                 self.handle_curdata()
249             if tag_name not in self.cancontainflow:
# elided branch; presumably pops/limits self.opentags — TODO confirm
251             self.opentags.append(tag_name)
# the branch below is inside an elided else/elif chain for inline tags
253             if tag_name == "span":
# span carries no formatting of its own; body elided here
257                 listcount = self.listcount[-1]
261             if tag_name == u'dd' and len(self.opentags) > 1 \
262                 and self.opentags[-1] == u'dt':
263                 self.handle_curdata()
265             elif tag_name == u'dt' and len(self.opentags) > 1 \
266                 and self.opentags[-1] == u'dd':
267                 self.handle_curdata()
269             elif tag_name == u'a':
# NOTE(review): a "for attr in attrs:" loop header is elided before the
# next line — attr is otherwise unbound here
271                 if attr[0].lower() == u'href':
272                     self.urls.append(attr[1].decode('utf-8'))
# open the rST link markup; the closing `__ is appended when the
# matching </a> is handled in handle_curdata
273                 self.curdata = self.curdata + u'`'
274                 self.opentags.append(tag_name)
276             elif tag_name == u'img':
277                 self.handle_image(attrs)
279             elif tag_name == u'br':
# br handling elided in this view
283             # we don't know the tag, so lets avoid handling it!
286     def handle_startendtag(self, tag, attrs):
# Self-closing tag (<br/>, <img/>).  For <br>, flush, mark a line break
# via a transient u'br' opentag, and flush again.
287         if tag.lower() == u'br':
289         elif tag.lower() == u'img':
290             self.handle_image(attrs)
# the lines below belong to an elided helper (appears to be a
# handle_br-style method) — indentation/ownership not visible here
294         self.handle_curdata()
295         self.opentags.append(u'br')
296         self.handle_curdata()
299     def handle_image(self, attrs):
# Record an image as an rST substitution reference.  Keyed by alt text
# when present (disambiguating duplicate alts), else by URL.
# NOTE(review): the "for attr in attrs:" loop header and the alt/url
# defaults are elided before the next line.
304                 alt = attr[1].decode('utf-8')
305             elif attr[0] == 'src':
306                 url = attr[1].decode('utf-8')
# elided: branch on whether alt text was found
309             if self.images.has_key(alt):
310                 if self.images[alt]["url"] == url:
# same alt + same url: reuse the existing |alt| substitution
311                     self.curdata = self.curdata \
# elided: else — uniquify the alt key before registering
314                     while self.images.has_key(alt):
316                     self.images[alt]["url"] = url
317                     self.curdata = self.curdata \
320                 self.images[alt] = {}
321                 self.images[alt]["url"] = url
322                 self.curdata = self.curdata \
# no alt text: key the substitution by the image URL instead
325             if self.images.has_key(url):
326                 self.curdata = self.curdata \
329                 self.images[url] = {}
330                 self.images[url]["url"] =url
331                 self.curdata = self.curdata \
334     def handle_curdata(self):
# Flush self.curdata into self.text, formatted according to the
# innermost open tag (wrapping, indentation, heading underlines, list
# markers, blockquote prefixes).  No-op when nothing is pending.
336         if len(self.opentags) == 0:
# elided: early return
339         tag_thats_done = self.opentags[-1]
341         if len(self.curdata) == 0:
# elided: early return
344         if tag_thats_done == u'br':
# a <br> emits exactly one newline, and suppresses the usual
# blank-line-before-block logic for the next flush
345             if len(self.text) == 0 or self.text[-1] != '\n':
346                 self.text = self.text + '\n'
347             self.ignorenodata = True
# elided: return
350         if len(self.curdata.strip()) == 0:
# elided: whitespace-only data — return
353         if tag_thats_done in self.blockleveltags:
# decide whether a separating newline is needed before this block
354             newlinerequired = self.text != u''
355             if self.ignorenodata:
356                 newlinerequired = False
357             self.ignorenodata = False
359                 if tag_thats_done in [u'dt', u'dd', u'li'] \
360                     and len(self.text) > 1 \
361                     and self.text[-1] != u'\n':
# list-ish items are separated by a single newline...
362                     self.text = self.text + u'\n'
363                 elif len(self.text) > 2 \
364                     and self.text[-1] != u'\n' \
365                     and self.text[-2] != u'\n':
# ...other blocks by a blank line
366                     self.text = self.text + u'\n\n'
368         if tag_thats_done in ["h1", "h2", "h3", "h4", "h5", "h6"]:
# headings: collapse whitespace, wrap, then underline with a char
# chosen by heading level (rST style)
371             headingtext = " ".join(self.curdata.split())
372             seperator = u'\n' + u' '*self.indentlevel
373             headingtext = seperator.join( \
376                     self.textwidth - self.indentlevel \
# elided: underlinechar assignment for h1
380             if tag_thats_done == u'h2':
382             elif tag_thats_done != u'h1':
385             if u'\n' in headingtext:
# wrapped heading: underline the full text width
386                 underline = u' ' * self.indentlevel \
387                     + underlinechar * (self.textwidth - self.indentlevel)
389                 underline = u' ' * self.indentlevel \
390                     + underlinechar * len(headingtext)
391             self.text = self.text \
392                 + headingtext + u'\n' \
394         elif tag_thats_done in [u'p', u'div']:
# paragraphs: whitespace-collapsed, wrapped, indented
395             paragraph = unicode( \
396                 " ".join(self.curdata.strip().encode("utf-8").split()), \
398             seperator = u'\n' + u' ' * self.indentlevel
399             self.text = self.text \
400                 + u' ' * self.indentlevel \
403                     paragraph, self.textwidth - self.indentlevel))
404         elif tag_thats_done == "pre":
# preformatted text is passed through verbatim — no wrapping
405             self.text = self.text + unicode( \
406                 self.curdata.encode("utf-8"), "utf-8")
407         elif tag_thats_done == u'blockquote':
# blockquotes get a "> " prefix on every wrapped line
409                 " ".join(self.curdata.encode("utf-8").strip().split()), \
411             seperator = u'\n' + u' ' * self.indentlevel + u'> '
412             if len(self.text) > 0 and self.text[-1] != u'\n':
413                 self.text = self.text + u'\n'
414             self.text = self.text \
419                         self.textwidth - self.indentlevel - 2 \
423         elif tag_thats_done == "li":
424             item = unicode(self.curdata.encode("utf-8").strip(), "utf-8")
425             if len(self.text) > 0 and self.text[-1] != u'\n':
426                 self.text = self.text + u'\n'
427             # work out if we're in an ol rather than a ul
428             latesttags = self.opentags[-4:]
# elided: scan latesttags for the nearest list tag
431             for thing in latesttags:
# ordered list: numeric " NN. " marker, then bump the counter
445                 listmarker = u' %2d. ' %(self.listcount[-1])
446                 self.listcount[-1] = self.listcount[-1] + 1
449                     + u' ' * self.indentlevel \
451             self.text = self.text \
452                 + u' ' * self.indentlevel \
457                         self.textwidth - self.indentlevel - listindent \
461         elif tag_thats_done == u'dt':
# definition term: "term::" with hanging-indent continuation lines
462             definition = unicode(" ".join( \
463                 self.curdata.encode("utf-8").strip().split()), \
465             if len(self.text) > 0 and self.text[-1] != u'\n':
466                 self.text = self.text + u'\n\n'
467             elif len(self.text) > 1 and self.text[-2] != u'\n':
468                 self.text = self.text + u'\n'
469             definition = u' ' * (self.indentlevel - 4) + definition + "::"
470             indentstring = u'\n' + u' ' * (self.indentlevel - 3)
471             self.text = self.text \
473                     textwrap.wrap(definition, \
474                         self.textwidth - self.indentlevel - 4))
476         elif tag_thats_done == u'dd':
# definition body: wrapped at the current indent level
477             definition = unicode(" ".join( \
478                 self.curdata.encode("utf-8").strip().split()),
480             if len(definition) > 0:
481                 if len(self.text) > 0 and self.text[-1] != u'\n':
482                     self.text = self.text + u'\n'
483                 indentstring = u'\n' + u' ' * self.indentlevel
484                 self.text = self.text \
486                     + indentstring.join( \
489                             self.textwidth - self.indentlevel \
493         elif tag_thats_done == u'a':
# close the rST link markup opened in handle_starttag: `text`__
494             self.curdata = self.curdata + u'`__'
496         elif tag_thats_done in self.liststarttags:
# elided body
499         if tag_thats_done in self.blockleveltags:
# elided: reset self.curdata
502         self.ignorenodata = False
504     def handle_endtag(self, tag):
# Closing tag: flush pending data, unwind indentation for list
# containers, and pop self.opentags back past the matched tag.
505         self.ignorenodata = False
# elided: guard for a tag that was never opened (ValueError path)
510             tagindex = self.opentags.index(tag)
515         if tag in [u'br', u'img']:
# elided: return — these never sit on the opentags stack
# elided: dd handling; next line undoes the <dl> indent
519             self.indentlevel = self.indentlevel - 4
521         if tag in self.liststarttags:
522             if tag in [u'ol', u'dl', u'ul', u'dd']:
523                 self.handle_curdata()
524                 # find if there was a previous list level
525                 smalllist = self.opentags[:-1]
527                 for prev_listtag in smalllist:
528                     if prev_listtag in [u'ol', u'dl']:
529                         self.indentlevel = self.indentlevel - 4
# elided: break
531                     elif prev_listtag == u'ul':
532                         self.indentlevel = self.indentlevel - 3
# elided: break; and closing an ol drops its item counter
536                 self.listcount = self.listcount[:-1]
# if the same tag was opened again later, match the innermost one
538         while tagindex < len(self.opentags) \
539             and tag in self.opentags[tagindex+1:]:
541                 tagindex = self.opentags.index(tag, tagindex+1)
543             # well, we don't want to do that then
# elided: handling for the failed re-index
545         if tagindex != len(self.opentags) - 1:
546             # Assuming the data was for the last opened tag first
547             self.handle_curdata()
548             # Now kill the list to be a slice before this tag was opened
549             self.opentags = self.opentags[:tagindex + 1]
551             self.handle_curdata()
552             if self.opentags[-1] == tag:
# elided: pop the matched tag
555     def handle_data(self, data):
# Character data: implicitly open a paragraph if nothing is open,
# then accumulate.  Assumes byte input is UTF-8 (Python 2 str).
556         if len(self.opentags) == 0:
557             self.opentags.append(u'p')
558         self.curdata = self.curdata + data.decode("utf-8")
560     def handle_entityref(self, name):
# Entity reference: resolve via the class entities table (initialised
# on lines elided here), else try numeric &#NNN;, else keep verbatim.
562         if HTML2Text.entities.has_key(name):
563             entity = HTML2Text.entities[name]
565             entity = unichr(int(name[1:]))
567             entity = "&" + name + ";"
569         self.curdata = self.curdata + unicode(entity.encode('utf-8'), \
# NOTE(review): the "def gettext(self):" header appears to be elided
# before the next line.  It flushes remaining data, trims trailing
# newlines to one, then appends the collected link targets ("__ url")
# and image substitution definitions (".. |alt| image:: url").
573         self.handle_curdata()
574         if len(self.text) == 0 or self.text[-1] != u'\n':
575             self.text = self.text + u'\n'
577         if len(self.text) > 0:
578             while len(self.text) > 1 and self.text[-1] == u'\n':
579                 self.text = self.text[:-1]
580             self.text = self.text + u'\n'
581             if len(self.urls) > 0:
582                 self.text = self.text + u'\n__ ' + u'\n__ '.join(self.urls) + u'\n'
584             if len(self.images.keys()) > 0:
585                 self.text = self.text + u'\n.. ' \
587                     ["|%s| image:: %s" %(a, self.images[a]["url"]) \
588                         for a in self.images.keys()]) + u'\n'
592 def open_url(method, url):
# Issue an HTTP request (method = "HEAD" or "GET" as used below) against
# url, following up to 3 redirects (301/302/303/307) by re-entering the
# loop with the Location header's value.  On 200 the httplib response
# object is returned so the caller can read headers and body.
# NOTE(review): the "redirectcount = 0" initialiser, the url reassignment
# on redirect, the success/failure returns, and the port default are on
# elided lines — confirm against the full file.
594     while redirectcount < 3:
# split url into scheme, host[:port] and path (Python 2 urllib helpers)
595         (type, rest) = urllib.splittype(url)
596         (host, path) = urllib.splithost(rest)
597         (host, port) = urllib.splitport(host)
601         conn = httplib.HTTPConnection("%s:%s" %(host, port))
602         conn.request(method, path)
603         response = conn.getresponse()
604         if response.status in [301, 302, 303, 307]:
# redirect: find the new location and loop again
605             headers = response.getheaders()
606             for header in headers:
607                 if header[0] == "location":
# elided: url = header[1] (presumably) — TODO confirm
609         elif response.status == 200:
# elided: return response
613         redirectcount = redirectcount + 1
# elided: fall-through return (None) after too many redirects
616 def parse_and_deliver(maildir, url, statedir):
# Fetch the RSS/Atom feed at url, deliver each previously-unseen item as a
# multipart (text+html) message into maildir, and persist freshness and
# dedupe state in two dbm databases under statedir ("feeds" and "seen").
# NOTE(review): this chunk is elided — several branches/returns and the
# urlencode closing parens are on lines not visible here.
619     # first check if we know about this feed already
620     feeddb = dbm.open(os.path.join(statedir, "feeds"), "c")
621     if feeddb.has_key(url):
# known feed: stored state is a urlencoded querystring of the last
# response's validator headers; HEAD the feed and compare
623         data = cgi.parse_qs(data)
624         response = open_url("HEAD", url)
627             headers = response.getheaders()
# any changed validator header marks the feed as updated
630             for header in headers:
631                 if header[0] == "content-length":
632                     if header[1] != data["content-length"][0]:
634                 elif header[0] == "etag":
635                     if header[1] != data["etag"][0]:
637                 elif header[0] == "last-modified":
638                     if header[1] != data["last-modified"][0]:
640                 elif header[0] == "content-md5":
641                     if header[1] != data["content-md5"][0]:
# feed changed: re-fetch with GET
646                 response = open_url("GET", url)
648                     headers = response.getheaders()
649                     feedhandle = response
651                     sys.stderr.write("Failed to fetch feed: %s\n" %(url))
654                 return # don't need to do anything, nothings changed.
# unknown feed: plain GET
656         response = open_url("GET", url)
658             headers = response.getheaders()
659             feedhandle = response
661             sys.stderr.write("Failed to fetch feed: %s\n" %(url))
664     fp = feedparser.parse(feedhandle)
665     db = dbm.open(os.path.join(statedir, "seen"), "c")
666     for item in fp["items"]:
667         # have we seen it before?
668         # need to work out what the content is first...
670         if item.has_key("content"):
671             content = item["content"][0]["value"]
673             content = item["summary"]
# md5 of the content is the change detector for an item
675         md5sum = md5.md5(content.encode("utf-8")).hexdigest()
# prevmessageid initialiser appears to be on an elided line
679         # check if there's a guid too - if that exists and we match the md5,
# (continuation of comment elided)
681         if item.has_key("guid"):
682             if db.has_key(url + "|" + item["guid"]):
683                 data = db[url + "|" + item["guid"]]
684                 data = cgi.parse_qs(data)
685                 if data["contentmd5"][0] == md5sum:
# elided: continue — unchanged item, skip
# fall back to dedupe by link; remember the prior Message-ID so the
# new message can thread via References
688         if db.has_key(url + "|" + item["link"]):
689             data = db[url + "|" + item["link"]]
690             data = cgi.parse_qs(data)
691             if data.has_key("message-id"):
692                 prevmessageid = data["message-id"][0]
693             if data["contentmd5"][0] == md5sum:
# elided: continue
# elided: try/except around author lookup (default on elided line)
697             author = item["author"]
701         # create a basic email message
702         msg = MIMEMultipart("alternative")
# Message-ID: timestamp + 6 random chars + local hostname
704             + datetime.datetime.now().strftime("%Y%m%d%H%M") \
708                 string.ascii_letters + string.digits \
709                 ) for a in range(0,6) \
710             ]) + "@" + socket.gethostname() + ">"
711         msg.add_header("Message-ID", messageid)
712         msg.set_unixfrom("\"%s\" <rss2maildir@localhost>" %(url))
713         msg.add_header("From", "\"%s\" <rss2maildir@localhost>" %(author))
714         msg.add_header("To", "\"%s\" <rss2maildir@localhost>" %(url))
# thread updates of the same item under the original message
716             msg.add_header("References", prevmessageid)
717         createddate = datetime.datetime.now() \
718             .strftime("%a, %e %b %Y %T -0000")
# prefer the feed's own updated timestamp when parseable
720             createddate = datetime.datetime(*item["updated_parsed"][0:6]) \
721                 .strftime("%a, %e %b %Y %T -0000")
724         msg.add_header("Date", createddate)
# run the item title through HTML2Text to strip markup/entities
725         subj_gen = HTML2Text()
726         subj_gen.feed(item["title"].encode("utf-8"))
727         msg.add_header("Subject", subj_gen.gettext())
728         msg.set_default_type("text/plain")
730         htmlcontent = content.encode("utf-8")
731         htmlcontent = "%s\n\n<p>Item URL: <a href='%s'>%s</a></p>" %( \
735         htmlpart = MIMEText(htmlcontent.encode("utf-8"), "html", "utf-8")
# plain-text alternative via HTML2Text
736         textparser = HTML2Text()
737         textparser.feed(content.encode("utf-8"))
738         textcontent = textparser.gettext()
739         textcontent = "%s\n\nItem URL: %s" %( \
742         textpart = MIMEText(textcontent.encode("utf-8"), "plain", "utf-8")
# elided: msg.attach(...) calls
746         # start by working out the filename we should be writting to, we do
747         # this following the normal maildir style rules
748         fname = str(os.getpid()) \
749             + "." + socket.gethostname() \
752                 string.ascii_letters + string.digits \
753                 ) for a in range(0,10) \
755             + datetime.datetime.now().strftime('%s')
756         fn = os.path.join(maildir, "tmp", fname)
# elided: fh = open(fn, "w"); close after write
758         fh.write(msg.as_string())
760         # now move it in to the new directory
761         newfn = os.path.join(maildir, "new", fname)
# elided: os.link/os.unlink tmp->new move
765         # now add to the database about the item
# chain this Message-ID after the previous one for threading state
767             messageid = prevmessageid + " " + messageid
768         if item.has_key("guid") and item["guid"] != item["link"]:
# distinct guid: record full state under the guid key, and keep the
# link key's entry updated with the new message-id
769             data = urllib.urlencode(( \
770                 ("message-id", messageid), \
771                 ("created", createddate), \
772                 ("contentmd5", md5sum) \
774             db[url + "|" + item["guid"]] = data
# elided: try/except — update existing link entry, else store fresh
776                 data = db[url + "|" + item["link"]]
777                 data = cgi.parse_qs(data)
778                 newdata = urllib.urlencode(( \
779                     ("message-id", messageid), \
780                     ("created", data["created"][0]), \
781                     ("contentmd5", data["contentmd5"][0]) \
783                 db[url + "|" + item["link"]] = newdata
785                 db[url + "|" + item["link"]] = data
787             data = urllib.urlencode(( \
788                 ("message-id", messageid), \
789                 ("created", createddate), \
790                 ("contentmd5", md5sum) \
792             db[url + "|" + item["link"]] = data
# after delivering all items, save the feed's validator headers so the
# next run's HEAD check can detect changes
796     for header in headers:
798             ["content-md5", "etag", "last-modified", "content-length"]:
799             data.append((header[0], header[1]))
801     data = urllib.urlencode(data)
# elided: feeddb[url] = data; feeddb.close(); db.close()
807 if __name__ == "__main__":
808     # This only gets executed if we really called the program
809     # first off, parse the command line arguments
# Script entry point: locate a config file (-c, ~/.rss2maildir.conf, or
# /etc/rss2maildir.conf), resolve the state directory and maildir root,
# ensure each feed's maildir has cur/new/tmp, then deliver every feed.
# NOTE(review): elided chunk — sys.exit calls, try/except keywords and
# some else branches are on lines not visible here.
811     oparser = OptionParser()
813         "-c", "--conf", dest="conf",
814         help="location of config file"
817         "-s", "--statedir", dest="statedir",
818         help="location of directory to store state in"
821     (options, args) = oparser.parse_args()
823     # check for the configfile
827     if options.conf != None:
828         # does the file exist?
# os.stat is used as an existence probe (raises OSError if missing)
830             os.stat(options.conf)
831             configfile = options.conf
833             # should exit here as the specified file doesn't exist
835                 "Config file %s does not exist. Exiting.\n" %(options.conf,))
838         # check through the default locations
840             os.stat("%s/.rss2maildir.conf" %(os.environ["HOME"],))
841             configfile = "%s/.rss2maildir.conf" %(os.environ["HOME"],)
844                 os.stat("/etc/rss2maildir.conf")
845                 configfile = "/etc/rss2maildir.conf"
847                 sys.stderr.write("No config file found. Exiting.\n")
850     # Right - if we've got this far, we've got a config file, now for the hard
# (comment continuation elided)
853     scp = SafeConfigParser()
# elided: scp.read(configfile); default state_dir
856     maildir_root = "RSSMaildir"
# command-line -s overrides any configured state_dir
859     if options.statedir != None:
860         state_dir = options.statedir
862             mode = os.stat(state_dir)[stat.ST_MODE]
863             if not stat.S_ISDIR(mode):
865                     "State directory (%s) is not a directory\n" %(state_dir))
868             # try to make the directory
872                 sys.stderr.write("Couldn't create statedir %s" %(state_dir))
874     elif scp.has_option("general", "state_dir"):
875         new_state_dir = scp.get("general", "state_dir")
877             mode = os.stat(new_state_dir)[stat.ST_MODE]
878             if not stat.S_ISDIR(mode):
# NOTE(review): message uses state_dir, not new_state_dir — looks
# like a wrong-variable bug in the original; confirm upstream
880                     "State directory (%s) is not a directory\n" %(state_dir))
883                 state_dir = new_state_dir
887                 os.mkdir(new_state_dir)
888                 state_dir = new_state_dir
891                     "Couldn't create state directory %s\n" %(new_state_dir))
# no option given: validate/create the default state_dir
895             mode = os.stat(state_dir)[stat.ST_MODE]
896             if not stat.S_ISDIR(mode):
898                     "State directory %s is not a directory\n" %(state_dir))
905                     "State directory %s could not be created\n" %(state_dir))
908     if scp.has_option("general", "maildir_root"):
909         maildir_root = scp.get("general", "maildir_root")
912         mode = os.stat(maildir_root)[stat.ST_MODE]
913         if not stat.S_ISDIR(mode):
915                 "Maildir Root %s is not a directory\n" \
920             os.mkdir(maildir_root)
922             sys.stderr.write("Couldn't create Maildir Root %s\n" \
# every non-"general" config section is a feed URL
926     feeds = scp.sections()
928         feeds.remove("general")
932     for section in feeds:
933         # check if the directory exists
# maildir name: per-feed "maildir" option if set, else derived from
# the section (feed URL) via urlencode to get a filesystem-safe name
936             maildir = scp.get(section, "maildir")
940         maildir = urllib.urlencode(((section, maildir),)).split("=")[1]
941         maildir = os.path.join(maildir_root, maildir)
944             exists = os.stat(maildir)
945             if stat.S_ISDIR(exists[stat.ST_MODE]):
946                 # check if there's a new, cur and tmp directory
# each of cur/tmp/new: stat it, create it if missing, complain if
# present but not a directory
948                     mode = os.stat(os.path.join(maildir, "cur"))[stat.ST_MODE]
950                     os.mkdir(os.path.join(maildir, "cur"))
951                 if not stat.S_ISDIR(mode):
952                     sys.stderr.write("Broken maildir: %s\n" %(maildir))
954                     mode = os.stat(os.path.join(maildir, "tmp"))[stat.ST_MODE]
956                     os.mkdir(os.path.join(maildir, "tmp"))
957                 if not stat.S_ISDIR(mode):
958                     sys.stderr.write("Broken maildir: %s\n" %(maildir))
960                     mode = os.stat(os.path.join(maildir, "new"))[stat.ST_MODE]
961                     if not stat.S_ISDIR(mode):
962                         sys.stderr.write("Broken maildir: %s\n" %(maildir))
964                     os.mkdir(os.path.join(maildir, "new"))
966                 sys.stderr.write("Broken maildir: %s\n" %(maildir))
# maildir did not exist: create it and its three subdirectories
971                 sys.stderr.write("Couldn't create root maildir %s\n" \
975                 os.mkdir(os.path.join(maildir, "new"))
976                 os.mkdir(os.path.join(maildir, "cur"))
977                 os.mkdir(os.path.join(maildir, "tmp"))
980                     "Couldn't create required maildir directories for %s\n" \
984         # right - we've got the directories, we've got the section, we know the
# (comment continuation elided)
987         parse_and_deliver(maildir, section, state_dir)