4 # rss2maildir.py - RSS feeds to Maildir 1 email per item
5 # Copyright (C) 2007 Brett Parker <iDunno@sommitrealweird.co.uk>
7 # This program is free software: you can redistribute it and/or modify
8 # it under the terms of the GNU General Public License as published by
9 # the Free Software Foundation, either version 3 of the License, or
10 # (at your option) any later version.
12 # This program is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 # GNU General Public License for more details.
17 # You should have received a copy of the GNU General Public License
18 # along with this program. If not, see <http://www.gnu.org/licenses/>.
28 from email.MIMEMultipart import MIMEMultipart
29 from email.MIMEText import MIMEText
38 from optparse import OptionParser
39 from ConfigParser import SafeConfigParser
41 from base64 import b64encode
47 from HTMLParser import HTMLParser
class HTML2Text(HTMLParser):
    """Convert an HTML fragment into reStructuredText-flavoured plain text.

    Feed HTML in via HTMLParser.feed(); collect the wrapped, indented text
    afterwards (the accessor method's ``def`` line is elided from this view,
    but its body appears at the end of the class).  Links are emitted as
    reST anonymous-hyperlink targets and images as reST image substitutions.

    NOTE(review): this view of the file is elided.  Several class attributes
    referenced below (blockleveltags, liststarttags, cancontainflow,
    entities) and instance attributes (text, curdata, opentags, urls,
    images, indentlevel, listcount, listlevel) are defined on lines that
    are not shown here -- confirm against the full file.
    """

    def __init__(self,textwidth=70):
        # Column at which textwrap should wrap the generated output.
        self.textwidth = textwidth
        # When True, the next block element suppresses the blank line it
        # would normally emit (set after a <br/> has forced a newline).
        self.ignorenodata = False
        # NOTE(review): initialisation of the other per-instance state
        # (text, curdata, opentags, urls, images, listcount, indentlevel,
        # ...) is on lines elided from this view.
        HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        """Open *tag*: flush pending text and adjust indent/list state."""
        tag_name = tag.lower()
        if tag_name in self.blockleveltags:
            # handle starting a new block - unless we're in a block element
            # that can contain other blocks, we'll assume that we want to close
            if len(self.opentags) > 1 and self.opentags[-1] == u'li':
                self.handle_curdata()
            if tag_name == u'ol':
                self.handle_curdata()
                # New ordered-list level starts numbering at 1.
                self.listcount.append(1)
                self.listlevel = len(self.listcount) - 1
            if tag_name == u'dl':
                # Definition lists indent their contents by 4 columns.
                self.indentlevel = self.indentlevel + 4
            if tag_name in self.liststarttags:
                # Look at the two enclosing open tags to see whether we are
                # nested inside another list and need extra indentation.
                smallist = self.opentags[-3:-1]
                for prev_listtag in smallist:
                    if prev_listtag in [u'dl', u'ol']:
                        self.indentlevel = self.indentlevel + 4
                    elif prev_listtag == u'ul':
                        self.indentlevel = self.indentlevel + 3
            if len(self.opentags) > 0:
                self.handle_curdata()
                # NOTE(review): the body of this guard (presumably popping
                # the open tag when it cannot contain flow content) is
                # elided from this view -- confirm.
                if tag_name not in self.cancontainflow:
            self.opentags.append(tag_name)
        if tag_name == "span":
            # NOTE(review): surrounding branch structure elided; in the
            # full file this is likely inside an else-branch for
            # non-block-level tags -- confirm.
            listcount = self.listcount[-1]
        if tag_name == u'dd' and len(self.opentags) > 1 \
                and self.opentags[-1] == u'dt':
            # A <dd> implicitly closes a preceding open <dt>.
            self.handle_curdata()
        elif tag_name == u'dt' and len(self.opentags) > 1 \
                and self.opentags[-1] == u'dd':
            # Likewise a <dt> implicitly closes a preceding open <dd>.
            self.handle_curdata()
        elif tag_name == u'a':
            # NOTE(review): a "for attr in attrs:" loop appears to be
            # elided here -- "attr" is otherwise unbound.
            if attr[0].lower() == u'href':
                # Remember the target; emitted later as "__ <url>" lines.
                self.urls.append(attr[1].decode('utf-8'))
            # Open the reST inline-link markup; closed in handle_curdata
            # with "`__" when the </a> is processed.
            self.curdata = self.curdata + u'`'
            self.opentags.append(tag_name)
        elif tag_name == u'img':
            self.handle_image(attrs)
        elif tag_name == u'br':
            # NOTE(review): <br> handling body elided from this view.
            # we don't know the tag, so lets avoid handling it!

    def handle_startendtag(self, tag, attrs):
        """Handle self-closing tags such as <br/> and <img/>."""
        if tag.lower() == u'br':
            # NOTE(review): branch body elided from this view.
        elif tag.lower() == u'img':
            self.handle_image(attrs)
        # Treat the element as an open-then-close <br>: flush, mark, flush.
        self.handle_curdata()
        self.opentags.append(u'br')
        self.handle_curdata()
        # NOTE(review): the matching self.opentags.pop() appears elided.

    def handle_image(self, attrs):
        """Record an <img> and emit a reST image substitution reference."""
        # NOTE(review): the "for attr in attrs:" loop header and the
        # initialisation of alt/url are elided from this view.
            alt = attr[1].decode('utf-8')
        elif attr[0] == 'src':
            url = attr[1].decode('utf-8')
        if self.images.has_key(alt):
            if self.images[alt]["url"] == url:
                # Same alt text, same URL: reuse the existing substitution.
                self.curdata = self.curdata \
            while self.images.has_key(alt):
                # Same alt text but a different URL: uniquify the alt key.
                # NOTE(review): the loop body that mutates alt is elided.
                self.images[alt]["url"] = url
                self.curdata = self.curdata \
            self.images[alt] = {}
            self.images[alt]["url"] = url
            self.curdata = self.curdata \
        if self.images.has_key(url):
            # No alt text: key the substitution on the URL itself.
            self.curdata = self.curdata \
            self.images[url] = {}
            self.images[url]["url"] =url
            self.curdata = self.curdata \

    def handle_curdata(self):
        """Flush self.curdata into self.text, formatted per the open tag."""
        if len(self.opentags) == 0:
            # NOTE(review): early return elided from this view.
        tag_thats_done = self.opentags[-1]
        if len(self.curdata) == 0:
            # NOTE(review): early return elided from this view.
        if tag_thats_done == u'br':
            # A <br> only needs to guarantee a line break.
            if len(self.text) == 0 or self.text[-1] != '\n':
                self.text = self.text + '\n'
            # Suppress the extra blank line the next block would add.
            self.ignorenodata = True
        if len(self.curdata.strip()) == 0:
            # NOTE(review): whitespace-only early return elided.
        if tag_thats_done in self.blockleveltags:
            # Block-level elements are separated by blank lines, except
            # right after a <br> (ignorenodata) or at start of output.
            newlinerequired = self.text != u''
            if self.ignorenodata:
                newlinerequired = False
            self.ignorenodata = False
            # NOTE(review): an "if newlinerequired:" guard appears to be
            # elided above the following lines -- confirm.
            if tag_thats_done in [u'dt', u'dd', u'li'] \
                    and len(self.text) > 1 \
                    and self.text[-1] != u'\n':
                # List/definition items: single newline between items.
                self.text = self.text + u'\n'
            elif len(self.text) > 2 \
                    and self.text[-1] != u'\n' \
                    and self.text[-2] != u'\n':
                # Other blocks: blank line between blocks.
                self.text = self.text + u'\n\n'
        if tag_thats_done in ["h1", "h2", "h3", "h4", "h5", "h6"]:
            # Headings: collapse whitespace, wrap, then underline in the
            # reST style (underline char chosen per heading level).
            headingtext = " ".join(self.curdata.split())
            seperator = u'\n' + u' '*self.indentlevel
            # NOTE(review): the textwrap.wrap(...) call between these two
            # lines is elided from this view.
            headingtext = seperator.join( \
                self.textwidth - self.indentlevel \
            if tag_thats_done == u'h2':
                # NOTE(review): underlinechar assignment elided.
            elif tag_thats_done != u'h1':
                # NOTE(review): underlinechar assignment elided.
            if u'\n' in headingtext:
                # Wrapped heading: underline the full text width.
                underline = u' ' * self.indentlevel \
                    + underlinechar * (self.textwidth - self.indentlevel)
                # NOTE(review): the else-branch header for the next two
                # lines (single-line heading) is elided.
                underline = u' ' * self.indentlevel \
                    + underlinechar * len(headingtext)
            self.text = self.text \
                + headingtext + u'\n' \
        elif tag_thats_done in [u'p', u'div']:
            # Paragraphs: collapse internal whitespace and wrap, indenting
            # continuation lines to the current indent level.
            paragraph = unicode( \
                " ".join(self.curdata.strip().encode("utf-8").split()), \
            seperator = u'\n' + u' ' * self.indentlevel
            self.text = self.text \
                + u' ' * self.indentlevel \
                paragraph, self.textwidth - self.indentlevel))
        elif tag_thats_done == "pre":
            # Preformatted text is copied through verbatim, unwrapped.
            self.text = self.text + unicode( \
                self.curdata.encode("utf-8"), "utf-8")
        elif tag_thats_done == u'blockquote':
            # Blockquotes are wrapped and prefixed with "> " per line.
            " ".join(self.curdata.encode("utf-8").strip().split()), \
            seperator = u'\n' + u' ' * self.indentlevel + u'> '
            if len(self.text) > 0 and self.text[-1] != u'\n':
                self.text = self.text + u'\n'
            self.text = self.text \
                self.textwidth - self.indentlevel - 2 \
        elif tag_thats_done == "li":
            item = unicode(self.curdata.encode("utf-8").strip(), "utf-8")
            if len(self.text) > 0 and self.text[-1] != u'\n':
                self.text = self.text + u'\n'
            # work out if we're in an ol rather than a ul
            latesttags = self.opentags[-4:]
            for thing in latesttags:
                # NOTE(review): loop body (detecting ol vs ul and setting
                # the marker/indent) is elided from this view.
            # Ordered-list marker, e.g. "  1. ", then bump the counter.
            listmarker = u' %2d. ' %(self.listcount[-1])
            self.listcount[-1] = self.listcount[-1] + 1
                + u' ' * self.indentlevel \
            self.text = self.text \
                + u' ' * self.indentlevel \
                self.textwidth - self.indentlevel - listindent \
        elif tag_thats_done == u'dt':
            # Definition-list term: emitted with a trailing "::".
            definition = unicode(" ".join( \
                self.curdata.encode("utf-8").strip().split()), \
            if len(self.text) > 0 and self.text[-1] != u'\n':
                self.text = self.text + u'\n\n'
            elif len(self.text) > 1 and self.text[-2] != u'\n':
                self.text = self.text + u'\n'
            definition = u' ' * (self.indentlevel - 4) + definition + "::"
            indentstring = u'\n' + u' ' * (self.indentlevel - 3)
            self.text = self.text \
                textwrap.wrap(definition, \
                self.textwidth - self.indentlevel - 4))
        elif tag_thats_done == u'dd':
            # Definition-list description: wrapped under the term.
            definition = unicode(" ".join( \
                self.curdata.encode("utf-8").strip().split()),
            if len(definition) > 0:
                if len(self.text) > 0 and self.text[-1] != u'\n':
                    self.text = self.text + u'\n'
                indentstring = u'\n' + u' ' * self.indentlevel
                self.text = self.text \
                    + indentstring.join( \
                        self.textwidth - self.indentlevel \
        elif tag_thats_done == u'a':
            # Close the inline link opened in handle_starttag: `text`__
            self.curdata = self.curdata + u'`__'
        elif tag_thats_done in self.liststarttags:
            # NOTE(review): branch body elided (likely a no-op).
        if tag_thats_done in self.blockleveltags:
            # NOTE(review): the reset of self.curdata normally found under
            # this guard appears elided; only the flag reset is visible.
            self.ignorenodata = False

    def handle_endtag(self, tag):
        """Close *tag*: flush its text and unwind indent/list state."""
        self.ignorenodata = False
        # NOTE(review): guard lines (unknown tag / not-open tag handling)
        # appear elided before the index lookup.
        tagindex = self.opentags.index(tag)
        if tag in [u'br', u'img']:
            # NOTE(review): lines elided here; this indent decrement most
            # likely belongs to an elided </dl> branch -- confirm.
            self.indentlevel = self.indentlevel - 4
        if tag in self.liststarttags:
            if tag in [u'ol', u'dl', u'ul', u'dd']:
                self.handle_curdata()
                # find if there was a previous list level
                smalllist = self.opentags[:-1]
                for prev_listtag in smalllist:
                    if prev_listtag in [u'ol', u'dl']:
                        self.indentlevel = self.indentlevel - 4
                    elif prev_listtag == u'ul':
                        self.indentlevel = self.indentlevel - 3
        # NOTE(review): the guard restricting this to </ol> is elided.
        self.listcount = self.listcount[:-1]
        # If the same tag is open more than once, close the innermost one.
        while tagindex < len(self.opentags) \
                and tag in self.opentags[tagindex+1:]:
            tagindex = self.opentags.index(tag, tagindex+1)
        # well, we don't want to do that then
        if tagindex != len(self.opentags) - 1:
            # Assuming the data was for the last opened tag first
            self.handle_curdata()
            # Now kill the list to be a slice before this tag was opened
            self.opentags = self.opentags[:tagindex + 1]
        self.handle_curdata()
        if self.opentags[-1] == tag:
            # NOTE(review): the self.opentags.pop() appears elided here.

    def handle_data(self, data):
        """Accumulate character data (decoded as UTF-8) into curdata."""
        if len(self.opentags) == 0:
            # Bare text outside any element is treated as a paragraph.
            self.opentags.append(u'p')
        self.curdata = self.curdata + data.decode("utf-8")

    def handle_entityref(self, name):
        """Append the character corresponding to the entity &name;."""
        if HTML2Text.entities.has_key(name):
            # Known named entity from the class-level lookup table.
            entity = HTML2Text.entities[name]
            # NOTE(review): the elif/else branch headers for the next two
            # lines (numeric "&#NNN;" and unknown entities) are elided.
            entity = unichr(int(name[1:]))
            entity = "&" + name + ";"
        self.curdata = self.curdata + unicode(entity.encode('utf-8'), \

    # NOTE(review): a method definition line (presumably
    # "def gettext(self):", the public accessor for the converted text)
    # appears to be elided here; the following lines are its body.
        self.handle_curdata()
        # Ensure the output ends with exactly one trailing newline.
        if len(self.text) == 0 or self.text[-1] != u'\n':
            self.text = self.text + u'\n'
        if len(self.text) > 0:
            while len(self.text) > 1 and self.text[-1] == u'\n':
                self.text = self.text[:-1]
            self.text = self.text + u'\n'
        if len(self.urls) > 0:
            # Append reST anonymous-hyperlink targets, one per link seen.
            self.text = self.text + u'\n__ ' + u'\n__ '.join(self.urls) + u'\n'
        if len(self.images.keys()) > 0:
            # Append reST image substitution definitions for all images.
            self.text = self.text + u'\n.. ' \
                ["|%s| image:: %s" %(a, self.images[a]["url"]) \
                for a in self.images.keys()]) + u'\n'
def open_url(method, url):
    """Issue an HTTP *method* request for *url*, following redirects.

    Follows up to 3 redirect responses (301/302/303/307) before giving up.
    Returns the httplib response object on a 200; the failure path is on
    lines elided from this view (presumably returns None) -- confirm.
    """
    # NOTE(review): initialisation of redirectcount is elided here.
    while redirectcount < 3:
        # Split the URL into scheme, host[:port] and path components.
        (type, rest) = urllib.splittype(url)
        (host, path) = urllib.splithost(rest)
        (host, port) = urllib.splitport(host)
        # NOTE(review): default-port handling appears elided here.
        conn = httplib.HTTPConnection("%s:%s" %(host, port))
        conn.request(method, path)
        response = conn.getresponse()
        if response.status in [301, 302, 303, 307]:
            # Redirect: find the Location header and retry against it.
            headers = response.getheaders()
            for header in headers:
                if header[0] == "location":
                    # NOTE(review): "url = header[1]" appears elided.
        elif response.status == 200:
            # NOTE(review): "return response" appears elided.
        redirectcount = redirectcount + 1
    # NOTE(review): the fall-through return is elided from this view.
def parse_and_deliver(maildir, url, statedir):
    """Fetch feed *url* and deliver each unseen item into *maildir*.

    Per-feed HTTP validators (ETag, Last-Modified, Content-Length,
    Content-MD5) are cached in a "feeds" dbm under *statedir* so unchanged
    feeds can be skipped after a HEAD request; per-item state is kept in a
    "seen" dbm keyed on "<feedurl>|<guid-or-link>".  Each new item becomes
    a multipart/alternative (text + html) message written maildir-style
    (tmp, then new).  NOTE(review): many lines of this function are elided
    from this view -- the flow comments below flag the gaps.
    """
    # first check if we know about this feed already
    feeddb = dbm.open(os.path.join(statedir, "feeds"), "c")
    if feeddb.has_key(url):
        # NOTE(review): "data = feeddb[url]" appears elided here.
        data = cgi.parse_qs(data)
        # Cheap HEAD request first to compare validators with the cache.
        response = open_url("HEAD", url)
        # NOTE(review): the None-response guard is elided here.
        headers = response.getheaders()
        # NOTE(review): the "feed changed?" flag initialisation and the
        # per-header flag assignments under each comparison are elided.
        for header in headers:
            if header[0] == "content-length":
                if header[1] != data["content-length"][0]:
            elif header[0] == "etag":
                if header[1] != data["etag"][0]:
            elif header[0] == "last-modified":
                if header[1] != data["last-modified"][0]:
            elif header[0] == "content-md5":
                if header[1] != data["content-md5"][0]:
        # Validators differ: re-fetch the full feed.
        response = open_url("GET", url)
        headers = response.getheaders()
        feedhandle = response
        # NOTE(review): this error write belongs to an elided else-branch
        # for a failed GET -- confirm.
        sys.stderr.write("Failed to fetch feed: %s\n" %(url))
        return # don't need to do anything, nothings changed.
    # Unknown feed: fetch it unconditionally.
    response = open_url("GET", url)
    headers = response.getheaders()
    feedhandle = response
    # NOTE(review): again, this write belongs to an elided else-branch.
    sys.stderr.write("Failed to fetch feed: %s\n" %(url))
    fp = feedparser.parse(feedhandle)
    db = dbm.open(os.path.join(statedir, "seen"), "c")
    for item in fp["items"]:
        # have we seen it before?
        # need to work out what the content is first...
        if item.has_key("content"):
            content = item["content"][0]["value"]
            # NOTE(review): the else-branch header for the summary
            # fallback is elided.
            content = item["summary"]
        # Hash the content so edits to an already-seen item are detected.
        md5sum = md5.md5(content.encode("utf-8")).hexdigest()
        # NOTE(review): prevmessageid initialisation is elided here.
        # check if there's a guid too - if that exists and we match the md5,
        if item.has_key("guid"):
            if db.has_key(url + "|" + item["guid"]):
                data = db[url + "|" + item["guid"]]
                data = cgi.parse_qs(data)
                if data["contentmd5"][0] == md5sum:
                    # NOTE(review): "continue" (skip seen item) elided.
        if db.has_key(url + "|" + item["link"]):
            data = db[url + "|" + item["link"]]
            data = cgi.parse_qs(data)
            if data.has_key("message-id"):
                # Remember the previous id so the update threads with it.
                prevmessageid = data["message-id"][0]
            if data["contentmd5"][0] == md5sum:
                # NOTE(review): "continue" (skip unchanged item) elided.
        # NOTE(review): the try/except around the author lookup is elided.
        author = item["author"]
        # create a basic email message
        msg = MIMEMultipart("alternative")
        # Build a unique Message-ID: timestamp + random suffix + hostname.
        # NOTE(review): the start of this expression is elided.
            + datetime.datetime.now().strftime("%Y%m%d%H%M") \
            string.ascii_letters + string.digits \
            ) for a in range(0,6) \
            ]) + "@" + socket.gethostname() + ">"
        msg.add_header("Message-ID", messageid)
        msg.set_unixfrom("\"%s\" <rss2maildir@localhost>" %(url))
        msg.add_header("From", "\"%s\" <rss2maildir@localhost>" %(author))
        msg.add_header("To", "\"%s\" <rss2maildir@localhost>" %(url))
        # NOTE(review): the "if prevmessageid:" guard is elided here.
        msg.add_header("References", prevmessageid)
        # Fall back to "now" for the Date header...
        createddate = datetime.datetime.now() \
            .strftime("%a, %e %b %Y %T -0000")
        # ...but prefer the item's own updated date when parseable.
        # NOTE(review): the surrounding try/except is elided.
        createddate = datetime.datetime(*item["updated_parsed"][0:6]) \
            .strftime("%a, %e %b %Y %T -0000")
        msg.add_header("Date", createddate)
        # Use the HTML-to-text converter to flatten the title for Subject.
        subj_gen = HTML2Text()
        subj_gen.feed(item["title"].encode("utf-8"))
        msg.add_header("Subject", subj_gen.gettext())
        msg.set_default_type("text/plain")
        # HTML alternative part, with the item link appended.
        htmlcontent = content.encode("utf-8")
        htmlcontent = "%s\n\n<p>Item URL: <a href='%s'>%s</a></p>" %( \
        htmlpart = MIMEText(htmlcontent.encode("utf-8"), "html", "utf-8")
        # Plain-text alternative part via HTML2Text.
        textparser = HTML2Text()
        textparser.feed(content.encode("utf-8"))
        textcontent = textparser.gettext()
        textcontent = "%s\n\nItem URL: %s" %( \
        textpart = MIMEText(textcontent.encode("utf-8"), "plain", "utf-8")
        # NOTE(review): msg.attach(...) calls appear elided here.
        # start by working out the filename we should be writting to, we do
        # this following the normal maildir style rules
        fname = str(os.getpid()) \
            + "." + socket.gethostname() \
            string.ascii_letters + string.digits \
            ) for a in range(0,10) \
            + datetime.datetime.now().strftime('%s')
        fn = os.path.join(maildir, "tmp", fname)
        # NOTE(review): the open() of fn and the close() are elided.
        fh.write(msg.as_string())
        # now move it in to the new directory
        newfn = os.path.join(maildir, "new", fname)
        # NOTE(review): the link/unlink (tmp -> new) calls are elided.
        # now add to the database about the item
        # NOTE(review): "if prevmessageid:" guard elided for this line.
        messageid = prevmessageid + " " + messageid
        if item.has_key("guid") and item["guid"] != item["link"]:
            # Store under the guid key...
            data = urllib.urlencode(( \
                ("message-id", messageid), \
                ("created", createddate), \
                ("contentmd5", md5sum) \
            db[url + "|" + item["guid"]] = data
            # ...and refresh the link key, preserving the original created
            # date when the link was already known.
            # NOTE(review): the try/except around this lookup is elided.
            data = db[url + "|" + item["link"]]
            data = cgi.parse_qs(data)
            newdata = urllib.urlencode(( \
                ("message-id", messageid), \
                ("created", data["created"][0]), \
                ("contentmd5", data["contentmd5"][0]) \
            db[url + "|" + item["link"]] = newdata
            db[url + "|" + item["link"]] = data
            # NOTE(review): the else-branch header (no distinct guid) for
            # the following store is elided.
            data = urllib.urlencode(( \
                ("message-id", messageid), \
                ("created", createddate), \
                ("contentmd5", md5sum) \
            db[url + "|" + item["link"]] = data
    # Cache the response validators for the next run's HEAD comparison.
    # NOTE(review): the "data = []" initialisation is elided here.
    for header in headers:
        # NOTE(review): the "if header[0] in" line pairing with the list
        # below is elided.
            ["content-md5", "etag", "last-modified", "content-length"]:
            data.append((header[0], header[1]))
    data = urllib.urlencode(data)
    # NOTE(review): "feeddb[url] = data" and the db close() calls are
    # elided from this view.
if __name__ == "__main__":
    # This only gets executed if we really called the program
    # first off, parse the command line arguments
    oparser = OptionParser()
    # NOTE(review): the oparser.add_option( call headers for the two
    # options below are elided from this view.
        "-c", "--conf", dest="conf",
        help="location of config file"
        "-s", "--statedir", dest="statedir",
        help="location of directory to store state in"
    (options, args) = oparser.parse_args()
    # check for the configfile
    # NOTE(review): "configfile = None" initialisation appears elided.
    if options.conf != None:
        # does the file exist?
        # NOTE(review): the try/except around os.stat is elided; the
        # stderr write below belongs to the except branch.
        os.stat(options.conf)
        configfile = options.conf
        # should exit here as the specified file doesn't exist
        "Config file %s does not exist. Exiting.\n" %(options.conf,))
        # check through the default locations
        # NOTE(review): try/except scaffolding elided: first try the
        # per-user config, then the system-wide one, else give up.
        os.stat("%s/.rss2maildir.conf" %(os.environ["HOME"],))
        configfile = "%s/.rss2maildir.conf" %(os.environ["HOME"],)
        os.stat("/etc/rss2maildir.conf")
        configfile = "/etc/rss2maildir.conf"
        sys.stderr.write("No config file found. Exiting.\n")
    # Right - if we've got this far, we've got a config file, now for the hard
    scp = SafeConfigParser()
    # NOTE(review): scp.read(configfile) and the state_dir default appear
    # elided around here.
    maildir_root = "RSSMaildir"
    if options.statedir != None:
        # Command-line state dir wins over config-file state dir.
        state_dir = options.statedir
        # NOTE(review): try/except around os.stat is elided; the mkdir
        # and error write below belong to the except branch.
        mode = os.stat(state_dir)[stat.ST_MODE]
        if not stat.S_ISDIR(mode):
            "State directory (%s) is not a directory\n" %(state_dir))
        # try to make the directory
        sys.stderr.write("Couldn't create statedir %s" %(state_dir))
    elif scp.has_option("general", "state_dir"):
        new_state_dir = scp.get("general", "state_dir")
        # NOTE(review): try/except around os.stat is elided here too.
        mode = os.stat(new_state_dir)[stat.ST_MODE]
        if not stat.S_ISDIR(mode):
            "State directory (%s) is not a directory\n" %(state_dir))
        state_dir = new_state_dir
        # Fallback: create the configured state dir if it doesn't exist.
        os.mkdir(new_state_dir)
        state_dir = new_state_dir
        "Couldn't create state directory %s\n" %(new_state_dir))
    # NOTE(review): else-branch header (default state dir) elided; the
    # following stat/mkdir sequence validates or creates it.
    mode = os.stat(state_dir)[stat.ST_MODE]
    if not stat.S_ISDIR(mode):
        "State directory %s is not a directory\n" %(state_dir))
        "State directory %s could not be created\n" %(state_dir))
    if scp.has_option("general", "maildir_root"):
        maildir_root = scp.get("general", "maildir_root")
    # NOTE(review): try/except around os.stat is elided; mkdir + error
    # write below belong to the except branch.
    mode = os.stat(maildir_root)[stat.ST_MODE]
    if not stat.S_ISDIR(mode):
        "Maildir Root %s is not a directory\n" \
    os.mkdir(maildir_root)
    sys.stderr.write("Couldn't create Maildir Root %s\n" \
    # Every config section other than [general] is a feed URL.
    feeds = scp.sections()
    # NOTE(review): the try/except around remove() is elided ([general]
    # may legitimately be absent).
    feeds.remove("general")
    for section in feeds:
        # check if the directory exists
        # NOTE(review): try/except elided: a per-feed "maildir" option
        # overrides the default of using the (urlencoded) section name.
        maildir = scp.get(section, "maildir")
        # urlencode the section/maildir name so it is filesystem-safe.
        maildir = urllib.urlencode(((section, maildir),)).split("=")[1]
        maildir = os.path.join(maildir_root, maildir)
        # NOTE(review): try/except around os.stat elided; the final mkdir
        # block below is its except (maildir missing) branch.
        exists = os.stat(maildir)
        if stat.S_ISDIR(exists[stat.ST_MODE]):
            # check if there's a new, cur and tmp directory
            # NOTE(review): each stat below is wrapped in an elided
            # try/except whose handler is the adjacent mkdir.
            mode = os.stat(os.path.join(maildir, "cur"))[stat.ST_MODE]
            os.mkdir(os.path.join(maildir, "cur"))
            if not stat.S_ISDIR(mode):
                sys.stderr.write("Broken maildir: %s\n" %(maildir))
            mode = os.stat(os.path.join(maildir, "tmp"))[stat.ST_MODE]
            os.mkdir(os.path.join(maildir, "tmp"))
            if not stat.S_ISDIR(mode):
                sys.stderr.write("Broken maildir: %s\n" %(maildir))
            mode = os.stat(os.path.join(maildir, "new"))[stat.ST_MODE]
            if not stat.S_ISDIR(mode):
                sys.stderr.write("Broken maildir: %s\n" %(maildir))
            os.mkdir(os.path.join(maildir, "new"))
            sys.stderr.write("Broken maildir: %s\n" %(maildir))
        # Maildir doesn't exist yet: create it with new/cur/tmp inside.
        sys.stderr.write("Couldn't create root maildir %s\n" \
        os.mkdir(os.path.join(maildir, "new"))
        os.mkdir(os.path.join(maildir, "cur"))
        os.mkdir(os.path.join(maildir, "tmp"))
        "Couldn't create required maildir directories for %s\n" \
        # right - we've got the directories, we've got the section, we know the
        parse_and_deliver(maildir, section, state_dir)