4 # rss2maildir.py - RSS feeds to Maildir 1 email per item
5 # Copyright (C) 2007 Brett Parker <iDunno@sommitrealweird.co.uk>
7 # This program is free software: you can redistribute it and/or modify
8 # it under the terms of the GNU General Public License as published by
9 # the Free Software Foundation, either version 3 of the License, or
10 # (at your option) any later version.
12 # This program is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 # GNU General Public License for more details.
17 # You should have received a copy of the GNU General Public License
18 # along with this program. If not, see <http://www.gnu.org/licenses/>.
28 from email.MIMEMultipart import MIMEMultipart
29 from email.MIMEText import MIMEText
38 from optparse import OptionParser
39 from ConfigParser import SafeConfigParser
41 from base64 import b64encode
47 from HTMLParser import HTMLParser
class HTML2Text(HTMLParser):
    # Convert an HTML fragment into reStructuredText-flavoured plain text.
    # Usage: feed() HTML in; the rendered output accumulates in self.text.
    # NOTE(review): this chunk is truncated - the class-level attributes the
    # methods rely on (blockleveltags, liststarttags, cancontainflow,
    # entities, opentags, curdata, text, urls, images, indentlevel,
    # listcount, ...) and a number of statements inside the methods are on
    # lines not visible here, so several suites below are incomplete as shown.

    def __init__(self,textwidth=70):
        # textwidth: column at which generated plain text is wrapped.
        self.textwidth = textwidth
        self.ignorenodata = False
        HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        # Open a tag: flush pending character data and update the open-tag
        # stack and list-indentation bookkeeping.
        tag_name = tag.lower()
        if tag_name in self.blockleveltags:
            # handle starting a new block - unless we're in a block element
            # that can contain other blocks, we'll assume that we want to close
            if len(self.opentags) > 1 and self.opentags[-1] == u'li':
                self.handle_curdata()
            if tag_name == u'ol':
                # a new ordered list starts counting from 1
                self.handle_curdata()
                self.listcount.append(1)
                self.listlevel = len(self.listcount) - 1
            if tag_name in self.liststarttags:
                smallist = self.opentags[-3:-1]
                for prev_listtag in smallist:
                    if prev_listtag in [u'dl', u'ol']:
                        # ordered/definition lists indent nested content by 4
                        self.indentlevel = self.indentlevel + 4
                    elif prev_listtag == u'ul':
                        # unordered lists indent nested content by 3
                        self.indentlevel = self.indentlevel + 3
            if len(self.opentags) > 0:
                self.handle_curdata()
                # NOTE(review): the suite of this condition is missing here
                if tag_name not in self.cancontainflow:
            self.opentags.append(tag_name)
        if tag_name == "span":
            listcount = self.listcount[-1]
        if tag_name == u'dd' and len(self.opentags) > 1 \
            and self.opentags[-1] == u'dt':
            self.handle_curdata()
        elif tag_name == u'dt' and len(self.opentags) > 1 \
            and self.opentags[-1] == u'dd':
            self.handle_curdata()
        elif tag_name == u'a':
            # NOTE(review): the 'for attr in attrs:' loop header is missing
            if attr[0].lower() == u'href':
                self.urls.append(attr[1].decode('utf-8'))
            # open an inline reST anonymous link; closed in handle_curdata
            self.curdata = self.curdata + u'`'
            self.opentags.append(tag_name)
        elif tag_name == u'img':
            self.handle_image(attrs)
        elif tag_name == u'br':
        # we don't know the tag, so lets avoid handling it!

    def handle_startendtag(self, tag, attrs):
        # Self-closing tags (<br/>, <img/>).
        # NOTE(review): branch structure here is incomplete in this chunk.
        if tag.lower() == u'br':
        elif tag.lower() == u'img':
            self.handle_image(attrs)
        self.handle_curdata()
        self.opentags.append(u'br')
        self.handle_curdata()

    def handle_image(self, attrs):
        # Record an image as a reST substitution (|alt| image:: url) and put
        # a substitution marker into the current text.
        # NOTE(review): the loop over attrs extracting 'alt'/'src' is partly
        # missing from this chunk.
                alt = attr[1].decode('utf-8')
            elif attr[0] == 'src':
                url = attr[1].decode('utf-8')
        if self.images.has_key(alt):
            if self.images[alt]["url"] == url:
                self.curdata = self.curdata \
            # same alt text used for a different url - find a unique key
            while self.images.has_key(alt):
            self.images[alt]["url"] = url
            self.curdata = self.curdata \
            self.images[alt] = {}
            self.images[alt]["url"] = url
            self.curdata = self.curdata \
        if self.images.has_key(url):
            self.curdata = self.curdata \
            self.images[url] = {}
            self.images[url]["url"] =url
            self.curdata = self.curdata \

    def handle_curdata(self):
        # Flush self.curdata into self.text, formatted according to the
        # innermost open tag (headings underlined, lists get markers,
        # blockquotes get '> ' prefixes, etc.).
        if len(self.opentags) == 0:
        tag_thats_done = self.opentags[-1]
        if len(self.curdata) == 0:
        if tag_thats_done == u'br':
            # a <br> just forces a newline if we're not already at one
            if len(self.text) == 0 or self.text[-1] != '\n':
                self.text = self.text + '\n'
            self.ignorenodata = True
        if len(self.curdata.strip()) == 0:
        if tag_thats_done in self.blockleveltags:
            # block elements are separated by a blank line, except right
            # after a <br> (ignorenodata) and between list/definition items
            newlinerequired = self.text != u''
            if self.ignorenodata:
                newlinerequired = False
            self.ignorenodata = False
            if tag_thats_done in [u'dt', u'dd', u'li'] \
                and len(self.text) > 1 \
                and self.text[-1] != u'\n':
                self.text = self.text + u'\n'
            elif len(self.text) > 2 \
                and self.text[-1] != u'\n' \
                and self.text[-2] != u'\n':
                self.text = self.text + u'\n\n'
        if tag_thats_done in ["h1", "h2", "h3", "h4", "h5", "h6"]:
            # headings: collapse whitespace, wrap, then underline
            headingtext = " ".join(self.curdata.split())
            seperator = u'\n' + u' '*self.indentlevel
            headingtext = seperator.join( \
                self.textwidth - self.indentlevel \
            if tag_thats_done == u'h2':
            elif tag_thats_done != u'h1':
            if u'\n' in headingtext:
                # multi-line heading: underline the full text width
                underline = u' ' * self.indentlevel \
                    + underlinechar * (self.textwidth - self.indentlevel)
                underline = u' ' * self.indentlevel \
                    + underlinechar * len(headingtext)
            self.text = self.text \
                + headingtext + u'\n' \
        elif tag_thats_done in [u'p', u'div']:
            # paragraphs: collapse whitespace and wrap to textwidth
            paragraph = unicode( \
                " ".join(self.curdata.strip().encode("utf-8").split()), \
            seperator = u'\n' + u' ' * self.indentlevel
            self.text = self.text \
                + u' ' * self.indentlevel \
                paragraph, self.textwidth - self.indentlevel))
        elif tag_thats_done == "pre":
            # preformatted text is emitted verbatim
            self.text = self.text + unicode( \
                self.curdata.encode("utf-8"), "utf-8")
        elif tag_thats_done == u'blockquote':
                " ".join(self.curdata.encode("utf-8").strip().split()), \
            seperator = u'\n' + u' ' * self.indentlevel + u'> '
            if len(self.text) > 0 and self.text[-1] != u'\n':
                self.text = self.text + u'\n'
            self.text = self.text \
                self.textwidth - self.indentlevel - 2 \
        elif tag_thats_done == "li":
            item = unicode(self.curdata.encode("utf-8").strip(), "utf-8")
            if len(self.text) > 0 and self.text[-1] != u'\n':
                self.text = self.text + u'\n'
            # work out if we're in an ol rather than a ul
            latesttags = self.opentags[-4:]
            for thing in latesttags:
                # ordered lists get a ' %2d. ' marker and bump the counter
                listmarker = u' %2d. ' %(self.listcount[-1])
                self.listcount[-1] = self.listcount[-1] + 1
                + u' ' * self.indentlevel \
            self.text = self.text \
                + u' ' * self.indentlevel \
                self.textwidth - self.indentlevel - listindent \
        elif tag_thats_done == u'dt':
            # definition term: emitted as 'term::'
            definition = unicode(" ".join( \
                self.curdata.encode("utf-8").strip().split()), \
            if len(self.text) > 0 and self.text[-1] != u'\n':
                self.text = self.text + u'\n\n'
            elif len(self.text) > 1 and self.text[-2] != u'\n':
                self.text = self.text + u'\n'
            definition = u' ' * self.indentlevel + definition + "::"
            indentstring = u'\n' + u' ' * (self.indentlevel + 1)
            self.text = self.text \
                textwrap.wrap(definition, \
                self.textwidth - self.indentlevel - 1))
        elif tag_thats_done == u'dd':
            # definition body: indented 4 columns under its term
            definition = unicode(" ".join( \
                self.curdata.encode("utf-8").strip().split()),
            if len(definition) > 0:
                if len(self.text) > 0 and self.text[-1] != u'\n':
                    self.text = self.text + u'\n'
                indentstring = u'\n' + u' ' * (self.indentlevel + 4)
                self.text = self.text \
                    + u' ' * (self.indentlevel + 4) \
                    + indentstring.join( \
                        self.textwidth - self.indentlevel - 4 \
        elif tag_thats_done == u'a':
            # close the reST anonymous link opened in handle_starttag
            self.curdata = self.curdata + u'`__'
        elif tag_thats_done in self.liststarttags:
        if tag_thats_done in self.blockleveltags:
        self.ignorenodata = False

    def handle_endtag(self, tag):
        # Close a tag: flush pending data and unwind the open-tag stack,
        # restoring list indentation when a list ends.
        self.ignorenodata = False
        tagindex = self.opentags.index(tag)
        if tag in [u'br', u'img']:
        if tag in self.liststarttags:
            if tag in [u'ol', u'dl', u'ul']:
                self.handle_curdata()
                # find if there was a previous list level
                smalllist = self.opentags[:-1]
                for prev_listtag in smalllist:
                    if prev_listtag in [u'ol', u'dl']:
                        self.indentlevel = self.indentlevel - 4
                    elif prev_listtag == u'ul':
                        self.indentlevel = self.indentlevel - 3
            self.listcount = self.listcount[:-1]
        # find the most recent occurrence of this tag on the stack
        while tagindex < len(self.opentags) \
            and tag in self.opentags[tagindex+1:]:
            tagindex = self.opentags.index(tag, tagindex+1)
        # well, we don't want to do that then
        if tagindex != len(self.opentags) - 1:
            # Assuming the data was for the last opened tag first
            self.handle_curdata()
            # Now kill the list to be a slice before this tag was opened
            self.opentags = self.opentags[:tagindex + 1]
        self.handle_curdata()
        if self.opentags[-1] == tag:

    def handle_data(self, data):
        # Character data outside any open tag is treated as a paragraph.
        if len(self.opentags) == 0:
            self.opentags.append(u'p')
        self.curdata = self.curdata + data.decode("utf-8")

    def handle_entityref(self, name):
        # Translate an HTML entity to its character, falling back to the
        # literal '&name;' form. NOTE(review): branch lines missing here.
        if HTML2Text.entities.has_key(name):
            entity = HTML2Text.entities[name]
            entity = unichr(int(name[1:]))
            entity = "&" + name + ";"
        self.curdata = self.curdata + unicode(entity.encode('utf-8'), \

        # NOTE(review): the 'def gettext(self):' header line is missing from
        # this chunk; the statements below are that method's body, which
        # finalises self.text (trailing newlines, link targets, image
        # substitutions) before it is returned.
        self.handle_curdata()
        if len(self.text) == 0 or self.text[-1] != u'\n':
            self.text = self.text + u'\n'
        if len(self.text) > 0:
            # strip trailing blank lines, leaving exactly one newline
            while len(self.text) > 1 and self.text[-1] == u'\n':
                self.text = self.text[:-1]
            self.text = self.text + u'\n'
        if len(self.urls) > 0:
            # append reST anonymous-link targets for every <a href>
            self.text = self.text + u'\n__ ' + u'\n__ '.join(self.urls) + u'\n'
        if len(self.images.keys()) > 0:
            # append reST image substitution definitions
            self.text = self.text + u'\n.. ' \
                ["|%s| image:: %s" %(a, self.images[a]["url"]) \
                for a in self.images.keys()]) + u'\n'
def open_url(method, url):
    # Issue an HTTP request (method is e.g. "HEAD" or "GET") for url,
    # following up to 3 redirects; returns the httplib response on success.
    # NOTE(review): several lines are missing from this chunk (the
    # redirectcount initialisation, the redirect re-request, and the
    # return statements), so the suites below are incomplete as shown.
    while redirectcount < 3:
        # split the url into host[:port] and path for httplib
        (type, rest) = urllib.splittype(url)
        (host, path) = urllib.splithost(rest)
        (host, port) = urllib.splitport(host)
        conn = httplib.HTTPConnection("%s:%s" %(host, port))
        conn.request(method, path)
        response = conn.getresponse()
        if response.status in [301, 302, 303, 307]:
            # redirected - pick up the new location and try again
            headers = response.getheaders()
            for header in headers:
                if header[0] == "location":
        elif response.status == 200:
        redirectcount = redirectcount + 1
577 def parse_and_deliver(maildir, url, statedir):
580 # first check if we know about this feed already
581 feeddb = dbm.open(os.path.join(statedir, "feeds"), "c")
582 if feeddb.has_key(url):
584 data = cgi.parse_qs(data)
585 response = open_url("HEAD", url)
588 headers = response.getheaders()
591 for header in headers:
592 if header[0] == "content-length":
593 if header[1] != data["content-length"][0]:
595 elif header[0] == "etag":
596 if header[1] != data["etag"][0]:
598 elif header[0] == "last-modified":
599 if header[1] != data["last-modified"][0]:
601 elif header[0] == "content-md5":
602 if header[1] != data["content-md5"][0]:
607 response = open_url("GET", url)
609 headers = response.getheaders()
610 feedhandle = response
612 sys.stderr.write("Failed to fetch feed: %s\n" %(url))
615 return # don't need to do anything, nothings changed.
617 response = open_url("GET", url)
619 headers = response.getheaders()
620 feedhandle = response
622 sys.stderr.write("Failed to fetch feed: %s\n" %(url))
625 fp = feedparser.parse(feedhandle)
626 db = dbm.open(os.path.join(statedir, "seen"), "c")
627 for item in fp["items"]:
628 # have we seen it before?
629 # need to work out what the content is first...
631 if item.has_key("content"):
632 content = item["content"][0]["value"]
634 content = item["summary"]
636 md5sum = md5.md5(content.encode("utf-8")).hexdigest()
640 # check if there's a guid too - if that exists and we match the md5,
642 if item.has_key("guid"):
643 if db.has_key(url + "|" + item["guid"]):
644 data = db[url + "|" + item["guid"]]
645 data = cgi.parse_qs(data)
646 if data["contentmd5"][0] == md5sum:
649 if db.has_key(url + "|" + item["link"]):
650 data = db[url + "|" + item["link"]]
651 data = cgi.parse_qs(data)
652 if data.has_key("message-id"):
653 prevmessageid = data["message-id"][0]
654 if data["contentmd5"][0] == md5sum:
658 author = item["author"]
662 # create a basic email message
663 msg = MIMEMultipart("alternative")
665 + datetime.datetime.now().strftime("%Y%m%d%H%M") \
669 string.ascii_letters + string.digits \
670 ) for a in range(0,6) \
671 ]) + "@" + socket.gethostname() + ">"
672 msg.add_header("Message-ID", messageid)
673 msg.set_unixfrom("\"%s\" <rss2maildir@localhost>" %(url))
674 msg.add_header("From", "\"%s\" <rss2maildir@localhost>" %(author))
675 msg.add_header("To", "\"%s\" <rss2maildir@localhost>" %(url))
677 msg.add_header("References", prevmessageid)
678 createddate = datetime.datetime.now() \
679 .strftime("%a, %e %b %Y %T -0000")
681 createddate = datetime.datetime(*item["updated_parsed"][0:6]) \
682 .strftime("%a, %e %b %Y %T -0000")
685 msg.add_header("Date", createddate)
686 subj_gen = HTML2Text()
687 subj_gen.feed(item["title"].encod("utf-8"))
688 msg.add_header("Subject", subj_gen.gettext())
689 msg.set_default_type("text/plain")
691 htmlcontent = content.encode("utf-8")
692 htmlcontent = "%s\n\n<p>Item URL: <a href='%s'>%s</a></p>" %( \
696 htmlpart = MIMEText(htmlcontent.encode("utf-8"), "html", "utf-8")
697 textparser = HTML2Text()
698 textparser.feed(content.encode("utf-8"))
699 textcontent = textparser.gettext()
700 textcontent = "%s\n\nItem URL: %s" %( \
703 textpart = MIMEText(textcontent.encode("utf-8"), "plain", "utf-8")
707 # start by working out the filename we should be writting to, we do
708 # this following the normal maildir style rules
709 fname = str(os.getpid()) \
710 + "." + socket.gethostname() \
713 string.ascii_letters + string.digits \
714 ) for a in range(0,10) \
716 + datetime.datetime.now().strftime('%s')
717 fn = os.path.join(maildir, "tmp", fname)
719 fh.write(msg.as_string())
721 # now move it in to the new directory
722 newfn = os.path.join(maildir, "new", fname)
726 # now add to the database about the item
728 messageid = prevmessageid + " " + messageid
729 if item.has_key("guid") and item["guid"] != item["link"]:
730 data = urllib.urlencode(( \
731 ("message-id", messageid), \
732 ("created", createddate), \
733 ("contentmd5", md5sum) \
735 db[url + "|" + item["guid"]] = data
737 data = db[url + "|" + item["link"]]
738 data = cgi.parse_qs(data)
739 newdata = urllib.urlencode(( \
740 ("message-id", messageid), \
741 ("created", data["created"][0]), \
742 ("contentmd5", data["contentmd5"][0]) \
744 db[url + "|" + item["link"]] = newdata
746 db[url + "|" + item["link"]] = data
748 data = urllib.urlencode(( \
749 ("message-id", messageid), \
750 ("created", createddate), \
751 ("contentmd5", md5sum) \
753 db[url + "|" + item["link"]] = data
757 for header in headers:
759 ["content-md5", "etag", "last-modified", "content-length"]:
760 data.append((header[0], header[1]))
762 data = urllib.urlencode(data)
if __name__ == "__main__":
    # This only gets executed if we really called the program
    # first off, parse the command line arguments
    # NOTE(review): this chunk is truncated - the oparser.add_option(...)
    # call lines, the try:/except OSError: wrappers around the os.stat
    # calls, the scp.read(configfile) call and the sys.exit/sys.stderr
    # lines are missing, so several suites below are incomplete as shown.
    oparser = OptionParser()
        "-c", "--conf", dest="conf",
        help="location of config file"
        "-s", "--statedir", dest="statedir",
        help="location of directory to store state in"
    (options, args) = oparser.parse_args()

    # check for the configfile
    if options.conf != None:
        # does the file exist?
            os.stat(options.conf)
            configfile = options.conf
            # should exit here as the specified file doesn't exist
                "Config file %s does not exist. Exiting.\n" %(options.conf,))
        # check through the default locations
            os.stat("%s/.rss2maildir.conf" %(os.environ["HOME"],))
            configfile = "%s/.rss2maildir.conf" %(os.environ["HOME"],)
            os.stat("/etc/rss2maildir.conf")
            configfile = "/etc/rss2maildir.conf"
            sys.stderr.write("No config file found. Exiting.\n")

    # Right - if we've got this far, we've got a config file, now for the hard
    scp = SafeConfigParser()
    maildir_root = "RSSMaildir"
    # work out where to keep state: command line beats config file beats
    # the default
    if options.statedir != None:
        state_dir = options.statedir
            mode = os.stat(state_dir)[stat.ST_MODE]
            if not stat.S_ISDIR(mode):
                "State directory (%s) is not a directory\n" %(state_dir))
            # try to make the directory
                sys.stderr.write("Couldn't create statedir %s" %(state_dir))
    elif scp.has_option("general", "state_dir"):
        new_state_dir = scp.get("general", "state_dir")
            mode = os.stat(new_state_dir)[stat.ST_MODE]
            if not stat.S_ISDIR(mode):
                "State directory (%s) is not a directory\n" %(state_dir))
                state_dir = new_state_dir
                os.mkdir(new_state_dir)
                state_dir = new_state_dir
                "Couldn't create state directory %s\n" %(new_state_dir))
            mode = os.stat(state_dir)[stat.ST_MODE]
            if not stat.S_ISDIR(mode):
                "State directory %s is not a directory\n" %(state_dir))
                "State directory %s could not be created\n" %(state_dir))
    if scp.has_option("general", "maildir_root"):
        maildir_root = scp.get("general", "maildir_root")
        mode = os.stat(maildir_root)[stat.ST_MODE]
        if not stat.S_ISDIR(mode):
            "Maildir Root %s is not a directory\n" \
            os.mkdir(maildir_root)
            sys.stderr.write("Couldn't create Maildir Root %s\n" \
    # every config section other than [general] is a feed to fetch
    feeds = scp.sections()
    feeds.remove("general")
    for section in feeds:
        # check if the directory exists
        maildir = scp.get(section, "maildir")
        # url-encode the name so it is safe as a single path component
        maildir = urllib.urlencode(((section, maildir),)).split("=")[1]
        maildir = os.path.join(maildir_root, maildir)
        exists = os.stat(maildir)
        if stat.S_ISDIR(exists[stat.ST_MODE]):
            # check if there's a new, cur and tmp directory
            mode = os.stat(os.path.join(maildir, "cur"))[stat.ST_MODE]
            os.mkdir(os.path.join(maildir, "cur"))
            if not stat.S_ISDIR(mode):
                sys.stderr.write("Broken maildir: %s\n" %(maildir))
            mode = os.stat(os.path.join(maildir, "tmp"))[stat.ST_MODE]
            os.mkdir(os.path.join(maildir, "tmp"))
            if not stat.S_ISDIR(mode):
                sys.stderr.write("Broken maildir: %s\n" %(maildir))
            mode = os.stat(os.path.join(maildir, "new"))[stat.ST_MODE]
            if not stat.S_ISDIR(mode):
                sys.stderr.write("Broken maildir: %s\n" %(maildir))
            os.mkdir(os.path.join(maildir, "new"))
            sys.stderr.write("Broken maildir: %s\n" %(maildir))
            sys.stderr.write("Couldn't create root maildir %s\n" \
            os.mkdir(os.path.join(maildir, "new"))
            os.mkdir(os.path.join(maildir, "cur"))
            os.mkdir(os.path.join(maildir, "tmp"))
            "Couldn't create required maildir directories for %s\n" \
        # right - we've got the directories, we've got the section, we know the
        parse_and_deliver(maildir, section, state_dir)