# rss2maildir.py - RSS feeds to Maildir, one email per item
# Copyright (C) 2007 Brett Parker <iDunno@sommitrealweird.co.uk>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

# standard library modules used by the rest of the script
import sys, os, stat, socket, string, random, textwrap, datetime
import urllib, httplib, dbm, cgi, md5

# feedparser does the actual RSS/Atom parsing
import feedparser

from email.MIMEMultipart import MIMEMultipart
from email.MIMEText import MIMEText
from optparse import OptionParser
from ConfigParser import SafeConfigParser
from base64 import b64encode
from HTMLParser import HTMLParser

class HTML2Text(HTMLParser):
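    # Minimal sketch of the class-level lookup tables the methods below
    # rely on (HTML2Text.entities plus the tag lists).  The exact contents
    # here are assumptions - enough for the parser to run, not the full
    # original tables.
    entities = {
        u'amp': u'&',
        u'lt': u'<',
        u'gt': u'>',
        u'quot': u'"',
        u'apos': u"'",
        u'nbsp': u' ',
        }

    blockleveltags = [
        u'h1', u'h2', u'h3', u'h4', u'h5', u'h6',
        u'p', u'div', u'pre', u'blockquote',
        u'ul', u'ol', u'dl', u'li', u'dt', u'dd',
        ]

    liststarttags = [u'ul', u'ol', u'dl']

    cancontainflow = [u'div', u'li', u'dd', u'blockquote']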

    def __init__(self, textwidth=70):
        self.text = u''
        self.curdata = u''
        self.textwidth = textwidth
        self.opentags = []
        self.indentlevel = 0
        self.ignorenodata = False
        self.listcount = []
        self.urls = []
        HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        tag_name = tag.lower()
        if tag_name in self.blockleveltags:
            # handle starting a new block - unless we're in a block element
            # that can contain other blocks, we'll assume that we want to close
            # the previous block
            if len(self.opentags) > 1 and self.opentags[-1] == u'li':
                self.handle_curdata()

            if tag_name == u'ol':
                self.handle_curdata()
                self.listcount.append(1)
                self.listlevel = len(self.listcount) - 1

            if tag_name in self.liststarttags:
                smallist = self.opentags[-3:-1]
                smallist.reverse()
                for prev_listtag in smallist:
                    if prev_listtag in [u'dl', u'ol']:
                        self.indentlevel = self.indentlevel + 4
                        break
                    elif prev_listtag == u'ul':
                        self.indentlevel = self.indentlevel + 3
                        break

            if len(self.opentags) > 0:
                self.handle_curdata()
                if tag_name not in self.cancontainflow:
                    self.opentags.pop()
            self.opentags.append(tag_name)
        else:
            if tag_name == "span":
                # spans are pure inline styling - nothing to do
                return

            listcount = 0
            if len(self.listcount) > 0:
                listcount = self.listcount[-1]

            if tag_name == u'dd' and len(self.opentags) > 1 \
                and self.opentags[-1] == u'dt':
                self.handle_curdata()
                self.opentags.pop()
            elif tag_name == u'dt' and len(self.opentags) > 1 \
                and self.opentags[-1] == u'dd':
                self.handle_curdata()
                self.opentags.pop()
            elif tag_name == u'a':
                for attr in attrs:
                    if attr[0].lower() == u'href':
                        self.urls.append(attr[1])
                self.curdata = self.curdata + u'`'
                self.opentags.append(tag_name)
            elif tag_name == u'img':
                self.handle_image(attrs)
            elif tag_name == u'br':
                self.handle_br()
            else:
                # we don't know the tag, so lets avoid handling it!
                return

    def handle_startendtag(self, tag, attrs):
        if tag.lower() == u'br':
            self.handle_br()
        elif tag.lower() == u'img':
            self.handle_image(attrs)

    def handle_br(self):
        # a line break flushes the current data and forces a newline
        self.handle_curdata()
        self.opentags.append(u'br')
        self.handle_curdata()
        self.opentags.pop()

    def handle_image(self, attrs):
        alt = u''
        url = u''
        for attr in attrs:
            if attr[0] == 'alt':
                alt = attr[1].decode('utf-8')
            elif attr[0] == 'src':
                url = attr[1].decode('utf-8')
        # render images as reference-style links (the placeholder wording
        # below is an assumption)
        if url and alt:
            self.curdata = self.curdata \
                + u'`' + alt + u'`__'
            self.urls.append(url)
        elif url:
            self.curdata = self.curdata \
                + u'`image`__'
            self.urls.append(url)
        elif alt:
            self.curdata = self.curdata \
                + alt

    def handle_curdata(self):
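        # flush self.curdata into self.text.  The innermost open tag decides
        # how the buffered data is rendered: headings are underlined,
        # paragraphs and list items are wrapped to textwidth, blockquotes
        # are prefixed with '> ', and so on.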
        if len(self.opentags) == 0:
            return

        tag_thats_done = self.opentags[-1]

        if len(self.curdata) == 0:
            return

        if tag_thats_done == u'br':
            if len(self.text) == 0 or self.text[-1] != '\n':
                self.text = self.text + '\n'
            self.ignorenodata = True
            return

        if len(self.curdata.strip()) == 0:
            return

        if tag_thats_done in self.blockleveltags:
            newlinerequired = self.text != u''
            if self.ignorenodata:
                newlinerequired = False
            self.ignorenodata = False
            if newlinerequired:
                if tag_thats_done in [u'dt', u'dd', u'li'] \
                    and len(self.text) > 1 \
                    and self.text[-1] != u'\n':
                    self.text = self.text + u'\n'
                elif len(self.text) > 2 \
                    and self.text[-1] != u'\n' \
                    and self.text[-2] != u'\n':
                    self.text = self.text + u'\n\n'

        if tag_thats_done in ["h1", "h2", "h3", "h4", "h5", "h6"]:
            underline = u''
            underlinechar = u'='
            headingtext = unicode( \
                self.curdata.encode("utf-8").strip(), "utf-8")
            seperator = u'\n' + u' '*self.indentlevel
            headingtext = seperator.join( \
                textwrap.wrap( \
                    headingtext, \
                    self.textwidth - self.indentlevel \
                    ) \
                )

            if tag_thats_done == u'h2':
                underlinechar = u'-'
            elif tag_thats_done != u'h1':
                underlinechar = u'~'

            if u'\n' in headingtext:
                underline = u' ' * self.indentlevel \
                    + underlinechar * (self.textwidth - self.indentlevel)
            else:
                underline = u' ' * self.indentlevel \
                    + underlinechar * len(headingtext)
            # keep the heading as unicode so concatenating it onto self.text
            # can't raise UnicodeDecodeError on non-ascii headings
            self.text = self.text \
                + headingtext + u'\n' \
                + underline
        elif tag_thats_done in [u'p', u'div']:
            paragraph = unicode( \
                self.curdata.strip().encode("utf-8"), "utf-8")
            seperator = u'\n' + u' ' * self.indentlevel
            self.text = self.text \
                + u' ' * self.indentlevel \
                + seperator.join( \
                    textwrap.wrap( \
                        paragraph, self.textwidth - self.indentlevel))
        elif tag_thats_done == "pre":
            self.text = self.text + unicode( \
                self.curdata.encode("utf-8"), "utf-8")
        elif tag_thats_done == u'blockquote':
            quote = unicode( \
                self.curdata.encode("utf-8").strip(), "utf-8")
            seperator = u'\n' + u' ' * self.indentlevel + u'> '
            if len(self.text) > 0 and self.text[-1] != u'\n':
                self.text = self.text + u'\n'
            self.text = self.text \
                + u' ' * self.indentlevel \
                + u'> ' \
                + seperator.join( \
                    textwrap.wrap( \
                        quote, \
                        self.textwidth - self.indentlevel - 2 \
                        ) \
                    )
        elif tag_thats_done == "li":
            item = unicode(self.curdata.encode("utf-8").strip(), "utf-8")
            if len(self.text) > 0 and self.text[-1] != u'\n':
                self.text = self.text + u'\n'
            # work out if we're in an ol rather than a ul
            latesttags = self.opentags[-4:]
            latesttags.reverse()
            isolist = False
            for thing in latesttags:
                if thing == u'ol':
                    isolist = True
                    break
                elif thing == u'ul':
                    break

            # bullets get a 3 space hanging indent, numbered items get 4
            listindent = 3
            listmarker = u' * '
            if isolist:
                listindent = 4
                listmarker = u' %2d. ' %(self.listcount[-1])
                self.listcount[-1] = self.listcount[-1] + 1

            seperator = u'\n' \
                + u' ' * self.indentlevel \
                + u' ' * listindent
            self.text = self.text \
                + u' ' * self.indentlevel \
                + listmarker \
                + seperator.join( \
                    textwrap.wrap( \
                        item, \
                        self.textwidth - self.indentlevel - listindent \
                        ) \
                    )
        elif tag_thats_done == u'dt':
            definition = unicode(self.curdata.encode("utf-8").strip(), "utf-8")
            if len(self.text) > 0 and self.text[-1] != u'\n':
                self.text = self.text + u'\n\n'
            elif len(self.text) > 1 and self.text[-2] != u'\n':
                self.text = self.text + u'\n'
            definition = u' ' * self.indentlevel + definition + "::"
            indentstring = u'\n' + u' ' * (self.indentlevel + 1)
            self.text = self.text \
                + indentstring.join( \
                    textwrap.wrap(definition, \
                        self.textwidth - self.indentlevel - 1))
        elif tag_thats_done == u'dd':
            definition = unicode(self.curdata.encode("utf-8").strip(), "utf-8")
            if len(definition) > 0:
                if len(self.text) > 0 and self.text[-1] != u'\n':
                    self.text = self.text + u'\n'
                indentstring = u'\n' + u' ' * (self.indentlevel + 4)
                self.text = self.text \
                    + u' ' * (self.indentlevel + 4) \
                    + indentstring.join( \
                        textwrap.wrap( \
                            definition, \
                            self.textwidth - self.indentlevel - 4 \
                            ) \
                        )
                self.curdata = u''
        elif tag_thats_done == u'a':
            self.curdata = self.curdata + u'`__'
        elif tag_thats_done in self.liststarttags:
            pass

        if tag_thats_done in self.blockleveltags:
            self.curdata = u''

        self.ignorenodata = False

    def handle_endtag(self, tag):
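        # close a tag: flush any pending data and unwind self.opentags.
        # Closing a list tag also undoes the indentation that was added
        # when the list was opened.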
        self.ignorenodata = False

        try:
            tagindex = self.opentags.index(tag)
        except ValueError:
            # closing a tag we never opened - nothing to do
            return

        if tag in [u'br', u'img']:
            return

        if tag in self.liststarttags:
            if tag in [u'ol', u'dl', u'ul']:
                self.handle_curdata()
                # find if there was a previous list level
                smalllist = self.opentags[:-1]
                smalllist.reverse()
                for prev_listtag in smalllist:
                    if prev_listtag in [u'ol', u'dl']:
                        self.indentlevel = self.indentlevel - 4
                        break
                    elif prev_listtag == u'ul':
                        self.indentlevel = self.indentlevel - 3
                        break

        if tag == u'ol':
            self.listcount = self.listcount[:-1]

        while tagindex < len(self.opentags) \
            and tag in self.opentags[tagindex+1:]:
            try:
                tagindex = self.opentags.index(tag, tagindex+1)
            except ValueError:
                # well, we don't want to do that then
                break

        if tagindex != len(self.opentags) - 1:
            # Assuming the data was for the last opened tag first
            self.handle_curdata()
            # Now kill the list to be a slice before this tag was opened
            self.opentags = self.opentags[:tagindex + 1]
        else:
            self.handle_curdata()
            if self.opentags[-1] == tag:
                self.opentags.pop()

    def handle_data(self, data):
        if len(self.opentags) == 0:
            self.opentags.append(u'p')
        self.curdata = self.curdata + unicode(data, "utf-8")

    def handle_entityref(self, name):
        if HTML2Text.entities.has_key(name.lower()):
            entity = HTML2Text.entities[name.lower()]
        elif name[0] == "#":
            entity = unichr(int(name[1:]))
        else:
            entity = "&" + name + ";"
        # entity may already be unicode (e.g. from unichr); only decode
        # byte strings
        if not isinstance(entity, unicode):
            entity = unicode(entity, "utf-8")
        self.curdata = self.curdata + entity

    def gettext(self):
        self.handle_curdata()
        if len(self.text) == 0 or self.text[-1] != u'\n':
            self.text = self.text + u'\n'
        if len(self.text) > 0:
            while len(self.text) > 1 and self.text[-1] == u'\n':
                self.text = self.text[:-1]
            self.text = self.text + u'\n'
        if len(self.urls) > 0:
            self.text = self.text + u'\n__ ' + u'\n__ '.join(self.urls) + u'\n'
            self.urls = []
        return self.text
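
# Typical use of HTML2Text (this mirrors what parse_and_deliver() does
# further down):
#
#     parser = HTML2Text(textwidth=70)
#     parser.feed(html_content.encode("utf-8"))
#     plain_text = parser.gettext()
#
# feed() expects a UTF-8 byte string; gettext() returns unicode text with
# reST-style link references appended at the end.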

def open_url(method, url):
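    # issue an HTTP request (method is "HEAD" or "GET") for url, following
    # redirects but giving up after three attempts.  Returns the httplib
    # response on a 200, or None if the feed couldn't be fetched.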
    redirectcount = 0
    while redirectcount < 3:
        (type, rest) = urllib.splittype(url)
        (host, path) = urllib.splithost(rest)
        (host, port) = urllib.splitport(host)
        if port == None:
            port = 80

        conn = httplib.HTTPConnection("%s:%s" %(host, port))
        conn.request(method, path)
        response = conn.getresponse()
        if response.status in [301, 302, 303, 307]:
            headers = response.getheaders()
            for header in headers:
                if header[0] == "location":
                    url = header[1]
        elif response.status == 200:
            return response

        redirectcount = redirectcount + 1
    return None

def parse_and_deliver(maildir, url, statedir):
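    # fetch a single feed and deliver any unseen items to maildir.  statedir
    # holds two dbm databases: "feeds" caches the interesting HTTP headers
    # (etag, last-modified, content-length, content-md5) so unchanged feeds
    # can be skipped, and "seen" records per-item metadata keyed on
    # "<feed url>|<guid or link>" so each item is only mailed once.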
    feedhandle = None
    headers = None

    # first check if we know about this feed already
    feeddb = dbm.open(os.path.join(statedir, "feeds"), "c")
    if feeddb.has_key(url):
        # we've fetched this feed before - do a cheap HEAD request and
        # compare the cached headers to decide whether anything has changed
        data = feeddb[url]
        data = cgi.parse_qs(data)
        response = open_url("HEAD", url)
        if response == None:
            sys.stderr.write("Failed to fetch feed: %s\n" %(url))
            return
        headers = response.getheaders()
        ischanged = False
        for header in headers:
            if header[0] == "content-length":
                if header[1] != data["content-length"][0]:
                    ischanged = True
            elif header[0] == "etag":
                if header[1] != data["etag"][0]:
                    ischanged = True
            elif header[0] == "last-modified":
                if header[1] != data["last-modified"][0]:
                    ischanged = True
            elif header[0] == "content-md5":
                if header[1] != data["content-md5"][0]:
                    ischanged = True
        if ischanged:
            response = open_url("GET", url)
            if response != None:
                headers = response.getheaders()
                feedhandle = response
            else:
                sys.stderr.write("Failed to fetch feed: %s\n" %(url))
                return
        else:
            return # don't need to do anything, nothings changed.
    else:
        response = open_url("GET", url)
        if response != None:
            headers = response.getheaders()
            feedhandle = response
        else:
            sys.stderr.write("Failed to fetch feed: %s\n" %(url))
            return

    fp = feedparser.parse(feedhandle)
    # the "seen" database records every item we've already delivered
    db = dbm.open(os.path.join(statedir, "seen"), "c")
    for item in fp["items"]:
        # have we seen it before?
        # need to work out what the content is first...
        if item.has_key("content"):
            content = item["content"][0]["value"]
        else:
            content = item["summary"]

        md5sum = md5.md5(content.encode("utf-8")).hexdigest()

        prevmessageid = None

        # check if there's a guid too - if that exists and we match the md5,
        # we've already delivered this item, so skip it
        if item.has_key("guid"):
            if db.has_key(url + "|" + item["guid"]):
                data = db[url + "|" + item["guid"]]
                data = cgi.parse_qs(data)
                if data["contentmd5"][0] == md5sum:
                    continue

        if db.has_key(url + "|" + item["link"]):
            data = db[url + "|" + item["link"]]
            data = cgi.parse_qs(data)
            if data.has_key("message-id"):
                prevmessageid = data["message-id"][0]
            if data["contentmd5"][0] == md5sum:
                continue

        # not every feed provides an author; fall back to the feed url
        # (the fallback value is an assumption)
        if item.has_key("author"):
            author = item["author"]
        else:
            author = url

        # create a basic email message
        msg = MIMEMultipart("alternative")
        messageid = "<" \
            + datetime.datetime.now().strftime("%Y%m%d%H%M") \
            + "." \
            + "".join( \
                [random.choice( \
                    string.ascii_letters + string.digits \
                    ) for a in range(0,6) \
                ]) + "@" + socket.gethostname() + ">"
        msg.add_header("Message-ID", messageid)
        msg.set_unixfrom("\"%s\" <rss2maildir@localhost>" %(url))
        msg.add_header("From", "\"%s\" <rss2maildir@localhost>" %(author))
        msg.add_header("To", "\"%s\" <rss2maildir@localhost>" %(url))
        if prevmessageid:
            msg.add_header("References", prevmessageid)
        createddate = datetime.datetime.now() \
            .strftime("%a, %e %b %Y %T -0000")
        try:
            createddate = datetime.datetime(*item["updated_parsed"][0:6]) \
                .strftime("%a, %e %b %Y %T -0000")
        except:
            pass
        msg.add_header("Date", createddate)
        msg.add_header("Subject", item["title"])
        msg.set_default_type("text/plain")

        htmlcontent = content.encode("utf-8")
        htmlcontent = "%s\n\n<p>Item URL: <a href='%s'>%s</a></p>" %( \
            htmlcontent, \
            item["link"], \
            item["link"] )
        # htmlcontent is already a utf-8 byte string at this point
        htmlpart = MIMEText(htmlcontent, "html", "utf-8")
        textparser = HTML2Text()
        textparser.feed(content.encode("utf-8"))
        textcontent = textparser.gettext()
        textcontent = "%s\n\nItem URL: %s" %( \
            textcontent, \
            item["link"] )
        textpart = MIMEText(textcontent.encode("utf-8"), "plain", "utf-8")
        msg.attach(textpart)
        msg.attach(htmlpart)

        # start by working out the filename we should be writing to, we do
        # this following the normal maildir style rules
        fname = str(os.getpid()) \
            + "." + socket.gethostname() \
            + "." + "".join( \
                [random.choice( \
                    string.ascii_letters + string.digits \
                    ) for a in range(0,10) \
                ]) \
            + "." \
            + datetime.datetime.now().strftime('%s')
        fn = os.path.join(maildir, "tmp", fname)
        fh = open(fn, "w")
        fh.write(msg.as_string())
        fh.close()
        # now move it in to the new directory
        newfn = os.path.join(maildir, "new", fname)
        os.link(fn, newfn)
        os.unlink(fn)

        # now add to the database about the item
        if prevmessageid:
            # thread follow-ups onto the earlier message for this item
            messageid = prevmessageid + " " + messageid
        if item.has_key("guid") and item["guid"] != item["link"]:
            data = urllib.urlencode(( \
                ("message-id", messageid), \
                ("created", createddate), \
                ("contentmd5", md5sum) \
                ))
            db[url + "|" + item["guid"]] = data
            try:
                data = db[url + "|" + item["link"]]
                data = cgi.parse_qs(data)
                newdata = urllib.urlencode(( \
                    ("message-id", messageid), \
                    ("created", data["created"][0]), \
                    ("contentmd5", data["contentmd5"][0]) \
                    ))
                db[url + "|" + item["link"]] = newdata
            except:
                db[url + "|" + item["link"]] = data
        else:
            data = urllib.urlencode(( \
                ("message-id", messageid), \
                ("created", createddate), \
                ("contentmd5", md5sum) \
                ))
            db[url + "|" + item["link"]] = data

    # remember the interesting headers so that next time round we can skip
    # the feed entirely if nothing has changed
    if headers:
        data = []
        for header in headers:
            if header[0] in \
                ["content-md5", "etag", "last-modified", "content-length"]:
                data.append((header[0], header[1]))
        if len(data) > 0:
            data = urllib.urlencode(data)
            feeddb[url] = data

    db.close()
    feeddb.close()
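
# An example configuration file (read from ~/.rss2maildir.conf or
# /etc/rss2maildir.conf unless --conf is given).  The option names match
# what the code below reads; the values are only illustrative:
#
#     [general]
#     state_dir = /home/user/.rss2maildir/state
#     maildir_root = /home/user/RSSMaildir
#
#     [http://example.com/feed.rss]
#     maildir = example-feed
#
# Every section other than [general] is treated as a feed URL and is
# delivered into maildir_root/<maildir>.  Typical invocation:
#
#     python rss2maildir.py --conf ~/.rss2maildir.conf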

if __name__ == "__main__":
    # This only gets executed if we really called the program
    # first off, parse the command line arguments

    oparser = OptionParser()
    oparser.add_option(
        "-c", "--conf", dest="conf",
        help="location of config file"
        )
    oparser.add_option(
        "-s", "--statedir", dest="statedir",
        help="location of directory to store state in"
        )

    (options, args) = oparser.parse_args()

    # check for the configfile

    configfile = None

    if options.conf != None:
        # does the file exist?
        try:
            os.stat(options.conf)
            configfile = options.conf
        except:
            # should exit here as the specified file doesn't exist
            sys.stderr.write( \
                "Config file %s does not exist. Exiting.\n" %(options.conf,))
            sys.exit(2)
    else:
        # check through the default locations
        try:
            os.stat("%s/.rss2maildir.conf" %(os.environ["HOME"],))
            configfile = "%s/.rss2maildir.conf" %(os.environ["HOME"],)
        except:
            try:
                os.stat("/etc/rss2maildir.conf")
                configfile = "/etc/rss2maildir.conf"
            except:
                sys.stderr.write("No config file found. Exiting.\n")
                sys.exit(2)

    # Right - if we've got this far, we've got a config file, now for the hard
    # bit - actually reading it and doing something useful with it

    scp = SafeConfigParser()
    scp.read(configfile)

    maildir_root = "RSSMaildir"
    state_dir = "state"  # default when neither -s nor the config supply one (assumed)

    if options.statedir != None:
        state_dir = options.statedir
        try:
            mode = os.stat(state_dir)[stat.ST_MODE]
            if not stat.S_ISDIR(mode):
                sys.stderr.write( \
                    "State directory (%s) is not a directory\n" %(state_dir))
                sys.exit(1)
        except OSError:
            # try to make the directory
            try:
                os.mkdir(state_dir)
            except OSError:
                sys.stderr.write("Couldn't create statedir %s\n" %(state_dir))
                sys.exit(1)
    elif scp.has_option("general", "state_dir"):
        new_state_dir = scp.get("general", "state_dir")
        try:
            mode = os.stat(new_state_dir)[stat.ST_MODE]
            if not stat.S_ISDIR(mode):
                sys.stderr.write( \
                    "State directory (%s) is not a directory\n" %(new_state_dir))
                sys.exit(1)
            state_dir = new_state_dir
        except OSError:
            # try to make the directory
            try:
                os.mkdir(new_state_dir)
                state_dir = new_state_dir
            except OSError:
                sys.stderr.write( \
                    "Couldn't create state directory %s\n" %(new_state_dir))
                sys.exit(1)
    else:
        try:
            mode = os.stat(state_dir)[stat.ST_MODE]
            if not stat.S_ISDIR(mode):
                sys.stderr.write( \
                    "State directory %s is not a directory\n" %(state_dir))
                sys.exit(1)
        except OSError:
            try:
                os.mkdir(state_dir)
            except OSError:
                sys.stderr.write( \
                    "State directory %s could not be created\n" %(state_dir))
                sys.exit(1)

    if scp.has_option("general", "maildir_root"):
        maildir_root = scp.get("general", "maildir_root")

    try:
        mode = os.stat(maildir_root)[stat.ST_MODE]
        if not stat.S_ISDIR(mode):
            sys.stderr.write( \
                "Maildir Root %s is not a directory\n" \
                %(maildir_root))
            sys.exit(1)
    except OSError:
        try:
            os.mkdir(maildir_root)
        except OSError:
            sys.stderr.write("Couldn't create Maildir Root %s\n" \
                %(maildir_root))
            sys.exit(1)

    feeds = scp.sections()
    if "general" in feeds:
        feeds.remove("general")

    for section in feeds:
        # check if the directory exists
        try:
            maildir = scp.get(section, "maildir")
        except:
            # no maildir option for this feed - fall back to using the
            # section (feed url) itself; this fallback is an assumption
            maildir = section

        # url-encode the name so it is safe as a single directory component
        maildir = urllib.urlencode(((section, maildir),)).split("=")[1]
        maildir = os.path.join(maildir_root, maildir)

        try:
            exists = os.stat(maildir)
            if stat.S_ISDIR(exists[stat.ST_MODE]):
                # check if there's a new, cur and tmp directory
                try:
                    mode = os.stat(os.path.join(maildir, "cur"))[stat.ST_MODE]
                    if not stat.S_ISDIR(mode):
                        sys.stderr.write("Broken maildir: %s\n" %(maildir))
                except OSError:
                    os.mkdir(os.path.join(maildir, "cur"))
                try:
                    mode = os.stat(os.path.join(maildir, "tmp"))[stat.ST_MODE]
                    if not stat.S_ISDIR(mode):
                        sys.stderr.write("Broken maildir: %s\n" %(maildir))
                except OSError:
                    os.mkdir(os.path.join(maildir, "tmp"))
                try:
                    mode = os.stat(os.path.join(maildir, "new"))[stat.ST_MODE]
                    if not stat.S_ISDIR(mode):
                        sys.stderr.write("Broken maildir: %s\n" %(maildir))
                except OSError:
                    os.mkdir(os.path.join(maildir, "new"))
            else:
                sys.stderr.write("Broken maildir: %s\n" %(maildir))
        except OSError:
            try:
                os.mkdir(maildir)
            except OSError:
                sys.stderr.write("Couldn't create root maildir %s\n" \
                    %(maildir))
                sys.exit(1)
            try:
                os.mkdir(os.path.join(maildir, "new"))
                os.mkdir(os.path.join(maildir, "cur"))
                os.mkdir(os.path.join(maildir, "tmp"))
            except OSError:
                sys.stderr.write( \
                    "Couldn't create required maildir directories for %s\n" \
                    %(section,))
                sys.exit(1)

        # right - we've got the directories, we've got the section, we know the
        # feed url - go and fetch it and deliver any new items

        parse_and_deliver(maildir, section, state_dir)