4 # rss2maildir.py - RSS feeds to Maildir 1 email per item
5 # Copyright (C) 2007 Brett Parker <iDunno@sommitrealweird.co.uk>
7 # This program is free software: you can redistribute it and/or modify
8 # it under the terms of the GNU General Public License as published by
9 # the Free Software Foundation, either version 3 of the License, or
10 # (at your option) any later version.
12 # This program is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 # GNU General Public License for more details.
17 # You should have received a copy of the GNU General Public License
18 # along with this program. If not, see <http://www.gnu.org/licenses/>.
28 from email.MIMEMultipart import MIMEMultipart
29 from email.MIMEText import MIMEText
38 from optparse import OptionParser
39 from ConfigParser import SafeConfigParser
41 from base64 import b64encode
47 from HTMLParser import HTMLParser
49 class HTML2Text(HTMLParser):
# HTMLParser subclass whose handlers collect pending character data in
# self.curdata and append formatted output to self.text.
90 def __init__(self,textwidth=70):
# textwidth (default 70) is the width from which the current indent is
# subtracted when handle_curdata computes wrap widths.
93 self.textwidth = textwidth
# ignorenodata is set by handle_curdata after a u'br' tag and is reset
# in handle_endtag.
96 self.ignorenodata = False
99 HTMLParser.__init__(self)
101 def handle_starttag(self, tag, attrs):
# Start-tag handler.  The lower-cased tag name is tested against
# self.blockleveltags; pending text is flushed through handle_curdata()
# and the tag name is recorded on the self.opentags stack.
102 tag_name = tag.lower()
103 if tag_name in self.blockleveltags:
104 # handle starting a new block - unless we're in a block element
105 # that can contain other blocks, we'll assume that we want to close
107 if len(self.opentags) > 1 and self.opentags[-1] == u'li':
108 self.handle_curdata()
# Each u'ol' pushes a new counter starting at 1 onto self.listcount.
110 if tag_name == u'ol':
111 self.handle_curdata()
112 self.listcount.append(1)
113 self.listlevel = len(self.listcount) - 1
# For list-start tags the indent grows according to the tags found in
# self.opentags[-3:-1]: +4 for a dl/ol, +3 for a ul.
115 if tag_name in self.liststarttags:
116 smallist = self.opentags[-3:-1]
118 for prev_listtag in smallist:
119 if prev_listtag in [u'dl', u'ol']:
120 self.indentlevel = self.indentlevel + 4
122 elif prev_listtag == u'ul':
123 self.indentlevel = self.indentlevel + 3
126 if len(self.opentags) > 0:
127 self.handle_curdata()
128 if tag_name not in self.cancontainflow:
130 self.opentags.append(tag_name)
132 if tag_name == "span":
136 listcount = self.listcount[-1]
# A dd directly after an open dt (or a dt after an open dd) flushes the
# pending data first.
140 if tag_name == u'dd' and len(self.opentags) > 1 \
141 and self.opentags[-1] == u'dt':
142 self.handle_curdata()
144 elif tag_name == u'dt' and len(self.opentags) > 1 \
145 and self.opentags[-1] == u'dd':
146 self.handle_curdata()
# Links: the href value is collected in self.urls and the link text is
# opened with a backtick in self.curdata.
148 elif tag_name == u'a':
150 if attr[0].lower() == u'href':
151 self.urls.append(attr[1])
152 self.curdata = self.curdata + u'`'
153 self.opentags.append(tag_name)
155 elif tag_name == u'img':
156 self.handle_image(attrs)
158 elif tag_name == u'br':
162 # we don't know the tag, so lets avoid handling it!
165 def handle_startendtag(self, tag, attrs):
# Handler for self-closing tags; the visible branches test for u'br' and
# u'img' (the latter is delegated to handle_image).
166 if tag.lower() == u'br':
168 elif tag.lower() == u'img':
169 self.handle_image(attrs)
# NOTE(review): the three statements below push u'br' onto self.opentags
# between two handle_curdata() calls.  They presumably belong to a separate
# line-break helper whose def line is not visible here -- confirm.
173 self.handle_curdata()
174 self.opentags.append(u'br')
175 self.handle_curdata()
178 def handle_image(self, attrs):
# Appends text describing an image to self.curdata.  The visible code tests
# for the 'src' attribute and UTF-8 encodes a url and an alt value before
# they are used in the appended text.
184 elif attr[0] == 'src':
187 self.curdata = self.curdata \
190 url.encode('utf-8'), \
193 self.curdata = self.curdata \
196 alt.encode('utf-8'), \
199 self.curdata = self.curdata \
202 def handle_curdata(self):
# Flushes self.curdata into self.text, formatted according to the tag on top
# of self.opentags (headings, p/div, pre, blockquote, li, dt, dd, a and the
# list-start tags each have their own branch below).
204 if len(self.opentags) == 0:
207 tag_thats_done = self.opentags[-1]
209 if len(self.curdata) == 0:
# A u'br' adds a newline unless self.text already ends with one, and sets
# ignorenodata.
212 if tag_thats_done == u'br':
213 if len(self.text) == 0 or self.text[-1] != '\n':
214 self.text = self.text + '\n'
215 self.ignorenodata = True
218 if len(self.curdata.strip()) == 0:
# Block-level tags: decide whether a blank line is needed before the new
# block; ignorenodata suppresses it and is then cleared.
221 if tag_thats_done in self.blockleveltags:
222 newlinerequired = self.text != u''
223 if self.ignorenodata:
224 newlinerequired = False
225 self.ignorenodata = False
227 and len(self.text) > 2 \
228 and self.text[-1] != u'\n' \
229 and self.text[-2] != u'\n':
230 self.text = self.text + u'\n\n'
# Headings: the stripped text is wrapped and followed by an underline.
232 if tag_thats_done in ["h1", "h2", "h3", "h4", "h5", "h6"]:
235 headingtext = unicode( \
236 self.curdata.encode("utf-8").strip(), "utf-8")
237 seperator = u'\n' + u' '*self.indentlevel
238 headingtext = seperator.join( \
241 self.textwidth - self.indentlevel \
245 if tag_thats_done == u'h2':
247 elif tag_thats_done != u'h1':
# A heading that wrapped onto several lines is underlined across the whole
# available width; otherwise the underline matches the heading length.
250 if u'\n' in headingtext:
251 underline = u' ' * self.indentlevel \
252 + underlinechar * (self.textwidth - self.indentlevel)
254 underline = u' ' * self.indentlevel \
255 + underlinechar * len(headingtext)
256 self.text = self.text \
257 + headingtext.encode("utf-8") + u'\n' \
259 elif tag_thats_done in [u'p', u'div']:
260 paragraph = unicode( \
261 self.curdata.strip().encode("utf-8"), "utf-8")
262 seperator = u'\n' + u' ' * self.indentlevel
263 self.text = self.text \
264 + u' ' * self.indentlevel \
267 paragraph, self.textwidth - self.indentlevel))
# pre content is appended as-is: no strip() and no wrapping in this branch.
268 elif tag_thats_done == "pre":
269 self.text = self.text + unicode( \
270 self.curdata.encode("utf-8"), "utf-8")
# Block quotes: the line separator carries the indent followed by u'> '.
271 elif tag_thats_done == u'blockquote':
273 self.curdata.encode("utf-8").strip(), "utf-8")
274 seperator = u'\n' + u' ' * self.indentlevel + u'> '
275 if len(self.text) > 0 and self.text[-1] != u'\n':
276 self.text = self.text + u'\n'
277 self.text = self.text \
282 self.textwidth - self.indentlevel - 2 \
286 elif tag_thats_done == "li":
287 item = unicode(self.curdata.encode("utf-8").strip(), "utf-8")
288 if len(self.text) > 0 and self.text[-1] != u'\n':
289 self.text = self.text + u'\n'
290 # work out if we're in an ol rather than a ul
291 latesttags = self.opentags[-4:]
294 for thing in latesttags:
# Numbered marker: uses the innermost list counter, then increments it.
308 listmarker = u' %2d. ' %(self.listcount[-1])
309 self.listcount[-1] = self.listcount[-1] + 1
312 + u' ' * self.indentlevel \
314 self.text = self.text \
315 + u' ' * self.indentlevel \
320 self.textwidth - self.indentlevel - listindent \
324 elif tag_thats_done == u'dt':
325 definition = unicode(self.curdata.encode("utf-8").strip(), "utf-8")
326 if len(self.text) > 0 and self.text[-1] != u'\n':
327 self.text = self.text + u'\n\n'
328 elif len(self.text) > 1 and self.text[-2] != u'\n':
329 self.text = self.text + u'\n'
# The term is suffixed with "::" and wrapped one column narrower than the
# indented width.
330 definition = u' ' * self.indentlevel + definition + "::"
331 indentstring = u'\n' + u' ' * (self.indentlevel + 1)
332 self.text = self.text \
334 textwrap.wrap(definition, \
335 self.textwidth - self.indentlevel - 1))
# Definition descriptions are only emitted when non-empty, using an indent of
# self.indentlevel + 4.
337 elif tag_thats_done == u'dd':
338 definition = unicode(self.curdata.encode("utf-8").strip(), "utf-8")
339 if len(definition) > 0:
340 if len(self.text) > 0 and self.text[-1] != u'\n':
341 self.text = self.text + u'\n'
342 indentstring = u'\n' + u' ' * (self.indentlevel + 4)
343 self.text = self.text \
344 + u' ' * (self.indentlevel + 4) \
345 + indentstring.join( \
348 self.textwidth - self.indentlevel - 4 \
352 elif tag_thats_done == u'a':
# Closes the link markup that handle_starttag opened with a backtick.
353 self.curdata = self.curdata + u'`__'
355 elif tag_thats_done in self.liststarttags:
358 if tag_thats_done in self.blockleveltags:
361 self.ignorenodata = False
363 def handle_endtag(self, tag):
# End-tag handler: clears ignorenodata, flushes pending data with
# handle_curdata() and cuts self.opentags back to the matching open tag.
364 self.ignorenodata = False
369 tagindex = self.opentags.index(tag)
374 if tag in [u'br', u'img']:
# Closing a list reverses the indent applied when it was opened (4 for an
# ol/dl found in the remaining open tags, 3 for a ul) and drops the innermost
# entry of self.listcount.
377 if tag in self.liststarttags:
378 if tag in [u'ol', u'dl', u'ul']:
379 self.handle_curdata()
380 # find if there was a previous list level
381 smalllist = self.opentags[:-1]
383 for prev_listtag in smalllist:
384 if prev_listtag in [u'ol', u'dl']:
385 self.indentlevel = self.indentlevel - 4
387 elif prev_listtag == u'ul':
388 self.indentlevel = self.indentlevel - 3
392 self.listcount = self.listcount[:-1]
# Advance tagindex to a later occurrence of the same tag while one exists
# further up the stack.
394 while tagindex < len(self.opentags) \
395 and tag in self.opentags[tagindex+1:]:
397 tagindex = self.opentags.index(tag, tagindex+1)
399 # well, we don't want to do that then
401 if tagindex != len(self.opentags) - 1:
402 # Assuming the data was for the last opened tag first
403 self.handle_curdata()
404 # Now kill the list to be a slice before this tag was opened
405 self.opentags = self.opentags[:tagindex + 1]
407 self.handle_curdata()
408 if self.opentags[-1] == tag:
def handle_data(self, data):
    """Collect character data for the currently open tag.

    When no tag is open, a u'p' entry is pushed onto ``self.opentags``
    first, so the text is flushed as a paragraph.  ``data`` is decoded
    as UTF-8 and appended to ``self.curdata``.
    """
    if not self.opentags:
        self.opentags.append(u'p')
    decoded = unicode(data, "utf-8")
    self.curdata = self.curdata + decoded
416 def handle_entityref(self, name):
# Appends the text for an entity reference to self.curdata.  Names found
# (lower-cased) in HTML2Text.entities use the mapped value; the other visible
# assignments are unichr() of the number in name[1:] and the literal
# "&" + name + ";".
418 if HTML2Text.entities.has_key(name.lower()):
419 entity = HTML2Text.entities[name.lower()]
421 entity = unichr(int(name[1:]))
423 entity = "&" + name + ";"
425 self.curdata = self.curdata + unicode(entity, "utf-8")
# NOTE(review): the def line for this body is not visible here; it is
# presumably gettext(), which parse_and_deliver calls on the parser -- confirm.
# Flushes pending data, makes self.text end with a newline, reduces a run of
# trailing newlines to one, and appends every collected URL on its own line
# prefixed with "__ ".
428 self.handle_curdata()
429 if len(self.text) == 0 or self.text[-1] != u'\n':
430 self.text = self.text + u'\n'
432 if len(self.text) > 0:
433 while len(self.text) > 1 and self.text[-1] == u'\n':
434 self.text = self.text[:-1]
435 self.text = self.text + u'\n'
436 if len(self.urls) > 0:
437 self.text = self.text + u'\n__ ' + u'\n__ '.join(self.urls) + u'\n'
441 def open_url(method, url):
# Sends an HTTP request for url using the given method (called with "HEAD"
# and "GET" by parse_and_deliver).  The loop runs while redirectcount < 3 and
# increments redirectcount on each pass.
443 while redirectcount < 3:
# Split the URL into scheme, host, port and path.
444 (type, rest) = urllib.splittype(url)
445 (host, path) = urllib.splithost(rest)
446 (host, port) = urllib.splitport(host)
450 conn = httplib.HTTPConnection("%s:%s" %(host, port))
451 conn.request(method, path)
452 response = conn.getresponse()
# Redirect statuses: scan the response headers for "location".
453 if response.status in [301, 302, 303, 307]:
454 headers = response.getheaders()
455 for header in headers:
456 if header[0] == "location":
458 elif response.status == 200:
462 redirectcount = redirectcount + 1
465 def parse_and_deliver(maildir, url, statedir):
# Fetches the feed at url and parses it with feedparser.  For each item it
# consults the "seen" database under statedir, builds a
# multipart/alternative message with an HTML part and a plain-text part
# (rendered by HTML2Text), writes it to a file under maildir/tmp and computes
# the matching path under maildir/new, then records the item in the database.
468 # first check if we know about this feed already
469 feeddb = dbm.open(os.path.join(statedir, "feeds"), "c")
470 if feeddb.has_key(url):
472 data = cgi.parse_qs(data)
# Known feed: compare the HEAD response headers with the stored values.
473 response = open_url("HEAD", url)
476 headers = response.getheaders()
479 for header in headers:
480 if header[0] == "content-length":
481 if header[1] != data["content-length"][0]:
483 elif header[0] == "etag":
484 if header[1] != data["etag"][0]:
486 elif header[0] == "last-modified":
487 if header[1] != data["last-modified"][0]:
489 elif header[0] == "content-md5":
490 if header[1] != data["content-md5"][0]:
495 response = open_url("GET", url)
497 headers = response.getheaders()
498 feedhandle = response
500 sys.stderr.write("Failed to fetch feed: %s\n" %(url))
503 return # don't need to do anything, nothings changed.
505 response = open_url("GET", url)
507 headers = response.getheaders()
508 feedhandle = response
510 sys.stderr.write("Failed to fetch feed: %s\n" %(url))
513 fp = feedparser.parse(feedhandle)
514 db = dbm.open(os.path.join(statedir, "seen"), "c")
515 for item in fp["items"]:
516 # have we seen it before?
517 # need to work out what the content is first...
519 if item.has_key("content"):
520 content = item["content"][0]["value"]
522 content = item["summary"]
# The MD5 of the UTF-8 encoded content is compared with the stored
# "contentmd5" value below and saved with the item's record.
524 md5sum = md5.md5(content.encode("utf-8")).hexdigest()
528 # check if there's a guid too - if that exists and we match the md5,
530 if item.has_key("guid"):
531 if db.has_key(url + "|" + item["guid"]):
532 data = db[url + "|" + item["guid"]]
533 data = cgi.parse_qs(data)
534 if data["contentmd5"][0] == md5sum:
537 if db.has_key(url + "|" + item["link"]):
538 data = db[url + "|" + item["link"]]
539 data = cgi.parse_qs(data)
# A stored message-id is kept so the new message can reference it.
540 if data.has_key("message-id"):
541 prevmessageid = data["message-id"][0]
542 if data["contentmd5"][0] == md5sum:
546 author = item["author"]
550 # create a basic email message
551 msg = MIMEMultipart("alternative")
553 + datetime.datetime.now().strftime("%Y%m%d%H%M") \
557 string.ascii_letters + string.digits \
558 ) for a in range(0,6) \
559 ]) + "@" + socket.gethostname() + ">"
560 msg.add_header("Message-ID", messageid)
561 msg.set_unixfrom("\"%s\" <rss2maildir@localhost>" %(url))
562 msg.add_header("From", "\"%s\" <rss2maildir@localhost>" %(author))
563 msg.add_header("To", "\"%s\" <rss2maildir@localhost>" %(url))
565 msg.add_header("References", prevmessageid)
566 createddate = datetime.datetime.now() \
567 .strftime("%a, %e %b %Y %T -0000")
569 createddate = datetime.datetime(*item["updated_parsed"][0:6]) \
570 .strftime("%a, %e %b %Y %T -0000")
573 msg.add_header("Date", createddate)
574 msg.add_header("Subject", item["title"])
575 msg.set_default_type("text/plain")
# HTML part: the item content followed by an "Item URL" paragraph.
577 htmlcontent = content.encode("utf-8")
578 htmlcontent = "%s\n\n<p>Item URL: <a href='%s'>%s</a></p>" %( \
582 htmlpart = MIMEText(htmlcontent.encode("utf-8"), "html", "utf-8")
# Plain-text part: the same content rendered through HTML2Text.
583 textparser = HTML2Text()
584 textparser.feed(content.encode("utf-8"))
585 textcontent = textparser.gettext()
586 textcontent = "%s\n\nItem URL: %s" %( \
589 textpart = MIMEText(textcontent.encode("utf-8"), "plain", "utf-8")
593 # start by working out the filename we should be writting to, we do
594 # this following the normal maildir style rules
595 fname = str(os.getpid()) \
596 + "." + socket.gethostname() \
599 string.ascii_letters + string.digits \
600 ) for a in range(0,10) \
602 + datetime.datetime.now().strftime('%s')
603 fn = os.path.join(maildir, "tmp", fname)
605 fh.write(msg.as_string())
607 # now move it in to the new directory
608 newfn = os.path.join(maildir, "new", fname)
612 # now add to the database about the item
614 messageid = prevmessageid + " " + messageid
# Records are keyed by url + "|" + guid (when the guid differs from the link)
# and by url + "|" + link; each value is a urlencoded message-id / created /
# contentmd5 triple.
615 if item.has_key("guid") and item["guid"] != item["link"]:
616 data = urllib.urlencode(( \
617 ("message-id", messageid), \
618 ("created", createddate), \
619 ("contentmd5", md5sum) \
621 db[url + "|" + item["guid"]] = data
623 data = db[url + "|" + item["link"]]
624 data = cgi.parse_qs(data)
625 newdata = urllib.urlencode(( \
626 ("message-id", messageid), \
627 ("created", data["created"][0]), \
628 ("contentmd5", data["contentmd5"][0]) \
630 db[url + "|" + item["link"]] = newdata
632 db[url + "|" + item["link"]] = data
634 data = urllib.urlencode(( \
635 ("message-id", messageid), \
636 ("created", createddate), \
637 ("contentmd5", md5sum) \
639 db[url + "|" + item["link"]] = data
# Collect the content-md5 / etag / last-modified / content-length response
# headers and urlencode them (the same headers the HEAD comparison reads).
643 for header in headers:
645 ["content-md5", "etag", "last-modified", "content-length"]:
646 data.append((header[0], header[1]))
648 data = urllib.urlencode(data)
654 if __name__ == "__main__":
# Script entry point: resolves the config file, the state directory and the
# maildir root, then calls parse_and_deliver() once per feed section.
655 # This only gets executed if we really called the program
656 # first off, parse the command line arguments
658 oparser = OptionParser()
660 "-c", "--conf", dest="conf",
661 help="location of config file"
664 "-s", "--statedir", dest="statedir",
665 help="location of directory to store state in"
668 (options, args) = oparser.parse_args()
670 # check for the configfile
# A path given with --conf is checked with os.stat; otherwise
# $HOME/.rss2maildir.conf and then /etc/rss2maildir.conf are probed.
674 if options.conf != None:
675 # does the file exist?
677 os.stat(options.conf)
678 configfile = options.conf
680 # should exit here as the specified file doesn't exist
682 "Config file %s does not exist. Exiting.\n" %(options.conf,))
685 # check through the default locations
687 os.stat("%s/.rss2maildir.conf" %(os.environ["HOME"],))
688 configfile = "%s/.rss2maildir.conf" %(os.environ["HOME"],)
691 os.stat("/etc/rss2maildir.conf")
692 configfile = "/etc/rss2maildir.conf"
694 sys.stderr.write("No config file found. Exiting.\n")
697 # Right - if we've got this far, we've got a config file, now for the hard
700 scp = SafeConfigParser()
703 maildir_root = "RSSMaildir"
# State directory: the --statedir option is checked first, then the
# [general] state_dir config option.
706 if options.statedir != None:
707 state_dir = options.statedir
709 mode = os.stat(state_dir)[stat.ST_MODE]
710 if not stat.S_ISDIR(mode):
712 "State directory (%s) is not a directory\n" %(state_dir))
715 # try to make the directory
719 sys.stderr.write("Couldn't create statedir %s" %(state_dir))
721 elif scp.has_option("general", "state_dir"):
722 new_state_dir = scp.get("general", "state_dir")
# NOTE(review): this stats state_dir although new_state_dir was just read
# from the config -- confirm which path is meant to be checked here.
724 mode = os.stat(state_dir)[stat.ST_MODE]
725 if not stat.S_ISDIR(mode):
727 "State directory (%s) is not a directory\n" %(state_dir))
732 os.mkdir(new_state_dir)
733 state_dir = new_state_dir
736 "Couldn't create state directory %s\n" %(new_state_dir))
740 mode = os.stat(state_dir)[stat.ST_MODE]
741 if not stat.S_ISDIR(mode):
743 "State directory %s is not a directory\n" %(state_dir))
750 "State directory %s could not be created\n" %(state_dir))
# The [general] maildir_root option overrides the "RSSMaildir" default.
753 if scp.has_option("general", "maildir_root"):
754 maildir_root = scp.get("general", "maildir_root")
757 mode = os.stat(maildir_root)[stat.ST_MODE]
758 if not stat.S_ISDIR(mode):
760 "Maildir Root %s is not a directory\n" \
765 os.mkdir(maildir_root)
767 sys.stderr.write("Couldn't create Maildir Root %s\n" \
771 feeds = scp.sections()
773 feeds.remove("general")
# Each remaining section name is passed to parse_and_deliver() as the feed URL.
777 for section in feeds:
778 # check if the directory exists
781 maildir = scp.get(section, "maildir")
# Only the value part of the urlencoded "section=maildir" pair is kept as the
# directory name under maildir_root.
785 maildir = urllib.urlencode(((section, maildir),)).split("=")[1]
786 maildir = os.path.join(maildir_root, maildir)
789 exists = os.stat(maildir)
790 if stat.S_ISDIR(exists[stat.ST_MODE]):
791 # check if there's a new, cur and tmp directory
793 mode = os.stat(os.path.join(maildir, "cur"))[stat.ST_MODE]
795 os.mkdir(os.path.join(maildir, "cur"))
796 if not stat.S_ISDIR(mode):
797 sys.stderr.write("Broken maildir: %s\n" %(maildir))
799 mode = os.stat(os.path.join(maildir, "tmp"))[stat.ST_MODE]
801 os.mkdir(os.path.join(maildir, "tmp"))
802 if not stat.S_ISDIR(mode):
803 sys.stderr.write("Broken maildir: %s\n" %(maildir))
805 mode = os.stat(os.path.join(maildir, "new"))[stat.ST_MODE]
806 if not stat.S_ISDIR(mode):
807 sys.stderr.write("Broken maildir: %s\n" %(maildir))
809 os.mkdir(os.path.join(maildir, "new"))
811 sys.stderr.write("Broken maildir: %s\n" %(maildir))
816 sys.stderr.write("Couldn't create root maildir %s\n" \
820 os.mkdir(os.path.join(maildir, "new"))
821 os.mkdir(os.path.join(maildir, "cur"))
822 os.mkdir(os.path.join(maildir, "tmp"))
825 "Couldn't create required maildir directories for %s\n" \
829 # right - we've got the directories, we've got the section, we know the
832 parse_and_deliver(maildir, section, state_dir)