4 # rss2maildir.py - RSS feeds to Maildir 1 email per item
5 # Copyright (C) 2007 Brett Parker <iDunno@sommitrealweird.co.uk>
7 # This program is free software: you can redistribute it and/or modify
8 # it under the terms of the GNU General Public License as published by
9 # the Free Software Foundation, either version 3 of the License, or
10 # (at your option) any later version.
12 # This program is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 # GNU General Public License for more details.
17 # You should have received a copy of the GNU General Public License
18 # along with this program. If not, see <http://www.gnu.org/licenses/>.
28 from email.MIMEMultipart import MIMEMultipart
29 from email.MIMEText import MIMEText
38 from optparse import OptionParser
39 from ConfigParser import SafeConfigParser
41 from base64 import b64encode
47 from HTMLParser import HTMLParser
49 class HTML2Text(HTMLParser):
89 def __init__(self,textwidth=70):
92 self.textwidth = textwidth
97 HTMLParser.__init__(self)
99 def handle_starttag(self, tag, attrs):
100 tag_name = tag.lower()
101 if tag_name in self.blockleveltags:
102 # handle starting a new block - unless we're in a block element
103 # that can contain other blocks, we'll assume that we want to close
105 if tag_name == u'br':
106 self.handle_curdata()
107 self.opentags.append(tag_name)
110 if len(self.opentags) > 1 and self.opentags[-1] == u'li':
111 self.handle_curdata()
113 if tag_name == u'ol':
114 self.handle_curdata()
115 self.listcount.append(1)
116 self.listlevel = len(self.listcount) - 1
118 if tag_name in self.liststarttags:
119 smallist = self.opentags[-3:-1]
121 for prev_listtag in smallist:
122 if prev_listtag in [u'dl', u'ol']:
123 self.indentlevel = self.indentlevel + 4
125 elif prev_listtag == u'ul':
126 self.indentlevel = self.indentlevel + 3
129 if len(self.opentags) > 0:
130 self.handle_curdata()
131 if tag_name not in self.cancontainflow:
133 self.opentags.append(tag_name)
137 listcount = self.listcount[-1]
141 if tag_name == u'dd' and len(self.opentags) > 1 \
142 and self.opentags[-1] == u'dt':
143 self.handle_curdata()
145 elif tag_name == u'dt' and len(self.opentags) > 1 \
146 and self.opentags[-1] == u'dd':
147 self.handle_curdata()
149 elif tag_name == u'a':
151 if attr[0].lower() == u'href':
152 self.urls.append(attr[1])
153 self.curdata = self.curdata + u'`'
154 self.opentags.append(tag_name)
157 self.handle_curdata()
158 self.opentags.append(tag_name)
160 def handle_startendtag(self, tag, attrs):
161 if tag.lower() == u'br':
162 self.opentags.append(u'br')
163 self.handle_curdata() # just handle the data, don't do anything else
166 def handle_curdata(self):
167 if len(self.opentags) == 0:
170 if len(self.curdata) == 0:
173 if len(self.curdata.strip()) == 0:
176 tag_thats_done = self.opentags[-1]
178 if tag_thats_done in self.blockleveltags:
179 newlinerequired = self.text != u''
182 and len(self.text) > 2 \
183 and self.text[-1] != u'\n' \
184 and self.text[-2] != u'\n':
185 self.text = self.text + u'\n\n'
187 if tag_thats_done in ["h1", "h2", "h3", "h4", "h5", "h6"]:
190 headingtext = unicode( \
191 self.curdata.encode("utf-8").strip(), "utf-8")
192 seperator = u'\n' + u' '*self.indentlevel
193 headingtext = seperator.join( \
196 self.textwidth - self.indentlevel \
200 if tag_thats_done == u'h2':
202 elif tag_thats_done != u'h1':
205 if u'\n' in headingtext:
206 underline = u' ' * self.indentlevel \
207 + underlinechar * (self.textwidth - self.indentlevel)
209 underline = u' ' * self.indentlevel \
210 + underlinechar * len(headingtext)
211 self.text = self.text \
212 + headingtext.encode("utf-8") + u'\n' \
214 elif tag_thats_done == u'p':
215 paragraph = unicode( \
216 self.curdata.strip().encode("utf-8"), "utf-8")
217 seperator = u'\n' + u' ' * self.indentlevel
218 self.text = self.text \
219 + u' ' * self.indentlevel \
222 paragraph, self.textwidth - self.indentlevel))
223 elif tag_thats_done == "pre":
224 self.text = self.text + unicode( \
225 self.curdata.encode("utf-8"), "utf-8")
226 elif tag_thats_done == "blockquote":
228 self.curdata.encode("utf-8").strip(), "utf-8")
229 seperator = u'\n' + u' ' * self.indentlevel + u'> '
230 self.text = self.text \
235 self.textwidth - self.indentlevel - 2 \
238 elif tag_thats_done == "li":
239 item = unicode(self.curdata.encode("utf-8").strip(), "utf-8")
240 if len(self.text) > 0 and self.text[-1] != u'\n':
241 self.text = self.text + u'\n'
242 # work out if we're in an ol rather than a ul
243 latesttags = self.opentags[-4:]
246 for thing in latesttags:
260 listmarker = u' %2d. ' %(self.listcount[-1])
261 self.listcount[-1] = self.listcount[-1] + 1
264 + u' ' * self.indentlevel \
266 self.text = self.text \
267 + u' ' * self.indentlevel \
272 self.textwidth - self.indentlevel - listindent \
276 elif tag_thats_done == u'dt':
277 definition = unicode(self.curdata.encode("utf-8").strip(), "utf-8")
278 if len(self.text) > 0 and self.text[-1] != u'\n':
279 self.text = self.text + u'\n\n'
280 elif len(self.text) > 1 and self.text[-2] != u'\n':
281 self.text = self.text + u'\n'
282 definition = u' ' * self.indentlevel + definition + "::"
283 indentstring = u'\n' + u' ' * (self.indentlevel + 1)
284 self.text = self.text \
286 textwrap.wrap(definition, \
287 self.textwidth - self.indentlevel - 1))
289 elif tag_thats_done == u'dd':
290 definition = unicode(self.curdata.encode("utf-8").strip(), "utf-8")
291 if len(definition) > 0:
292 if len(self.text) > 0 and self.text[-1] != u'\n':
293 self.text = self.text + u'\n'
294 indentstring = u'\n' + u' ' * (self.indentlevel + 4)
295 self.text = self.text \
296 + u' ' * (self.indentlevel + 4) \
297 + indentstring.join( \
300 self.textwidth - self.indentlevel - 4 \
304 elif tag_thats_done == u'a':
305 self.curdata = self.curdata + u'`__'
307 elif tag_thats_done in self.liststarttags:
310 # we've got no idea what this tag does, so we'll
311 # make an assumption that we're not going to know later
312 if len(self.curdata) > 0:
313 self.text = self.text \
318 self.curdata.encode("utf-8").strip(), \
319 "utf-8"), self.textwidth - 5))
322 if tag_thats_done in self.blockleveltags:
325 def handle_endtag(self, tag):
327 tagindex = self.opentags.index(tag)
329 # closing tag we know nothing about.
335 if tag in self.liststarttags:
336 if tag in [u'ol', u'dl', u'ul']:
337 self.handle_curdata()
338 # find if there was a previous list level
339 smalllist = self.opentags[:-1]
341 for prev_listtag in smalllist:
342 if prev_listtag in [u'ol', u'dl']:
343 self.indentlevel = self.indentlevel - 4
345 elif prev_listtag == u'ul':
346 self.indentlevel = self.indentlevel - 3
350 self.listcount = self.listcount[:-1]
352 while tagindex < len(self.opentags) \
353 and tag in self.opentags[tagindex+1:]:
355 tagindex = self.opentags.index(tag, tagindex+1)
357 # well, we don't want to do that then
359 if tagindex != len(self.opentags) - 1:
360 # Assuming the data was for the last opened tag first
361 self.handle_curdata()
362 # Now kill the list to be a slice before this tag was opened
363 self.opentags = self.opentags[:tagindex + 1]
365 self.handle_curdata()
366 if self.opentags[-1] == tag:
369 def handle_data(self, data):
370 self.curdata = self.curdata + unicode(data, "utf-8")
372 def handle_entityref(self, name):
374 if HTML2Text.entities.has_key(name.lower()):
375 entity = HTML2Text.entities[name.lower()]
377 entity = unichr(int(name[1:]))
379 entity = "&" + name + ";"
381 self.curdata = self.curdata + unicode(entity, "utf-8")
384 self.handle_curdata()
385 if len(self.text) == 0 or self.text[-1] != u'\n':
386 self.text = self.text + u'\n'
388 if len(self.text) > 0:
389 while len(self.text) > 1 and self.text[-1] == u'\n':
390 self.text = self.text[:-1]
391 self.text = self.text + u'\n'
392 if len(self.urls) > 0:
393 self.text = self.text + u'\n__ ' + u'\n__ '.join(self.urls) + u'\n'
397 def open_url(method, url):
# Perform an HTTP request (`method` e.g. "HEAD"/"GET") against `url`,
# following up to 3 redirects (301/302/303/307) by re-resolving the
# Location header; on a 200 the httplib response object is (presumably,
# per callers in parse_and_deliver) returned.
# NOTE(review): this listing is elided — the `redirectcount = 0`
# initialisation, the Location-reassignment/break lines, and the
# return/fall-through paths are not visible here.
# NOTE(review): the local `type` shadows the builtin of the same name.
399 while redirectcount < 3:
# split url into scheme, host, port and path using the (Python 2)
# urllib helper functions
400 (type, rest) = urllib.splittype(url)
401 (host, path) = urllib.splithost(rest)
402 (host, port) = urllib.splitport(host)
406 conn = httplib.HTTPConnection("%s:%s" %(host, port))
407 conn.request(method, path)
408 response = conn.getresponse()
409 if response.status in [301, 302, 303, 307]:
# redirect: find the new location and loop again
410 headers = response.getheaders()
411 for header in headers:
412 if header[0] == "location":
414 elif response.status == 200:
418 redirectcount = redirectcount + 1
421 def parse_and_deliver(maildir, url, statedir):
# Fetch the RSS/Atom feed at `url`, skip it if unchanged since last run
# (by comparing cached HTTP validator headers), then deliver each unseen
# item as a multipart (text + html) email into `maildir` following the
# maildir tmp/new delivery protocol. State lives in two dbm databases
# under `statedir`: "feeds" (per-feed response headers, urlencoded) and
# "seen" (per-item message-id/created/contentmd5, keyed on url|guid or
# url|link).
# NOTE(review): this listing is elided — try/except blocks, continue/
# return lines, db.close() calls and some assignments are not visible.
424 # first check if we know about this feed already
425 feeddb = dbm.open(os.path.join(statedir, "feeds"), "c")
426 if feeddb.has_key(url):
# previously seen feed: HEAD the url and compare the validator
# headers we cached last time to decide whether to re-fetch
428 data = cgi.parse_qs(data)
429 response = open_url("HEAD", url)
432 headers = response.getheaders()
# assume unchanged until a cached validator header differs
435 for header in headers:
436 if header[0] == "content-length":
437 if header[1] != data["content-length"][0]:
439 elif header[0] == "etag":
440 if header[1] != data["etag"][0]:
442 elif header[0] == "last-modified":
443 if header[1] != data["last-modified"][0]:
445 elif header[0] == "content-md5":
446 if header[1] != data["content-md5"][0]:
451 response = open_url("GET", url)
453 headers = response.getheaders()
454 feedhandle = response
456 sys.stderr.write("Failed to fetch feed: %s\n" %(url))
459 return # don't need to do anything, nothings changed.
461 response = open_url("GET", url)
463 headers = response.getheaders()
464 feedhandle = response
466 sys.stderr.write("Failed to fetch feed: %s\n" %(url))
# feedparser accepts the file-like response object directly
469 fp = feedparser.parse(feedhandle)
470 db = dbm.open(os.path.join(statedir, "seen"), "c")
471 for item in fp["items"]:
472 # have we seen it before?
473 # need to work out what the content is first...
475 if item.has_key("content"):
476 content = item["content"][0]["value"]
478 content = item["summary"]
# md5 of the item body is the change-detection fingerprint
480 md5sum = md5.md5(content.encode("utf-8")).hexdigest()
484 # check if there's a guid too - if that exists and we match the md5,
486 if item.has_key("guid"):
487 if db.has_key(url + "|" + item["guid"]):
488 data = db[url + "|" + item["guid"]]
489 data = cgi.parse_qs(data)
490 if data["contentmd5"][0] == md5sum:
# same key by link; keep the previous message-id so an updated
# item can be threaded via the References header below
493 if db.has_key(url + "|" + item["link"]):
494 data = db[url + "|" + item["link"]]
495 data = cgi.parse_qs(data)
496 if data.has_key("message-id"):
497 prevmessageid = data["message-id"][0]
498 if data["contentmd5"][0] == md5sum:
502 author = item["author"]
506 # create a basic email message
507 msg = MIMEMultipart("alternative")
# Message-ID: timestamp + 6 random alphanumerics @ local hostname
509 + datetime.datetime.now().strftime("%Y%m%d%H%M") \
513 string.ascii_letters + string.digits \
514 ) for a in range(0,6) \
515 ]) + "@" + socket.gethostname() + ">"
516 msg.add_header("Message-ID", messageid)
517 msg.set_unixfrom("\"%s\" <rss2maildir@localhost>" %(url))
518 msg.add_header("From", "\"%s\" <rss2maildir@localhost>" %(author))
519 msg.add_header("To", "\"%s\" <rss2maildir@localhost>" %(url))
521 msg.add_header("References", prevmessageid)
# prefer the item's own updated timestamp; fall back to now()
522 createddate = datetime.datetime.now() \
523 .strftime("%a, %e %b %Y %T -0000")
525 createddate = datetime.datetime(*item["updated_parsed"][0:6]) \
526 .strftime("%a, %e %b %Y %T -0000")
529 msg.add_header("Date", createddate)
530 msg.add_header("Subject", item["title"])
531 msg.set_default_type("text/plain")
# html part: original content plus an "Item URL" footer link
533 htmlcontent = content.encode("utf-8")
534 htmlcontent = "%s\n\n<p>Item URL: <a href='%s'>%s</a></p>" %( \
538 htmlpart = MIMEText(htmlcontent.encode("utf-8"), "html", "utf-8")
# text part: html converted via HTML2Text, same footer in plain form
539 textparser = HTML2Text()
540 textparser.feed(content.encode("utf-8"))
541 textcontent = textparser.gettext()
542 textcontent = "%s\n\nItem URL: %s" %( \
545 textpart = MIMEText(textcontent.encode("utf-8"), "plain", "utf-8")
549 # start by working out the filename we should be writting to, we do
550 # this following the normal maildir style rules
551 fname = str(os.getpid()) \
552 + "." + socket.gethostname() \
555 string.ascii_letters + string.digits \
556 ) for a in range(0,10) \
558 + datetime.datetime.now().strftime('%s')
559 fn = os.path.join(maildir, "tmp", fname)
561 fh.write(msg.as_string())
563 # now move it in to the new directory
564 newfn = os.path.join(maildir, "new", fname)
568 # now add to the database about the item
# chain message-ids so later updates thread onto this message
570 messageid = prevmessageid + " " + messageid
571 if item.has_key("guid") and item["guid"] != item["link"]:
# record under the guid key, and refresh the link key's
# message-id while preserving its original created/md5 data
572 data = urllib.urlencode(( \
573 ("message-id", messageid), \
574 ("created", createddate), \
575 ("contentmd5", md5sum) \
577 db[url + "|" + item["guid"]] = data
579 data = db[url + "|" + item["link"]]
580 data = cgi.parse_qs(data)
581 newdata = urllib.urlencode(( \
582 ("message-id", messageid), \
583 ("created", data["created"][0]), \
584 ("contentmd5", data["contentmd5"][0]) \
586 db[url + "|" + item["link"]] = newdata
588 db[url + "|" + item["link"]] = data
590 data = urllib.urlencode(( \
591 ("message-id", messageid), \
592 ("created", createddate), \
593 ("contentmd5", md5sum) \
595 db[url + "|" + item["link"]] = data
# cache the validator headers of this response for the next run's
# changed/unchanged check at the top of this function
599 for header in headers:
601 ["content-md5", "etag", "last-modified", "content-length"]:
602 data.append((header[0], header[1]))
604 data = urllib.urlencode(data)
610 if __name__ == "__main__":
611 # This only gets executed if we really called the program
612 # first off, parse the command line arguments
# Driver: parse -c/--conf and -s/--statedir options, locate the config
# file (-c, then ~/.rss2maildir.conf, then /etc/rss2maildir.conf),
# ensure the state directory and maildir root exist, create a maildir
# (cur/new/tmp) per configured feed section, then deliver each feed via
# parse_and_deliver(maildir, section, state_dir).
# NOTE(review): this listing is elided — try/except lines, sys.exit
# calls and some assignments (e.g. default state_dir) are not visible.
614 oparser = OptionParser()
616 "-c", "--conf", dest="conf",
617 help="location of config file"
620 "-s", "--statedir", dest="statedir",
621 help="location of directory to store state in"
624 (options, args) = oparser.parse_args()
626 # check for the configfile
630 if options.conf != None:
631 # does the file exist?
633 os.stat(options.conf)
634 configfile = options.conf
636 # should exit here as the specified file doesn't exist
638 "Config file %s does not exist. Exiting.\n" %(options.conf,))
641 # check through the default locations
643 os.stat("%s/.rss2maildir.conf" %(os.environ["HOME"],))
644 configfile = "%s/.rss2maildir.conf" %(os.environ["HOME"],)
647 os.stat("/etc/rss2maildir.conf")
648 configfile = "/etc/rss2maildir.conf"
650 sys.stderr.write("No config file found. Exiting.\n")
653 # Right - if we've got this far, we've got a config file, now for the hard
656 scp = SafeConfigParser()
# default maildir root, relative to the cwd unless overridden below
659 maildir_root = "RSSMaildir"
# state directory resolution: -s option wins, then the [general]
# state_dir config option, then (presumably) a default — create the
# directory if missing, bail out if it exists but is not a directory
662 if options.statedir != None:
663 state_dir = options.statedir
665 mode = os.stat(state_dir)[stat.ST_MODE]
666 if not stat.S_ISDIR(mode):
668 "State directory (%s) is not a directory\n" %(state_dir))
671 # try to make the directory
675 sys.stderr.write("Couldn't create statedir %s" %(state_dir))
677 elif scp.has_option("general", "state_dir"):
678 new_state_dir = scp.get("general", "state_dir")
680 mode = os.stat(state_dir)[stat.ST_MODE]
681 if not stat.S_ISDIR(mode):
683 "State directory (%s) is not a directory\n" %(state_dir))
688 os.mkdir(new_state_dir)
689 state_dir = new_state_dir
692 "Couldn't create state directory %s\n" %(new_state_dir))
696 mode = os.stat(state_dir)[stat.ST_MODE]
697 if not stat.S_ISDIR(mode):
699 "State directory %s is not a directory\n" %(state_dir))
706 "State directory %s could not be created\n" %(state_dir))
# maildir root: [general] maildir_root overrides the default; created
# if missing
709 if scp.has_option("general", "maildir_root"):
710 maildir_root = scp.get("general", "maildir_root")
713 mode = os.stat(maildir_root)[stat.ST_MODE]
714 if not stat.S_ISDIR(mode):
716 "Maildir Root %s is not a directory\n" \
721 os.mkdir(maildir_root)
723 sys.stderr.write("Couldn't create Maildir Root %s\n" \
# every config section except [general] is a feed URL
727 feeds = scp.sections()
729 feeds.remove("general")
733 for section in feeds:
734 # check if the directory exists
# per-feed maildir name: the "maildir" option if set, otherwise
# derived from the section URL; urlencode makes it filesystem-safe
737 maildir = scp.get(section, "maildir")
741 maildir = urllib.urlencode(((section, maildir),)).split("=")[1]
742 maildir = os.path.join(maildir_root, maildir)
745 exists = os.stat(maildir)
746 if stat.S_ISDIR(exists[stat.ST_MODE]):
747 # check if there's a new, cur and tmp directory
749 mode = os.stat(os.path.join(maildir, "cur"))[stat.ST_MODE]
751 os.mkdir(os.path.join(maildir, "cur"))
752 if not stat.S_ISDIR(mode):
753 sys.stderr.write("Broken maildir: %s\n" %(maildir))
755 mode = os.stat(os.path.join(maildir, "tmp"))[stat.ST_MODE]
757 os.mkdir(os.path.join(maildir, "tmp"))
758 if not stat.S_ISDIR(mode):
759 sys.stderr.write("Broken maildir: %s\n" %(maildir))
761 mode = os.stat(os.path.join(maildir, "new"))[stat.ST_MODE]
762 if not stat.S_ISDIR(mode):
763 sys.stderr.write("Broken maildir: %s\n" %(maildir))
765 os.mkdir(os.path.join(maildir, "new"))
767 sys.stderr.write("Broken maildir: %s\n" %(maildir))
772 sys.stderr.write("Couldn't create root maildir %s\n" \
# maildir did not exist: create it with the three subdirectories
776 os.mkdir(os.path.join(maildir, "new"))
777 os.mkdir(os.path.join(maildir, "cur"))
778 os.mkdir(os.path.join(maildir, "tmp"))
781 "Couldn't create required maildir directories for %s\n" \
785 # right - we've got the directories, we've got the section, we know the
788 parse_and_deliver(maildir, section, state_dir)