4 # rss2maildir.py - RSS feeds to Maildir 1 email per item
5 # Copyright (C) 2007 Brett Parker <iDunno@sommitrealweird.co.uk>
7 # This program is free software: you can redistribute it and/or modify
8 # it under the terms of the GNU General Public License as published by
9 # the Free Software Foundation, either version 3 of the License, or
10 # (at your option) any later version.
12 # This program is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 # GNU General Public License for more details.
17 # You should have received a copy of the GNU General Public License
18 # along with this program. If not, see <http://www.gnu.org/licenses/>.
28 from email.MIMEMultipart import MIMEMultipart
29 from email.MIMEText import MIMEText
38 from optparse import OptionParser
39 from ConfigParser import SafeConfigParser
41 from base64 import b64encode
47 from HTMLParser import HTMLParser
# HTML2Text: an HTMLParser subclass whose callbacks accumulate a plain-text
# rendering of the HTML fed to it.
# NOTE(review): this listing is lossy - the embedded source line numbers
# jump from 49 to 89, so the class-level attributes the methods rely on
# (blockleveltags, liststarttags, cancontainflow, entities) are not visible
# here, nor is most of the instance initialisation.
49 class HTML2Text(HTMLParser):
# Constructor. textwidth is the column budget used when wrapping output
# (default 70); it is stored on the instance and the base parser is then
# initialised explicitly.
89 def __init__(self,textwidth=70):
92 self.textwidth = textwidth
96 HTMLParser.__init__(self)
# HTMLParser callback for an opening tag. From the visible lines: the tag
# name is lower-cased, pending character data is flushed through
# handle_curdata(), and the tag is pushed onto the self.opentags stack.
# attrs is not used by any visible line.
# NOTE(review): several control-flow lines (else/try branches) are missing
# from this listing, so the exact nesting of the fragments below cannot be
# confirmed from here.
98 def handle_starttag(self, tag, attrs):
99 tag_name = tag.lower()
100 if tag_name in self.blockleveltags:
101 # handle starting a new block - unless we're in a block element
102 # that can contain other blocks, we'll assume that we want to close
104 if tag_name == u'br':
105 self.handle_curdata()
106 self.opentags.append(tag_name)
109 if len(self.opentags) > 1 and self.opentags[-1] == u'li':
110 self.handle_curdata()
# An <ol> gets its own item counter starting at 1; listlevel is the index
# of the innermost counter.
112 if tag_name == u'ol':
113 self.handle_curdata()
114 self.listcount.append(1)
115 self.listlevel = len(self.listcount) - 1
# For a list-start tag, inspect the two stack entries just below the top
# (opentags[-3:-1]): a dl/ol entry adds 4 to the indent, a ul entry adds 3.
117 if tag_name in self.liststarttags:
118 smallist = self.opentags[-3:-1]
120 for prev_listtag in smallist:
121 if prev_listtag in [u'dl', u'ol']:
122 self.indentlevel = self.indentlevel + 4
124 elif prev_listtag == u'ul':
125 self.indentlevel = self.indentlevel + 3
128 if len(self.opentags) > 0:
129 self.handle_curdata()
130 if tag_name not in self.cancontainflow:
132 self.opentags.append(tag_name)
136 listcount = self.listcount[-1]
# A dd directly after an open dt (or a dt directly after an open dd)
# flushes the pending text of the previous element first.
140 if tag_name == u'dd' and len(self.opentags) > 1 \
141 and self.opentags[-1] == u'dt':
142 self.handle_curdata()
144 elif tag_name == u'dt' and len(self.opentags) > 1 \
145 and self.opentags[-1] == u'dd':
146 self.handle_curdata()
149 self.handle_curdata()
150 self.opentags.append(tag_name)
# HTMLParser callback for self-closing tags (e.g. <br/>). Only br is tested
# for: u'br' is pushed onto the open-tag stack and handle_curdata() flushes
# the pending text. attrs is unused.
# NOTE(review): indentation is lost in this listing, so whether the
# handle_curdata() call sits inside the br test cannot be confirmed here.
152 def handle_startendtag(self, tag, attrs):
153 if tag.lower() == u'br':
154 self.opentags.append(u'br')
155 self.handle_curdata() # just handle the data, don't do anything else
# Flush the buffered character data (self.curdata) into self.text, formatted
# according to the most recently opened tag (self.opentags[-1]).
# Cases visible below:
#   h1-h6      - heading text followed by an underline row
#   p          - text laid out within textwidth minus the current indent
#   pre        - appended as-is (no strip, no wrapping)
#   blockquote - continuation lines prefixed with "> "
#   li         - numbered marker taken from self.listcount[-1], which is
#                then incremented (the unordered-list branch is not visible)
#   dt         - term suffixed with "::" and wrapped with textwrap.wrap
#   dd         - body indented a further 4 columns
#   otherwise  - stripped text wrapped at textwidth - 5
# NOTE(review): unicode() makes this Python 2 only. Many lines (early
# returns, else branches, most wrap calls) are missing from this listing;
# the summary above describes only what the visible lines establish.
158 def handle_curdata(self):
# Guards on an empty tag stack / empty or whitespace-only buffer; the
# statements they guard are not visible here.
159 if len(self.opentags) == 0:
162 if len(self.curdata) == 0:
165 if len(self.curdata.strip()) == 0:
168 tag_thats_done = self.opentags[-1]
170 if tag_thats_done in self.blockleveltags:
171 newlinerequired = self.text != u''
174 and len(self.text) > 2 \
175 and self.text[-1] != u'\n' \
176 and self.text[-2] != u'\n':
177 self.text = self.text + u'\n\n'
179 if tag_thats_done in ["h1", "h2", "h3", "h4", "h5", "h6"]:
182 headingtext = unicode( \
183 self.curdata.encode("utf-8").strip(), "utf-8")
184 seperator = u'\n' + u' '*self.indentlevel
185 headingtext = seperator.join( \
188 self.textwidth - self.indentlevel \
# The underline character depends on the heading level (h1, h2, other);
# the characters themselves are assigned on lines not visible here.
192 if tag_thats_done == u'h2':
194 elif tag_thats_done != u'h1':
# A heading that spans several lines is underlined across the whole
# available width; a single-line heading is underlined to its own length.
197 if u'\n' in headingtext:
198 underline = u' ' * self.indentlevel \
199 + underlinechar * (self.textwidth - self.indentlevel)
201 underline = u' ' * self.indentlevel \
202 + underlinechar * len(headingtext)
203 self.text = self.text \
204 + headingtext.encode("utf-8") + u'\n' \
206 elif tag_thats_done == u'p':
207 paragraph = unicode( \
208 self.curdata.strip().encode("utf-8"), "utf-8")
209 seperator = u'\n' + u' ' * self.indentlevel
210 self.text = self.text \
211 + u' ' * self.indentlevel \
214 paragraph, self.textwidth - self.indentlevel))
# Preformatted text keeps its original whitespace and line breaks.
215 elif tag_thats_done == "pre":
216 self.text = self.text + unicode( \
217 self.curdata.encode("utf-8"), "utf-8")
# Block quotes reserve 2 columns for the "> " prefix.
218 elif tag_thats_done == "blockquote":
220 self.curdata.encode("utf-8").strip(), "utf-8")
221 seperator = u'\n' + u' ' * self.indentlevel + u'> '
222 self.text = self.text \
227 self.textwidth - self.indentlevel - 2 \
230 elif tag_thats_done == "li":
231 item = unicode(self.curdata.encode("utf-8").strip(), "utf-8")
232 if len(self.text) > 0 and self.text[-1] != u'\n':
233 self.text = self.text + u'\n'
234 # work out if we're in an ol rather than a ul
235 latesttags = self.opentags[-4:]
238 for thing in latesttags:
252 listmarker = u' %2d. ' %(self.listcount[-1])
253 self.listcount[-1] = self.listcount[-1] + 1
256 + u' ' * self.indentlevel \
258 self.text = self.text \
259 + u' ' * self.indentlevel \
264 self.textwidth - self.indentlevel - listindent \
268 elif tag_thats_done == u'dt':
269 definition = unicode(self.curdata.encode("utf-8").strip(), "utf-8")
270 if len(self.text) > 0 and self.text[-1] != u'\n':
271 self.text = self.text + u'\n\n'
272 elif len(self.text) > 1 and self.text[-2] != u'\n':
273 self.text = self.text + u'\n'
274 definition = u' ' * self.indentlevel + definition + "::"
275 indentstring = u'\n' + u' ' * (self.indentlevel + 1)
276 self.text = self.text \
278 textwrap.wrap(definition, \
279 self.textwidth - self.indentlevel - 1))
# An empty definition body produces no output at all.
281 elif tag_thats_done == u'dd':
282 definition = unicode(self.curdata.encode("utf-8").strip(), "utf-8")
283 if len(definition) > 0:
284 if len(self.text) > 0 and self.text[-1] != u'\n':
285 self.text = self.text + u'\n'
286 indentstring = u'\n' + u' ' * (self.indentlevel + 4)
287 self.text = self.text \
288 + u' ' * (self.indentlevel + 4) \
289 + indentstring.join( \
292 self.textwidth - self.indentlevel - 4 \
296 elif tag_thats_done in self.liststarttags:
299 # we've got no idea what this tag does, so we'll
300 # make an assumption that we're not going to know later
301 if len(self.curdata) > 0:
302 self.text = self.text \
307 self.curdata.encode("utf-8").strip(), \
308 "utf-8"), self.textwidth - 5))
311 if tag_thats_done in self.blockleveltags:
# HTMLParser callback for a closing tag. From the visible lines: the tag is
# located in self.opentags, closing a list (ol/dl/ul) flushes pending text
# and undoes the indent added when the list was opened (4 for ol/dl, 3 for
# ul), and the open-tag stack is then cut back to the matching entry.
# NOTE(review): the try/except around the index lookup, the loop exits and
# the final pop are on lines missing from this listing.
314 def handle_endtag(self, tag):
316 tagindex = self.opentags.index(tag)
318 # closing tag we know nothing about.
324 if tag in self.liststarttags:
325 if tag in [u'ol', u'dl', u'ul']:
326 self.handle_curdata()
327 # find if there was a previous list level
328 smalllist = self.opentags[:-1]
330 for prev_listtag in smalllist:
331 if prev_listtag in [u'ol', u'dl']:
332 self.indentlevel = self.indentlevel - 4
334 elif prev_listtag == u'ul':
335 self.indentlevel = self.indentlevel - 3
# Drops the innermost list item counter.
339 self.listcount = self.listcount[:-1]
# Advance tagindex to the last occurrence of this tag on the stack, so the
# most recently opened instance is the one being closed.
341 while tagindex < len(self.opentags) \
342 and tag in self.opentags[tagindex+1:]:
344 tagindex = self.opentags.index(tag, tagindex+1)
346 # well, we don't want to do that then
348 if tagindex != len(self.opentags) - 1:
349 # Assuming the data was for the last opened tag first
350 self.handle_curdata()
351 # Now kill the list to be a slice before this tag was opened
352 self.opentags = self.opentags[:tagindex + 1]
354 self.handle_curdata()
355 if self.opentags[-1] == tag:
# HTMLParser callback for text between tags: decodes data as UTF-8 and
# appends it to the self.curdata buffer (flushed later by handle_curdata).
# NOTE(review): unicode() is Python 2 only, and decoding here assumes the
# parser is fed UTF-8 byte strings - the visible caller does pass
# content.encode("utf-8") to feed().
358 def handle_data(self, data):
359 self.curdata = self.curdata + unicode(data, "utf-8")
# HTMLParser callback for an entity reference (&name;). The lower-cased name
# is looked up in the class-level HTML2Text.entities table; one fallback
# converts name[1:] to a character with unichr(int(...)), and the last
# resort keeps the reference literally as "&name;". The result is decoded
# as UTF-8 and appended to self.curdata.
# NOTE(review): the condition selecting the unichr branch is on a line not
# visible here - presumably a leading "#"; confirm. has_key/unichr/unicode
# are Python 2 only.
361 def handle_entityref(self, name):
363 if HTML2Text.entities.has_key(name.lower()):
364 entity = HTML2Text.entities[name.lower()]
366 entity = unichr(int(name[1:]))
368 entity = "&" + name + ";"
370 self.curdata = self.curdata + unicode(entity, "utf-8")
# NOTE(review): the def line for this fragment is missing from the listing
# (embedded numbering jumps 370 -> 373). It is presumably the body of
# gettext(), which the delivery code calls on the parser - confirm.
# Visible behaviour: flush any pending data, make sure the text ends with a
# newline, then collapse a run of trailing newlines down to a single one.
373 self.handle_curdata()
374 if len(self.text) == 0 or self.text[-1] != u'\n':
375 self.text = self.text + u'\n'
377 if len(self.text) > 0:
378 while len(self.text) > 1 and self.text[-1] == u'\n':
379 self.text = self.text[:-1]
380 self.text = self.text + u'\n'
# Issue an HTTP request for url using the given method ("HEAD" or "GET" at
# the visible call sites), trying at most 3 times while the server answers
# with a redirect (301/302/303/307) that carries a location header.
# Callers use the result as an httplib response (they call getheaders() on
# it and hand it to feedparser).
# NOTE(review): the lines that initialise redirectcount, default the port,
# reassign url from the location header and return the response are not
# visible in this listing. Only plain HTTPConnection is used, so https
# URLs are presumably unsupported - confirm.
383 def open_url(method, url):
385 while redirectcount < 3:
# Split "scheme://host:port/path" into its parts with the Python 2 urllib
# helpers.
386 (type, rest) = urllib.splittype(url)
387 (host, path) = urllib.splithost(rest)
388 (host, port) = urllib.splitport(host)
392 conn = httplib.HTTPConnection("%s:%s" %(host, port))
393 conn.request(method, path)
394 response = conn.getresponse()
395 if response.status in [301, 302, 303, 307]:
396 headers = response.getheaders()
397 for header in headers:
398 if header[0] == "location":
400 elif response.status == 200:
404 redirectcount = redirectcount + 1
# Fetch the feed at url and deliver each new or changed item as one
# multipart/alternative message (HTML part plus a plain-text rendering made
# with HTML2Text) into the Maildir at maildir.
# State lives in two dbm databases under statedir:
#   "feeds" - looked up by feed url; compared against the HTTP headers of a
#             HEAD request (content-length, etag, last-modified,
#             content-md5) to detect an unchanged feed
#   "seen"  - keyed by "url|guid" or "url|link"; holds the message-id,
#             created date and content md5 of each delivered item
# NOTE(review): many lines of this function (try/except/else, continue,
# several assignments, the file open/close and the final database writes)
# are missing from this listing; the comments describe only what is shown.
# NOTE(review): Python 2 only (has_key, md5 module, urllib.urlencode,
# cgi.parse_qs); md5 and cgi.parse_qs were deprecated in later 2.x releases
# in favour of hashlib and urlparse.parse_qs.
407 def parse_and_deliver(maildir, url, statedir):
410 # first check if we know about this feed already
411 feeddb = dbm.open(os.path.join(statedir, "feeds"), "c")
412 if feeddb.has_key(url):
414 data = cgi.parse_qs(data)
# Cheap change check: a HEAD request whose headers are compared one by one
# with the values cached for this feed.
415 response = open_url("HEAD", url)
418 headers = response.getheaders()
421 for header in headers:
422 if header[0] == "content-length":
423 if header[1] != data["content-length"][0]:
425 elif header[0] == "etag":
426 if header[1] != data["etag"][0]:
428 elif header[0] == "last-modified":
429 if header[1] != data["last-modified"][0]:
431 elif header[0] == "content-md5":
432 if header[1] != data["content-md5"][0]:
437 response = open_url("GET", url)
439 headers = response.getheaders()
440 feedhandle = response
442 sys.stderr.write("Failed to fetch feed: %s\n" %(url))
445 return # don't need to do anything, nothings changed.
# Feed not seen before: fetch it unconditionally.
447 response = open_url("GET", url)
449 headers = response.getheaders()
450 feedhandle = response
452 sys.stderr.write("Failed to fetch feed: %s\n" %(url))
455 fp = feedparser.parse(feedhandle)
456 db = dbm.open(os.path.join(statedir, "seen"), "c")
457 for item in fp["items"]:
458 # have we seen it before?
459 # need to work out what the content is first...
461 if item.has_key("content"):
462 content = item["content"][0]["value"]
464 content = item["summary"]
# The md5 of the UTF-8 encoded content is what distinguishes an unchanged
# item from an updated one.
466 md5sum = md5.md5(content.encode("utf-8")).hexdigest()
470 # check if there's a guid too - if that exists and we match the md5,
472 if item.has_key("guid"):
473 if db.has_key(url + "|" + item["guid"]):
474 data = db[url + "|" + item["guid"]]
475 data = cgi.parse_qs(data)
476 if data["contentmd5"][0] == md5sum:
479 if db.has_key(url + "|" + item["link"]):
480 data = db[url + "|" + item["link"]]
481 data = cgi.parse_qs(data)
# An earlier message-id for the same link is kept so the new message can
# reference it.
482 if data.has_key("message-id"):
483 prevmessageid = data["message-id"][0]
484 if data["contentmd5"][0] == md5sum:
488 author = item["author"]
492 # create a basic email message
493 msg = MIMEMultipart("alternative")
495 + datetime.datetime.now().strftime("%Y%m%d%H%M") \
499 string.ascii_letters + string.digits \
500 ) for a in range(0,6) \
501 ]) + "@" + socket.gethostname() + ">"
502 msg.add_header("Message-ID", messageid)
# The feed url is used as the envelope sender and the To: display name; the
# item author is used as the From: display name.
503 msg.set_unixfrom("\"%s\" <rss2maildir@localhost>" %(url))
504 msg.add_header("From", "\"%s\" <rss2maildir@localhost>" %(author))
505 msg.add_header("To", "\"%s\" <rss2maildir@localhost>" %(url))
507 msg.add_header("References", prevmessageid)
# NOTE(review): %e and %T (and %s further down) are not among the strftime
# directives Python documents as portable; they depend on the platform C
# library - confirm on the target systems.
508 createddate = datetime.datetime.now() \
509 .strftime("%a, %e %b %Y %T -0000")
511 createddate = datetime.datetime(*item["updated_parsed"][0:6]) \
512 .strftime("%a, %e %b %Y %T -0000")
515 msg.add_header("Date", createddate)
516 msg.add_header("Subject", item["title"])
517 msg.set_default_type("text/plain")
519 htmlcontent = content.encode("utf-8")
520 htmlcontent = "%s\n\n<p>Item URL: <a href='%s'>%s</a></p>" %( \
524 htmlpart = MIMEText(htmlcontent.encode("utf-8"), "html", "utf-8")
# Plain-text alternative: the same content run through HTML2Text.
525 textparser = HTML2Text()
526 textparser.feed(content.encode("utf-8"))
527 textcontent = textparser.gettext()
528 textcontent = "%s\n\nItem URL: %s" %( \
531 textpart = MIMEText(textcontent.encode("utf-8"), "plain", "utf-8")
535 # start by working out the filename we should be writing to, we do
536 # this following the normal maildir style rules
537 fname = str(os.getpid()) \
538 + "." + socket.gethostname() \
541 string.ascii_letters + string.digits \
542 ) for a in range(0,10) \
544 + datetime.datetime.now().strftime('%s')
# Maildir delivery: the message is written under tmp/ first.
545 fn = os.path.join(maildir, "tmp", fname)
547 fh.write(msg.as_string())
549 # now move it in to the new directory
550 newfn = os.path.join(maildir, "new", fname)
554 # now add to the database about the item
556 messageid = prevmessageid + " " + messageid
# When the guid differs from the link, the full record goes under the guid
# key; the link key is handled separately below.
557 if item.has_key("guid") and item["guid"] != item["link"]:
558 data = urllib.urlencode(( \
559 ("message-id", messageid), \
560 ("created", createddate), \
561 ("contentmd5", md5sum) \
563 db[url + "|" + item["guid"]] = data
565 data = db[url + "|" + item["link"]]
566 data = cgi.parse_qs(data)
567 newdata = urllib.urlencode(( \
568 ("message-id", messageid), \
569 ("created", data["created"][0]), \
570 ("contentmd5", data["contentmd5"][0]) \
572 db[url + "|" + item["link"]] = newdata
574 db[url + "|" + item["link"]] = data
576 data = urllib.urlencode(( \
577 ("message-id", messageid), \
578 ("created", createddate), \
579 ("contentmd5", md5sum) \
581 db[url + "|" + item["link"]] = data
# Collect the cache-relevant response headers for this feed; presumably
# they are stored in the "feeds" database on a line not visible here.
585 for header in headers:
587 ["content-md5", "etag", "last-modified", "content-length"]:
588 data.append((header[0], header[1]))
590 data = urllib.urlencode(data)
# Script entry point. Visible flow:
#   1. parse -c/--conf and -s/--statedir
#   2. pick a config file: the -c path, else $HOME/.rss2maildir.conf, else
#      /etc/rss2maildir.conf
#   3. resolve the state directory (command line, then the "general"
#      section's state_dir, then a default) and the Maildir root
#      ("RSSMaildir" unless general/maildir_root is set), creating them
#      when absent
#   4. for every config section other than "general", make sure its Maildir
#      (with cur/, tmp/ and new/) exists and call parse_and_deliver() with
#      the section name as the feed url
# NOTE(review): the try/except/else lines, the sys.exit calls and the config
# read are missing from this listing, so error paths cannot be confirmed.
596 if __name__ == "__main__":
597 # This only gets executed if we really called the program
598 # first off, parse the command line arguments
600 oparser = OptionParser()
602 "-c", "--conf", dest="conf",
603 help="location of config file"
606 "-s", "--statedir", dest="statedir",
607 help="location of directory to store state in"
610 (options, args) = oparser.parse_args()
612 # check for the configfile
616 if options.conf != None:
617 # does the file exist?
# os.stat is used purely as an existence probe here and below.
619 os.stat(options.conf)
620 configfile = options.conf
622 # should exit here as the specified file doesn't exist
624 "Config file %s does not exist. Exiting.\n" %(options.conf,))
627 # check through the default locations
629 os.stat("%s/.rss2maildir.conf" %(os.environ["HOME"],))
630 configfile = "%s/.rss2maildir.conf" %(os.environ["HOME"],)
633 os.stat("/etc/rss2maildir.conf")
634 configfile = "/etc/rss2maildir.conf"
636 sys.stderr.write("No config file found. Exiting.\n")
639 # Right - if we've got this far, we've got a config file, now for the hard
642 scp = SafeConfigParser()
645 maildir_root = "RSSMaildir"
# A state directory given on the command line takes precedence over the
# config file.
648 if options.statedir != None:
649 state_dir = options.statedir
651 mode = os.stat(state_dir)[stat.ST_MODE]
652 if not stat.S_ISDIR(mode):
654 "State directory (%s) is not a directory\n" %(state_dir))
657 # try to make the directory
661 sys.stderr.write("Couldn't create statedir %s" %(state_dir))
663 elif scp.has_option("general", "state_dir"):
664 new_state_dir = scp.get("general", "state_dir")
# NOTE(review): this stats state_dir, not the new_state_dir read just
# above - looks unintended (the configured path is never checked); confirm.
666 mode = os.stat(state_dir)[stat.ST_MODE]
667 if not stat.S_ISDIR(mode):
669 "State directory (%s) is not a directory\n" %(state_dir))
674 os.mkdir(new_state_dir)
675 state_dir = new_state_dir
678 "Couldn't create state directory %s\n" %(new_state_dir))
682 mode = os.stat(state_dir)[stat.ST_MODE]
683 if not stat.S_ISDIR(mode):
685 "State directory %s is not a directory\n" %(state_dir))
692 "State directory %s could not be created\n" %(state_dir))
695 if scp.has_option("general", "maildir_root"):
696 maildir_root = scp.get("general", "maildir_root")
699 mode = os.stat(maildir_root)[stat.ST_MODE]
700 if not stat.S_ISDIR(mode):
702 "Maildir Root %s is not a directory\n" \
707 os.mkdir(maildir_root)
709 sys.stderr.write("Couldn't create Maildir Root %s\n" \
# Every remaining section is a feed; "general" only carries settings.
713 feeds = scp.sections()
715 feeds.remove("general")
719 for section in feeds:
720 # check if the directory exists
723 maildir = scp.get(section, "maildir")
# urlencode is used only to escape the maildir name so it is safe as a
# single path component: the key is escaped too, so the first literal "="
# separates key from value and [1] is the escaped name.
727 maildir = urllib.urlencode(((section, maildir),)).split("=")[1]
728 maildir = os.path.join(maildir_root, maildir)
731 exists = os.stat(maildir)
732 if stat.S_ISDIR(exists[stat.ST_MODE]):
733 # check if there's a new, cur and tmp directory
735 mode = os.stat(os.path.join(maildir, "cur"))[stat.ST_MODE]
737 os.mkdir(os.path.join(maildir, "cur"))
738 if not stat.S_ISDIR(mode):
739 sys.stderr.write("Broken maildir: %s\n" %(maildir))
741 mode = os.stat(os.path.join(maildir, "tmp"))[stat.ST_MODE]
743 os.mkdir(os.path.join(maildir, "tmp"))
744 if not stat.S_ISDIR(mode):
745 sys.stderr.write("Broken maildir: %s\n" %(maildir))
747 mode = os.stat(os.path.join(maildir, "new"))[stat.ST_MODE]
748 if not stat.S_ISDIR(mode):
749 sys.stderr.write("Broken maildir: %s\n" %(maildir))
751 os.mkdir(os.path.join(maildir, "new"))
753 sys.stderr.write("Broken maildir: %s\n" %(maildir))
758 sys.stderr.write("Couldn't create root maildir %s\n" \
# Fresh Maildir: create the three standard subdirectories.
762 os.mkdir(os.path.join(maildir, "new"))
763 os.mkdir(os.path.join(maildir, "cur"))
764 os.mkdir(os.path.join(maildir, "tmp"))
767 "Couldn't create required maildir directories for %s\n" \
771 # right - we've got the directories, we've got the section, we know the
# The section name itself is passed as the feed url.
774 parse_and_deliver(maildir, section, state_dir)