# rss2maildir.py - RSS feeds to Maildir, one email per item
# Copyright (C) 2007 Brett Parker <iDunno@sommitrealweird.co.uk>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

import sys, os, stat, socket, datetime, random, string, textwrap
import urllib, httplib, md5, cgi, dbm
import feedparser

from email.MIMEMultipart import MIMEMultipart
from email.MIMEText import MIMEText
from optparse import OptionParser
from ConfigParser import SafeConfigParser
from base64 import b64encode
from HTMLParser import HTMLParser

class HTML2Text(HTMLParser):

    def __init__(self, textwidth=70):
        # state used while converting HTML into wrapped plain text
        self.text = u''
        self.curdata = u''
        self.textwidth = textwidth
        self.opentags = []
        self.indentlevel = 0
        self.listcount = []
        self.listlevel = 0
        HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        tag_name = tag.lower()
        if tag_name in self.blockleveltags:
            # handle starting a new block - unless we're in a block element
            # that can contain other blocks, we'll assume that we want to close
            # the previous block first
            if tag_name == u'br':
                self.handle_curdata()
                self.opentags.append(tag_name)
                return

            if tag_name == u'ol':
                self.handle_curdata()
                self.listcount.append(1)
                self.listlevel = len(self.listcount) - 1

            if tag_name in self.liststarttags:
                # indent nested lists below their nearest enclosing list
                smallist = self.opentags[-3:]
                smallist.reverse()
                for prev_listtag in smallist:
                    if prev_listtag in [u'dl', u'ol']:
                        self.indentlevel = self.indentlevel + 4
                        break
                    elif prev_listtag == u'ul':
                        self.indentlevel = self.indentlevel + 3
                        break

            if len(self.opentags) > 0:
                self.handle_curdata()
                if tag_name not in self.cancontainflow:
                    self.opentags.pop()
            self.opentags.append(tag_name)
        else:
            # inline tag - note where we are in any enclosing list and
            # carry on collecting data
            if len(self.listcount) > 0:
                listcount = self.listcount[-1]
            self.handle_curdata()
            self.opentags.append(tag_name)

    def handle_startendtag(self, tag, attrs):
        if tag.lower() == u'br':
            self.opentags.append(u'br')
            self.handle_curdata() # just handle the data, don't do anything else
            self.opentags.pop()

    def handle_curdata(self):
        if len(self.opentags) == 0:
            return

        if len(self.curdata) == 0:
            return

        if len(self.curdata.strip()) == 0:
            return

        tag_thats_done = self.opentags[-1]

        if tag_thats_done in self.blockleveltags:
            newlinerequired = self.text != u''
            if newlinerequired \
                and len(self.text) > 2 \
                and self.text[-1] != u'\n' \
                and self.text[-2] != u'\n':
                self.text = self.text + u'\n\n'

        if tag_thats_done in ["h1", "h2", "h3", "h4", "h5", "h6"]:
            underlinechar = u'='
            headingtext = self.curdata.strip()
            seperator = u'\n' + u' ' * self.indentlevel
            headingtext = seperator.join( \
                textwrap.wrap( \
                    headingtext, \
                    self.textwidth - self.indentlevel \
                    ) \
                )

            if tag_thats_done == u'h2':
                underlinechar = u'-'
            elif tag_thats_done != u'h1':
                underlinechar = u'~'

            if u'\n' in headingtext:
                underline = u' ' * self.indentlevel \
                    + underlinechar * (self.textwidth - self.indentlevel)
            else:
                underline = u' ' * self.indentlevel \
                    + underlinechar * len(headingtext)
            self.text = self.text \
                + headingtext + u'\n' \
                + underline
        elif tag_thats_done == "p":
            paragraph = self.curdata.strip()
            seperator = u'\n' + u' ' * self.indentlevel
            self.text = self.text \
                + u' ' * self.indentlevel \
                + seperator.join( \
                    textwrap.wrap( \
                        paragraph, self.textwidth - self.indentlevel))
        elif tag_thats_done == "pre":
            self.text = self.text + self.curdata
        elif tag_thats_done == "blockquote":
            quote = self.curdata.strip()
            seperator = u'\n' + u' ' * self.indentlevel + u'> '
            self.text = self.text \
                + u' ' * self.indentlevel + u'> ' \
                + seperator.join( \
                    textwrap.wrap( \
                        quote, \
                        self.textwidth - self.indentlevel - 2 \
                        ) \
                    )
        elif tag_thats_done == "li":
            item = self.curdata.strip()
            if len(self.text) > 0 and self.text[-1] != u'\n':
                self.text = self.text + u'\n'
            # work out if we're in an ol rather than a ul
            latesttags = self.opentags[-4:]
            latesttags.reverse()
            isol = False
            for thing in latesttags:
                if thing == u'ol':
                    isol = True
                    break
                elif thing == u'ul':
                    break
            listindent = 3
            listmarker = u' * '
            if isol:
                listindent = 4
                listmarker = u' %2d. ' %(self.listcount[-1])
                self.listcount[-1] = self.listcount[-1] + 1
            seperator = u'\n' \
                + u' ' * self.indentlevel \
                + u' ' * listindent
            self.text = self.text \
                + u' ' * self.indentlevel \
                + listmarker \
                + seperator.join( \
                    textwrap.wrap( \
                        item, \
                        self.textwidth - self.indentlevel - listindent \
                        ) \
                    ) \
                + u'\n'
        elif tag_thats_done == "dt":
            definition = self.curdata.strip()
            if len(self.text) > 0 and self.text[-1] != u'\n':
                self.text = self.text + u'\n\n'
            elif len(self.text) > 1 and self.text[-2] != u'\n':
                self.text = self.text + u'\n'
            definition = definition + "::"
            self.text = self.text \
                + u'\n '.join( \
                    textwrap.wrap(definition, self.textwidth - 1))
        elif tag_thats_done == "dd":
            definition = self.curdata.strip()
            if len(definition) > 0:
                if len(self.text) > 0 and self.text[-1] != u'\n':
                    self.text = self.text + u'\n'
                self.text = self.text \
                    + u' ' * (self.indentlevel + 4) \
                    + (u'\n' + u' ' * (self.indentlevel + 4)).join( \
                        textwrap.wrap( \
                            definition, \
                            self.textwidth - self.indentlevel - 4 \
                            ) \
                        )
        elif tag_thats_done in self.liststarttags:
            # the list container itself produces no text - its items do
            pass
        else:
            # we've got no idea what this tag does, so we'll
            # make an assumption that we're not going to know later
            if len(self.curdata) > 0:
                seperator = u'\n' + u' ' * self.indentlevel
                self.text = self.text \
                    + u' ' * self.indentlevel \
                    + seperator.join( \
                        textwrap.wrap( \
                            self.curdata, self.textwidth - 5))

        if tag_thats_done in self.blockleveltags:
            self.curdata = u''
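
    # Example (illustrative) of what the conversion above produces:
    #
    #   <h2>News</h2><p>A short paragraph.</p>
    #
    # comes out roughly as:
    #
    #   News
    #   ----
    #
    #   A short paragraph.
    #
    # (the underline character per heading level follows the defaults
    # chosen in handle_curdata above)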

    def handle_endtag(self, tag):
        try:
            tagindex = self.opentags.index(tag)
        except ValueError:
            # closing tag we know nothing about - ignore it
            return

        if tag in self.liststarttags:
            if tag in [u'ol', u'dl', u'ul']:
                # find if there was a previous list level
                smalllist = self.opentags[:-1]
                smalllist.reverse()
                for prev_listtag in smalllist:
                    if prev_listtag in [u'ol', u'dl']:
                        self.indentlevel = self.indentlevel - 4
                        break
                    elif prev_listtag == u'ul':
                        self.indentlevel = self.indentlevel - 3
                        break

        if tag == u'ol':
            self.listcount = self.listcount[:-1]

        while tagindex < len(self.opentags) \
            and tag in self.opentags[tagindex+1:]:
            try:
                tagindex = self.opentags.index(tag, tagindex+1)
            except ValueError:
                # well, we don't want to do that then
                break

        if tagindex != len(self.opentags) - 1:
            # Assuming the data was for the last opened tag first
            self.handle_curdata()
            # Now kill the list to be a slice before this tag was opened
            self.opentags = self.opentags[:tagindex + 1]
        else:
            self.handle_curdata()
            if self.opentags[-1] == tag:
                self.opentags.pop()

    def handle_data(self, data):
        self.curdata = self.curdata + unicode(data, "utf-8")

    def handle_entityref(self, name):
        if HTML2Text.entities.has_key(name.lower()):
            entity = HTML2Text.entities[name.lower()]
        elif name[0] == "#":
            entity = unichr(int(name[1:]))
        else:
            entity = "&" + name + ";"

        if isinstance(entity, unicode):
            self.curdata = self.curdata + entity
        else:
            self.curdata = self.curdata + unicode(entity, "utf-8")

    def gettext(self):
        self.handle_curdata()
        if len(self.text) == 0 or self.text[-1] != u'\n':
            self.text = self.text + u'\n'

        if len(self.text) > 0:
            while len(self.text) > 1 and self.text[-1] == u'\n':
                self.text = self.text[:-1]
            self.text = self.text + u'\n'

        return self.text
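
# Example (illustrative only): HTML2Text is fed an HTML fragment and
# returns wrapped plain text via gettext(); the markup below is made up
# for demonstration.
#
#   converter = HTML2Text(textwidth=60)
#   converter.feed("<h1>Hello</h1><p>Some <em>simple</em> text.</p>")
#   print converter.gettext()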

def open_url(method, url):
    redirectcount = 0
    while redirectcount < 3:
        (type, rest) = urllib.splittype(url)
        (host, path) = urllib.splithost(rest)
        (host, port) = urllib.splitport(host)
        if port == None:
            port = 80
        conn = httplib.HTTPConnection("%s:%s" %(host, port))
        conn.request(method, path)
        response = conn.getresponse()
        if response.status in [301, 302, 303, 307]:
            headers = response.getheaders()
            for header in headers:
                if header[0] == "location":
                    url = header[1]
        elif response.status == 200:
            return response
        redirectcount = redirectcount + 1
    return None
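
# Example (illustrative only): open_url() follows up to three redirects
# and returns the httplib response object on a 200, or None if the feed
# could not be fetched, so callers need to check for None. The URL is a
# placeholder.
#
#   response = open_url("GET", "http://example.com/feed.rss")
#   if response:
#       body = response.read()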

def parse_and_deliver(maildir, url, statedir):
    # first check if we know about this feed already
    feeddb = dbm.open(os.path.join(statedir, "feeds"), "c")
    if feeddb.has_key(url):
        data = feeddb[url]
        data = cgi.parse_qs(data)
        # do a HEAD request and see if the headers we stored have changed
        response = open_url("HEAD", url)
        if not response:
            sys.stderr.write("Failed to fetch feed: %s\n" %(url))
            return
        headers = response.getheaders()
        ischanged = False
        for header in headers:
            if header[0] == "content-length":
                if header[1] != data["content-length"][0]:
                    ischanged = True
            elif header[0] == "etag":
                if header[1] != data["etag"][0]:
                    ischanged = True
            elif header[0] == "last-modified":
                if header[1] != data["last-modified"][0]:
                    ischanged = True
            elif header[0] == "content-md5":
                if header[1] != data["content-md5"][0]:
                    ischanged = True
        if ischanged:
            response = open_url("GET", url)
            if response:
                headers = response.getheaders()
                feedhandle = response
            else:
                sys.stderr.write("Failed to fetch feed: %s\n" %(url))
                return
        else:
            return # don't need to do anything, nothing has changed.
    else:
        response = open_url("GET", url)
        if response:
            headers = response.getheaders()
            feedhandle = response
        else:
            sys.stderr.write("Failed to fetch feed: %s\n" %(url))
            return

    fp = feedparser.parse(feedhandle)

    db = dbm.open(os.path.join(statedir, "seen"), "c")
    for item in fp["items"]:
        # have we seen it before?
        # need to work out what the content is first...
        if item.has_key("content"):
            content = item["content"][0]["value"]
        else:
            content = item["summary"]

        md5sum = md5.md5(content.encode("utf-8")).hexdigest()

        prevmessageid = None

        # check if there's a guid too - if that exists and we match the md5,
        # we've seen this item before and can skip it
        if item.has_key("guid"):
            if db.has_key(url + "|" + item["guid"]):
                data = db[url + "|" + item["guid"]]
                data = cgi.parse_qs(data)
                if data["contentmd5"][0] == md5sum:
                    continue

        if db.has_key(url + "|" + item["link"]):
            data = db[url + "|" + item["link"]]
            data = cgi.parse_qs(data)
            if data.has_key("message-id"):
                prevmessageid = data["message-id"][0]
            if data["contentmd5"][0] == md5sum:
                continue

        try:
            author = item["author"]
        except:
            author = url

        # create a basic email message
        msg = MIMEMultipart("alternative")
        messageid = "<" \
            + datetime.datetime.now().strftime("%Y%m%d%H%M") \
            + "." \
            + "".join( \
                [random.choice( \
                    string.ascii_letters + string.digits \
                    ) for a in range(0,6) \
                ]) + "@" + socket.gethostname() + ">"
        msg.add_header("Message-ID", messageid)
        msg.set_unixfrom("\"%s\" <rss2maildir@localhost>" %(url))
        msg.add_header("From", "\"%s\" <rss2maildir@localhost>" %(author))
        msg.add_header("To", "\"%s\" <rss2maildir@localhost>" %(url))
        if prevmessageid:
            msg.add_header("References", prevmessageid)
        createddate = datetime.datetime.now() \
            .strftime("%a, %e %b %Y %T -0000")
        try:
            createddate = datetime.datetime(*item["updated_parsed"][0:6]) \
                .strftime("%a, %e %b %Y %T -0000")
        except:
            pass
        msg.add_header("Date", createddate)
        msg.add_header("Subject", item["title"])
        msg.set_default_type("text/plain")

        htmlcontent = content.encode("utf-8")
        htmlcontent = "%s\n\n<p>Item URL: <a href='%s'>%s</a></p>" %( \
            htmlcontent, \
            item["link"], \
            item["link"] )
        # htmlcontent is already UTF-8 encoded, so pass it straight in
        htmlpart = MIMEText(htmlcontent, "html", "utf-8")
        textparser = HTML2Text()
        textparser.feed(content.encode("utf-8"))
        textcontent = textparser.gettext()
        textcontent = "%s\n\nItem URL: %s" %( \
            textcontent, \
            item["link"] )
        textpart = MIMEText(textcontent.encode("utf-8"), "plain", "utf-8")
        msg.attach(textpart)
        msg.attach(htmlpart)

        # start by working out the filename we should be writing to, we do
        # this following the normal maildir style rules
        fname = str(os.getpid()) \
            + "." + socket.gethostname() \
            + "." + "".join( \
                [random.choice( \
                    string.ascii_letters + string.digits \
                    ) for a in range(0,10) \
                ]) \
            + "." \
            + datetime.datetime.now().strftime('%s')
        fn = os.path.join(maildir, "tmp", fname)
        fh = open(fn, "w")
        fh.write(msg.as_string())
        fh.close()
        # now move it in to the new directory
        newfn = os.path.join(maildir, "new", fname)
        os.link(fn, newfn)
        os.unlink(fn)

        # now add to the database about the item
        if prevmessageid:
            messageid = prevmessageid + " " + messageid
        if item.has_key("guid") and item["guid"] != item["link"]:
            data = urllib.urlencode(( \
                ("message-id", messageid), \
                ("created", createddate), \
                ("contentmd5", md5sum) \
                ))
            db[url + "|" + item["guid"]] = data
            try:
                data = db[url + "|" + item["link"]]
                data = cgi.parse_qs(data)
                newdata = urllib.urlencode(( \
                    ("message-id", messageid), \
                    ("created", data["created"][0]), \
                    ("contentmd5", data["contentmd5"][0]) \
                    ))
                db[url + "|" + item["link"]] = newdata
            except:
                db[url + "|" + item["link"]] = data
        else:
            data = urllib.urlencode(( \
                ("message-id", messageid), \
                ("created", createddate), \
                ("contentmd5", md5sum) \
                ))
            db[url + "|" + item["link"]] = data

    # finally, store the feed level headers so we can spot changes next time
    data = []
    for header in headers:
        if header[0] in ["content-md5", "etag", "last-modified", "content-length"]:
            data.append((header[0], header[1]))

    if len(data) > 0:
        data = urllib.urlencode(data)
        feeddb[url] = data

    db.close()
    feeddb.close()
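
# Example (illustrative only): delivering a single feed by hand; the
# maildir path, feed URL and state directory below are placeholders.
# Normally the __main__ block below drives this from the config file.
#
#   parse_and_deliver("/home/user/RSSMaildir/example",
#                     "http://example.com/feed.rss",
#                     "/home/user/.rss2maildir/state")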

if __name__ == "__main__":
    # This only gets executed if we really called the program
    # first off, parse the command line arguments

    oparser = OptionParser()
    oparser.add_option(
        "-c", "--conf", dest="conf",
        help="location of config file"
        )
    oparser.add_option(
        "-s", "--statedir", dest="statedir",
        help="location of directory to store state in"
        )
    (options, args) = oparser.parse_args()
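
    # Example invocation (illustrative - both options are optional and the
    # paths are placeholders):
    #   python rss2maildir.py --conf ~/.rss2maildir.conf --statedir ~/.rss2maildir/state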

    # check for the configfile

    configfile = None

    if options.conf != None:
        # does the file exist?
        try:
            os.stat(options.conf)
            configfile = options.conf
        except OSError:
            # should exit here as the specified file doesn't exist
            sys.stderr.write(
                "Config file %s does not exist. Exiting.\n" %(options.conf,))
            sys.exit(2)
    else:
        # check through the default locations
        try:
            os.stat("%s/.rss2maildir.conf" %(os.environ["HOME"],))
            configfile = "%s/.rss2maildir.conf" %(os.environ["HOME"],)
        except OSError:
            try:
                os.stat("/etc/rss2maildir.conf")
                configfile = "/etc/rss2maildir.conf"
            except OSError:
                sys.stderr.write("No config file found. Exiting.\n")
                sys.exit(2)

    # Right - if we've got this far, we've got a config file, now for the hard
    # bit - parsing it.

    scp = SafeConfigParser()
    scp.read(configfile)
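
    # Example config (illustrative): a [general] section for the global
    # settings plus one section per feed URL. The optional "maildir"
    # option names the directory used under maildir_root for that feed.
    #
    #   [general]
    #   state_dir = /home/user/.rss2maildir/state
    #   maildir_root = /home/user/RSSMaildir
    #
    #   [http://example.com/feed.rss]
    #   maildir = example-feed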

    maildir_root = "RSSMaildir"
    state_dir = "state"

    if options.statedir != None:
        state_dir = options.statedir
        try:
            mode = os.stat(state_dir)[stat.ST_MODE]
            if not stat.S_ISDIR(mode):
                sys.stderr.write(
                    "State directory (%s) is not a directory\n" %(state_dir))
                sys.exit(1)
        except OSError:
            # try to make the directory
            try:
                os.mkdir(state_dir)
            except OSError:
                sys.stderr.write("Couldn't create statedir %s\n" %(state_dir))
                sys.exit(1)
    elif scp.has_option("general", "state_dir"):
        new_state_dir = scp.get("general", "state_dir")
        try:
            mode = os.stat(new_state_dir)[stat.ST_MODE]
            if not stat.S_ISDIR(mode):
                sys.stderr.write(
                    "State directory (%s) is not a directory\n" %(new_state_dir))
                sys.exit(1)
            state_dir = new_state_dir
        except OSError:
            # try to make the directory
            try:
                os.mkdir(new_state_dir)
                state_dir = new_state_dir
            except OSError:
                sys.stderr.write(
                    "Couldn't create state directory %s\n" %(new_state_dir))
                sys.exit(1)
    else:
        try:
            mode = os.stat(state_dir)[stat.ST_MODE]
            if not stat.S_ISDIR(mode):
                sys.stderr.write(
                    "State directory %s is not a directory\n" %(state_dir))
                sys.exit(1)
        except OSError:
            try:
                os.mkdir(state_dir)
            except OSError:
                sys.stderr.write(
                    "State directory %s could not be created\n" %(state_dir))
                sys.exit(1)

    if scp.has_option("general", "maildir_root"):
        maildir_root = scp.get("general", "maildir_root")

    try:
        mode = os.stat(maildir_root)[stat.ST_MODE]
        if not stat.S_ISDIR(mode):
            sys.stderr.write(
                "Maildir Root %s is not a directory\n" \
                %(maildir_root))
            sys.exit(1)
    except OSError:
        try:
            os.mkdir(maildir_root)
        except OSError:
            sys.stderr.write("Couldn't create Maildir Root %s\n" \
                %(maildir_root))
            sys.exit(1)

    feeds = scp.sections()
    try:
        feeds.remove("general")
    except ValueError:
        pass

    for section in feeds:
        # check if the directory exists
        maildir = None
        try:
            maildir = scp.get(section, "maildir")
        except:
            maildir = section

        # escape the name so it's safe to use as a directory name
        maildir = urllib.urlencode(((section, maildir),)).split("=")[1]
        maildir = os.path.join(maildir_root, maildir)

        try:
            exists = os.stat(maildir)
            if stat.S_ISDIR(exists[stat.ST_MODE]):
                # check if there's a new, cur and tmp directory
                try:
                    mode = os.stat(os.path.join(maildir, "cur"))[stat.ST_MODE]
                except OSError:
                    os.mkdir(os.path.join(maildir, "cur"))
                    mode = os.stat(os.path.join(maildir, "cur"))[stat.ST_MODE]
                if not stat.S_ISDIR(mode):
                    sys.stderr.write("Broken maildir: %s\n" %(maildir))
                try:
                    mode = os.stat(os.path.join(maildir, "tmp"))[stat.ST_MODE]
                except OSError:
                    os.mkdir(os.path.join(maildir, "tmp"))
                    mode = os.stat(os.path.join(maildir, "tmp"))[stat.ST_MODE]
                if not stat.S_ISDIR(mode):
                    sys.stderr.write("Broken maildir: %s\n" %(maildir))
                try:
                    mode = os.stat(os.path.join(maildir, "new"))[stat.ST_MODE]
                    if not stat.S_ISDIR(mode):
                        sys.stderr.write("Broken maildir: %s\n" %(maildir))
                except OSError:
                    os.mkdir(os.path.join(maildir, "new"))
            else:
                sys.stderr.write("Broken maildir: %s\n" %(maildir))
        except OSError:
            # the maildir doesn't exist yet - create it along with its
            # new, cur and tmp subdirectories
            try:
                os.mkdir(maildir)
            except OSError:
                sys.stderr.write("Couldn't create root maildir %s\n" \
                    %(maildir))
                sys.exit(1)
            try:
                os.mkdir(os.path.join(maildir, "new"))
                os.mkdir(os.path.join(maildir, "cur"))
                os.mkdir(os.path.join(maildir, "tmp"))
            except OSError:
                sys.stderr.write(
                    "Couldn't create required maildir directories for %s\n" \
                    %(maildir))
                sys.exit(1)

        # right - we've got the directories, we've got the section, we know the
        # feed url - go and fetch and deliver it

        parse_and_deliver(maildir, section, state_dir)