# rss2maildir.py - RSS feeds to Maildir, one email per item
# Copyright (C) 2007 Brett Parker <iDunno@sommitrealweird.co.uk>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
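
# Example configuration (a sketch: the paths and feed URL below are made up
# for illustration).  The optional [general] section sets where state and
# mail live; every other section name is treated as a feed URL, optionally
# with its own "maildir" name under maildir_root:
#
#   [general]
#   state_dir = /home/user/.rss2maildir/state
#   maildir_root = /home/user/RSSMaildir
#
#   [http://www.example.org/news.rss]
#   maildir = example-news
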
import sys
import os
import stat
import socket
import string
import random
import datetime
import textwrap
import urllib
import httplib
import dbm
import cgi
import md5

# feedparser (the Universal Feed Parser) is the only non-stdlib dependency
import feedparser

from email.MIMEMultipart import MIMEMultipart
from email.MIMEText import MIMEText

from optparse import OptionParser
from ConfigParser import SafeConfigParser

from base64 import b64encode

from HTMLParser import HTMLParser
class HTML2Text(HTMLParser):

    def __init__(self, textwidth=70):
        # parser state - the defaults here are inferred from how the handler
        # methods below use these attributes
        self.text = u''
        self.curdata = u''
        self.textwidth = textwidth
        self.opentags = []
        self.indentlevel = 0
        self.listcount = []
        self.listlevel = 0
        HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        tag_name = tag.lower()
        if tag_name in self.blockleveltags:
            # handle starting a new block - unless we're in a block element
            # that can contain other blocks, we'll assume that we want to close
            # the previous block
            if tag_name == u'br':
                self.handle_curdata()
                self.opentags.append(tag_name)

            if len(self.opentags) > 1 and self.opentags[-1] == u'li':
                self.handle_curdata()

            if tag_name == u'ol':
                self.handle_curdata()
                self.listcount.append(1)
                self.listlevel = len(self.listcount) - 1

            if tag_name in self.liststarttags:
                smallist = self.opentags[-3:-1]
                smallist.reverse()
                for prev_listtag in smallist:
                    if prev_listtag in [u'dl', u'ol']:
                        self.indentlevel = self.indentlevel + 4
                        break
                    elif prev_listtag == u'ul':
                        self.indentlevel = self.indentlevel + 3
                        break

            if len(self.opentags) > 0:
                self.handle_curdata()
                if tag_name not in self.cancontainflow:
                    self.opentags.pop()
            self.opentags.append(tag_name)

            if len(self.listcount) > 0:
                listcount = self.listcount[-1]

            if tag_name == u'dd' and len(self.opentags) > 1 \
                and self.opentags[-1] == u'dt':
                # close the open dt before starting the dd
                self.handle_curdata()
                self.opentags.pop()
            elif tag_name == u'dt' and len(self.opentags) > 1 \
                and self.opentags[-1] == u'dd':
                # close the open dd before starting the dt
                self.handle_curdata()
                self.opentags.pop()
        else:
            self.handle_curdata()
            self.opentags.append(tag_name)

    def handle_startendtag(self, tag, attrs):
        if tag.lower() == u'br':
            self.opentags.append(u'br')
            self.handle_curdata() # just handle the data, don't do anything else
            self.opentags.pop()

    def handle_curdata(self):
        if len(self.opentags) == 0:
            return

        if len(self.curdata) == 0:
            return

        if len(self.curdata.strip()) == 0:
            return

        tag_thats_done = self.opentags[-1]

        if tag_thats_done in self.blockleveltags:
            newlinerequired = self.text != u''
            if newlinerequired \
                and len(self.text) > 2 \
                and self.text[-1] != u'\n' \
                and self.text[-2] != u'\n':
                self.text = self.text + u'\n\n'

        if tag_thats_done in ["h1", "h2", "h3", "h4", "h5", "h6"]:
            underlinechar = u'='
            headingtext = self.curdata.encode("utf-8").strip()
            seperator = u'\n' + u' '*self.indentlevel
            headingtext = seperator.join( \
                textwrap.wrap( \
                    headingtext, \
                    self.textwidth - self.indentlevel \
                    ) \
                )

            if tag_thats_done == u'h2':
                underlinechar = u'-'
            elif tag_thats_done != u'h1':
                underlinechar = u'~'

            if u'\n' in headingtext:
                underline = u' ' * self.indentlevel \
                    + underlinechar * (self.textwidth - self.indentlevel)
            else:
                underline = u' ' * self.indentlevel \
                    + underlinechar * len(headingtext)
            self.text = self.text \
                + headingtext.encode("utf-8") + u'\n' \
                + underline
        elif tag_thats_done == u'p':
            paragraph = self.curdata.encode("utf-8").strip()
            seperator = u'\n' + u' ' * self.indentlevel
            self.text = self.text \
                + u' ' * self.indentlevel \
                + seperator.join(textwrap.wrap(paragraph, self.textwidth - self.indentlevel))
        elif tag_thats_done == "pre":
            self.text = self.text + self.curdata
        elif tag_thats_done == "blockquote":
            quote = self.curdata.encode("utf-8").strip()
            seperator = u'\n' + u' ' * self.indentlevel + u'> '
            self.text = self.text \
                + u' ' * self.indentlevel + u'> ' \
                + seperator.join( \
                    textwrap.wrap( \
                        quote, \
                        self.textwidth - self.indentlevel - 2 \
                        ) \
                    )
        elif tag_thats_done == "li":
            item = self.curdata.encode("utf-8").strip()
            if len(self.text) > 0 and self.text[-1] != u'\n':
                self.text = self.text + u'\n'
            # work out if we're in an ol rather than a ul
            latesttags = self.opentags[-4:]
            latesttags.reverse()
            isoltag = False
            for thing in latesttags:
                if thing == u'ol':
                    isoltag = True
                    break
                elif thing == u'ul':
                    break
            if not isoltag:
                listindent = 3
                listmarker = u' * '
            else:
                listindent = 4
                listmarker = u' %2d. ' %(self.listcount[-1])
                self.listcount[-1] = self.listcount[-1] + 1

            seperator = u'\n' \
                + u' ' * self.indentlevel \
                + u' ' * listindent
            self.text = self.text \
                + u' ' * self.indentlevel \
                + listmarker \
                + seperator.join( \
                    textwrap.wrap( \
                        item, \
                        self.textwidth - self.indentlevel - listindent \
                        ) \
                    )
        elif tag_thats_done == u'dt':
            definition = self.curdata.encode("utf-8").strip()
            if len(self.text) > 0 and self.text[-1] != u'\n':
                self.text = self.text + u'\n\n'
            elif len(self.text) > 1 and self.text[-2] != u'\n':
                self.text = self.text + u'\n'
            definition = u' ' * self.indentlevel + definition + "::"
            indentstring = u'\n' + u' ' * (self.indentlevel + 1)
            self.text = self.text \
                + indentstring.join( \
                    textwrap.wrap(definition, \
                        self.textwidth - self.indentlevel - 1))
        elif tag_thats_done == u'dd':
            definition = self.curdata.encode("utf-8").strip()
            if len(definition) > 0:
                if len(self.text) > 0 and self.text[-1] != u'\n':
                    self.text = self.text + u'\n'
                indentstring = u'\n' + u' ' * (self.indentlevel + 4)
                self.text = self.text \
                    + u' ' * (self.indentlevel + 4) \
                    + indentstring.join( \
                        textwrap.wrap( \
                            definition, \
                            self.textwidth - self.indentlevel - 4 \
                            ) \
                        )
        elif tag_thats_done in self.liststarttags:
            # data directly inside a ul/ol/dl (usually just whitespace) is
            # left for the li/dt/dd handlers
            pass
        else:
            # we've got no idea what this tag does, so we'll
            # make an assumption that we're not going to know later
            if len(self.curdata) > 0:
                self.text = self.text \
                    + u' ' * self.indentlevel \
                    + u'\n'.join( \
                        textwrap.wrap(self.curdata, self.textwidth - 5))

        if tag_thats_done in self.blockleveltags:
            self.curdata = u''

    def handle_endtag(self, tag):
        try:
            tagindex = self.opentags.index(tag)
        except ValueError:
            # closing tag we know nothing about.
            return

        if tag in self.liststarttags:
            if tag in [u'ol', u'dl', u'ul']:
                self.handle_curdata()
                # find if there was a previous list level
                smalllist = self.opentags[:-1]
                smalllist.reverse()
                for prev_listtag in smalllist:
                    if prev_listtag in [u'ol', u'dl']:
                        self.indentlevel = self.indentlevel - 4
                        break
                    elif prev_listtag == u'ul':
                        self.indentlevel = self.indentlevel - 3
                        break

        if tag == u'ol':
            self.listcount = self.listcount[:-1]

        while tagindex < len(self.opentags) \
            and tag in self.opentags[tagindex+1:]:
            try:
                tagindex = self.opentags.index(tag, tagindex+1)
            except ValueError:
                # well, we don't want to do that then
                break

        if tagindex != len(self.opentags) - 1:
            # Assuming the data was for the last opened tag first
            self.handle_curdata()
            # Now kill the list to be a slice before this tag was opened
            self.opentags = self.opentags[:tagindex + 1]
        else:
            self.handle_curdata()
            if self.opentags[-1] == tag:
                self.opentags.pop()

    def handle_data(self, data):
        self.curdata = self.curdata + unicode(data, "utf-8")

    def handle_entityref(self, name):
        if HTML2Text.entities.has_key(name.lower()):
            entity = HTML2Text.entities[name.lower()]
        elif name[0] == "#":
            entity = unichr(int(name[1:]))
        else:
            entity = "&" + name + ";"

        self.curdata = self.curdata + unicode(entity, "utf-8")

    def gettext(self):
        self.handle_curdata()
        if len(self.text) == 0 or self.text[-1] != u'\n':
            self.text = self.text + u'\n'
        if len(self.text) > 0:
            while len(self.text) > 1 and self.text[-1] == u'\n':
                self.text = self.text[:-1]
            self.text = self.text + u'\n'
        return self.text
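
# HTML2Text is used further down to build the text/plain part of each mail.
# A minimal usage sketch (the HTML fragment and width here are invented):
#
#   parser = HTML2Text(textwidth=60)
#   parser.feed("<h1>A title</h1><p>Some paragraph text.</p>")
#   print parser.gettext()
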
def open_url(method, url):
    # fetch url with the given HTTP method, following up to 3 redirects;
    # returns the httplib response on success, None otherwise
    redirectcount = 0
    while redirectcount < 3:
        (type, rest) = urllib.splittype(url)
        (host, path) = urllib.splithost(rest)
        (host, port) = urllib.splitport(host)
        if port == None:
            port = 80
        conn = httplib.HTTPConnection("%s:%s" %(host, port))
        conn.request(method, path)
        response = conn.getresponse()
        if response.status in [301, 302, 303, 307]:
            headers = response.getheaders()
            for header in headers:
                if header[0] == "location":
                    url = header[1]
        elif response.status == 200:
            return response
        redirectcount = redirectcount + 1
    return None
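
# A quick sketch of how open_url is used below (the URL is invented): HEAD
# requests revalidate feeds we have already seen, GET requests fetch them.
#
#   response = open_url("HEAD", "http://www.example.org/news.rss")
#   if response != None:
#       headers = response.getheaders()
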
def parse_and_deliver(maildir, url, statedir):

    # first check if we know about this feed already
    feeddb = dbm.open(os.path.join(statedir, "feeds"), "c")
    if feeddb.has_key(url):
        data = feeddb[url]
        data = cgi.parse_qs(data)
        # do a HEAD request and compare the headers we cached last time to
        # decide whether the feed needs re-fetching at all
        response = open_url("HEAD", url)
        headers = response.getheaders()
        ischanged = False
        for header in headers:
            if header[0] == "content-length":
                if header[1] != data["content-length"][0]:
                    ischanged = True
            elif header[0] == "etag":
                if header[1] != data["etag"][0]:
                    ischanged = True
            elif header[0] == "last-modified":
                if header[1] != data["last-modified"][0]:
                    ischanged = True
            elif header[0] == "content-md5":
                if header[1] != data["content-md5"][0]:
                    ischanged = True
        if ischanged:
            response = open_url("GET", url)
            if response != None:
                headers = response.getheaders()
                feedhandle = response
            else:
                sys.stderr.write("Failed to fetch feed: %s\n" %(url))
                return
        else:
            return # don't need to do anything, nothing's changed.
    else:
        response = open_url("GET", url)
        if response != None:
            headers = response.getheaders()
            feedhandle = response
        else:
            sys.stderr.write("Failed to fetch feed: %s\n" %(url))
            return

    fp = feedparser.parse(feedhandle)
    db = dbm.open(os.path.join(statedir, "seen"), "c")
    for item in fp["items"]:
        # have we seen it before?
        # need to work out what the content is first...

        if item.has_key("content"):
            content = item["content"][0]["value"]
        else:
            content = item["summary"]

        md5sum = md5.md5(content.encode("utf-8")).hexdigest()

        prevmessageid = None

        # check if there's a guid too - if that exists and we match the md5,
        # we don't need to do anything with this item
        if item.has_key("guid"):
            if db.has_key(url + "|" + item["guid"]):
                data = db[url + "|" + item["guid"]]
                data = cgi.parse_qs(data)
                if data["contentmd5"][0] == md5sum:
                    continue

        if db.has_key(url + "|" + item["link"]):
            data = db[url + "|" + item["link"]]
            data = cgi.parse_qs(data)
            if data.has_key("message-id"):
                prevmessageid = data["message-id"][0]
            if data["contentmd5"][0] == md5sum:
                continue

        author = item["author"]

        # create a basic email message
        msg = MIMEMultipart("alternative")
        messageid = "<" \
            + datetime.datetime.now().strftime("%Y%m%d%H%M") \
            + "." \
            + "".join( \
                [random.choice( \
                    string.ascii_letters + string.digits \
                    ) for a in range(0,6) \
                ]) + "@" + socket.gethostname() + ">"
        msg.add_header("Message-ID", messageid)
        msg.set_unixfrom("\"%s\" <rss2maildir@localhost>" %(url))
        msg.add_header("From", "\"%s\" <rss2maildir@localhost>" %(author))
        msg.add_header("To", "\"%s\" <rss2maildir@localhost>" %(url))
        if prevmessageid:
            msg.add_header("References", prevmessageid)
        createddate = datetime.datetime.now() \
            .strftime("%a, %e %b %Y %T -0000")
        try:
            createddate = datetime.datetime(*item["updated_parsed"][0:6]) \
                .strftime("%a, %e %b %Y %T -0000")
        except:
            pass
        msg.add_header("Date", createddate)
        msg.add_header("Subject", item["title"])
        msg.set_default_type("text/plain")

        htmlcontent = content.encode("utf-8")
        htmlcontent = "%s\n\n<p>Item URL: <a href='%s'>%s</a></p>" %( \
            htmlcontent, \
            item["link"], \
            item["link"] )
        htmlpart = MIMEText(htmlcontent.encode("utf-8"), "html", "utf-8")
        textparser = HTML2Text()
        textparser.feed(content.encode("utf-8"))
        textcontent = textparser.gettext()
        textcontent = "%s\n\nItem URL: %s" %( \
            textcontent, \
            item["link"] )
        textpart = MIMEText(textcontent.encode("utf-8"), "plain", "utf-8")
        msg.attach(textpart)
        msg.attach(htmlpart)
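
        # The message assembled above is multipart/alternative with a
        # text/plain part and a text/html part.  The Message-ID comes out
        # shaped roughly like this (values invented for illustration):
        #
        #   <200701311200.aB3xY9@myhost.example.org>
        #
        # and when an item was seen before with different content, References
        # points at the earlier Message-ID so mail clients thread the update
        # with the original.
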
        # start by working out the filename we should be writing to, we do
        # this following the normal maildir style rules
        fname = str(os.getpid()) \
            + "." + socket.gethostname() \
            + "." + "".join( \
                [random.choice( \
                    string.ascii_letters + string.digits \
                    ) for a in range(0,10) \
                ]) + "." \
            + datetime.datetime.now().strftime('%s')
        fn = os.path.join(maildir, "tmp", fname)
        fh = open(fn, "w")
        fh.write(msg.as_string())
        fh.close()
        # now move it in to the new directory
        newfn = os.path.join(maildir, "new", fname)
        os.link(fn, newfn)
        os.unlink(fn)
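
        # The resulting maildir file name looks something like (made-up
        # values) 4523.myhost.example.org.x7GbT2qLp0.1170241200: pid,
        # hostname, ten random characters and the current unix timestamp,
        # which keeps deliveries unique across runs.
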
        # now add to the database about the item
        if prevmessageid:
            messageid = prevmessageid + " " + messageid
        if item.has_key("guid") and item["guid"] != item["link"]:
            data = urllib.urlencode(( \
                ("message-id", messageid), \
                ("created", createddate), \
                ("contentmd5", md5sum) \
                ))
            db[url + "|" + item["guid"]] = data
            try:
                data = db[url + "|" + item["link"]]
                data = cgi.parse_qs(data)
                newdata = urllib.urlencode(( \
                    ("message-id", messageid), \
                    ("created", data["created"][0]), \
                    ("contentmd5", data["contentmd5"][0]) \
                    ))
                db[url + "|" + item["link"]] = newdata
            except:
                db[url + "|" + item["link"]] = data
        else:
            data = urllib.urlencode(( \
                ("message-id", messageid), \
                ("created", createddate), \
                ("contentmd5", md5sum) \
                ))
            db[url + "|" + item["link"]] = data

    # remember the headers of this fetch so the next run can HEAD the feed
    # and skip it if nothing has changed
    data = []
    for header in headers:
        if header[0] in ["content-md5", "etag", "last-modified", "content-length"]:
            data.append((header[0], header[1]))

    data = urllib.urlencode(data)
    feeddb[url] = data

    db.close()
    feeddb.close()
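
# Both dbm files hold urlencoded strings that cgi.parse_qs turns back into
# dicts on the next run.  A "seen" record, keyed on "<feed url>|<guid or
# link>", looks roughly like this (all values invented for illustration):
#
#   message-id=%3C200701311200.aB3xY9%40myhost%3E&created=Wed%2C+31+Jan+2007+12%3A00%3A00+-0000&contentmd5=9e107d9d372bb6826bd81d3542a419d6
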
if __name__ == "__main__":
    # This only gets executed if we really called the program
    # first off, parse the command line arguments

    oparser = OptionParser()
    oparser.add_option(
        "-c", "--conf", dest="conf",
        help="location of config file"
        )
    oparser.add_option(
        "-s", "--statedir", dest="statedir",
        help="location of directory to store state in"
        )
    (options, args) = oparser.parse_args()

    # check for the configfile
    configfile = None

    if options.conf != None:
        # does the file exist?
        try:
            os.stat(options.conf)
            configfile = options.conf
        except OSError:
            # should exit here as the specified file doesn't exist
            sys.stderr.write( \
                "Config file %s does not exist. Exiting.\n" %(options.conf,))
            sys.exit(1)
    else:
        # check through the default locations
        try:
            os.stat("%s/.rss2maildir.conf" %(os.environ["HOME"],))
            configfile = "%s/.rss2maildir.conf" %(os.environ["HOME"],)
        except OSError:
            try:
                os.stat("/etc/rss2maildir.conf")
                configfile = "/etc/rss2maildir.conf"
            except OSError:
                sys.stderr.write("No config file found. Exiting.\n")
                sys.exit(1)

    # Right - if we've got this far, we've got a config file, now for the hard
    # bit - parse the config and set up the state and maildir directories

    scp = SafeConfigParser()
    scp.read(configfile)

    maildir_root = "RSSMaildir"
    state_dir = "state"    # fallback default, overridden by -s or the config

    if options.statedir != None:
        state_dir = options.statedir
        try:
            mode = os.stat(state_dir)[stat.ST_MODE]
            if not stat.S_ISDIR(mode):
                sys.stderr.write( \
                    "State directory (%s) is not a directory\n" %(state_dir))
                sys.exit(1)
        except OSError:
            # try to make the directory
            try:
                os.mkdir(state_dir)
            except OSError:
                sys.stderr.write("Couldn't create statedir %s\n" %(state_dir))
                sys.exit(1)
    elif scp.has_option("general", "state_dir"):
        new_state_dir = scp.get("general", "state_dir")
        try:
            mode = os.stat(new_state_dir)[stat.ST_MODE]
            if not stat.S_ISDIR(mode):
                sys.stderr.write( \
                    "State directory (%s) is not a directory\n" %(new_state_dir))
                sys.exit(1)
            state_dir = new_state_dir
        except OSError:
            # try to make the directory
            try:
                os.mkdir(new_state_dir)
                state_dir = new_state_dir
            except OSError:
                sys.stderr.write( \
                    "Couldn't create state directory %s\n" %(new_state_dir))
                sys.exit(1)
    else:
        try:
            mode = os.stat(state_dir)[stat.ST_MODE]
            if not stat.S_ISDIR(mode):
                sys.stderr.write( \
                    "State directory %s is not a directory\n" %(state_dir))
                sys.exit(1)
        except OSError:
            try:
                os.mkdir(state_dir)
            except OSError:
                sys.stderr.write( \
                    "State directory %s could not be created\n" %(state_dir))
                sys.exit(1)

    if scp.has_option("general", "maildir_root"):
        maildir_root = scp.get("general", "maildir_root")

    try:
        mode = os.stat(maildir_root)[stat.ST_MODE]
        if not stat.S_ISDIR(mode):
            sys.stderr.write( \
                "Maildir Root %s is not a directory\n" %(maildir_root))
            sys.exit(1)
    except OSError:
        try:
            os.mkdir(maildir_root)
        except OSError:
            sys.stderr.write("Couldn't create Maildir Root %s\n" \
                %(maildir_root))
            sys.exit(1)

    feeds = scp.sections()
    try:
        feeds.remove("general")
    except ValueError:
        pass

    for section in feeds:
        # check if the directory exists
        try:
            maildir = scp.get(section, "maildir")
        except:
            maildir = section

        maildir = urllib.urlencode(((section, maildir),)).split("=")[1]
        maildir = os.path.join(maildir_root, maildir)

        try:
            exists = os.stat(maildir)
            if stat.S_ISDIR(exists[stat.ST_MODE]):
                # check if there's a new, cur and tmp directory
                try:
                    mode = os.stat(os.path.join(maildir, "cur"))[stat.ST_MODE]
                except OSError:
                    os.mkdir(os.path.join(maildir, "cur"))
                if not stat.S_ISDIR(mode):
                    sys.stderr.write("Broken maildir: %s\n" %(maildir))
                try:
                    mode = os.stat(os.path.join(maildir, "tmp"))[stat.ST_MODE]
                except OSError:
                    os.mkdir(os.path.join(maildir, "tmp"))
                if not stat.S_ISDIR(mode):
                    sys.stderr.write("Broken maildir: %s\n" %(maildir))
                try:
                    mode = os.stat(os.path.join(maildir, "new"))[stat.ST_MODE]
                    if not stat.S_ISDIR(mode):
                        sys.stderr.write("Broken maildir: %s\n" %(maildir))
                except OSError:
                    os.mkdir(os.path.join(maildir, "new"))
            else:
                sys.stderr.write("Broken maildir: %s\n" %(maildir))
        except OSError:
            # the maildir doesn't exist yet - create it along with the
            # new, cur and tmp subdirectories
            try:
                os.mkdir(maildir)
            except OSError:
                sys.stderr.write("Couldn't create root maildir %s\n" \
                    %(maildir))
                continue
            try:
                os.mkdir(os.path.join(maildir, "new"))
                os.mkdir(os.path.join(maildir, "cur"))
                os.mkdir(os.path.join(maildir, "tmp"))
            except OSError:
                sys.stderr.write( \
                    "Couldn't create required maildir directories for %s\n" \
                    %(maildir))
                continue

        # right - we've got the directories, we've got the section, we know the
        # url - time to fetch the feed and deliver it
        parse_and_deliver(maildir, section, state_dir)
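
# The script is designed to be run repeatedly (it keeps per-feed state
# between runs), typically from cron; the schedule and paths here are only
# an illustration:
#
#   */30 * * * * python /path/to/rss2maildir.py -c $HOME/.rss2maildir.conf -s $HOME/.rss2maildir/state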