4 # rss2maildir.py - RSS feeds to Maildir 1 email per item
5 # Copyright (C) 2007 Brett Parker <iDunno@sommitrealweird.co.uk>
7 # This program is free software: you can redistribute it and/or modify
8 # it under the terms of the GNU General Public License as published by
9 # the Free Software Foundation, either version 3 of the License, or
10 # (at your option) any later version.
12 # This program is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 # GNU General Public License for more details.
17 # You should have received a copy of the GNU General Public License
18 # along with this program. If not, see <http://www.gnu.org/licenses/>.
28 from email.MIMEMultipart import MIMEMultipart
29 from email.MIMEText import MIMEText
38 from optparse import OptionParser
39 from ConfigParser import SafeConfigParser
41 from base64 import b64encode
47 from HTMLParser import HTMLParser
# HTML2Text: an HTMLParser subclass that converts HTML into indented,
# line-wrapped plain text (used by parse_and_deliver via feed()/gettext()).
# NOTE(review): the embedded original line numbers jump (49 -> 93 -> 96 ...),
# so parts of this class are not visible in this view. The methods below read
# self.blockleveltags, self.liststarttags, self.cancontainflow and
# HTML2Text.entities, plus per-instance state (text, curdata, opentags,
# indentlevel, listcount, urls); presumably those are defined in the gaps.
49 class HTML2Text(HTMLParser):
# Constructor. textwidth is the wrap width for emitted text; handle_curdata
# subtracts the current indent from it when wrapping. Defaults to 70.
93 def __init__(self,textwidth=70):
96 self.textwidth = textwidth
# Flag set by handle_curdata in its 'br' handling and reset in handle_endtag.
99 self.ignorenodata = False
# Initialise the HTMLParser base class explicitly.
102 HTMLParser.__init__(self)
# Parser callback for an opening tag. Block-level tags flush pending text via
# handle_curdata and adjust list counters / indentation; a few other tags
# (a, img, br) get dedicated branches.
# NOTE(review): indentation and several lines are missing from this view, so
# the nesting of the statements below cannot be confirmed from here.
104 def handle_starttag(self, tag, attrs):
# Tag names are compared in lower case.
105 tag_name = tag.lower()
106 if tag_name in self.blockleveltags:
107 # handle starting a new block - unless we're in a block element
108 # that can contain other blocks, we'll assume that we want to close
# A block tag arriving while 'li' is the innermost open tag: flush the text
# collected so far.
110 if len(self.opentags) > 1 and self.opentags[-1] == u'li':
111 self.handle_curdata()
# Ordered list: flush, start a new item counter at 1 and record the depth.
113 if tag_name == u'ol':
114 self.handle_curdata()
115 self.listcount.append(1)
116 self.listlevel = len(self.listcount) - 1
# List containers: look at the (up to) two entries just below the top of the
# open-tag stack and add 4 to the indent for dl/ol, 3 for ul.
# NOTE(review): original lines 124 and 127-128 are missing, so whether the
# loop stops after the first match is not visible.
118 if tag_name in self.liststarttags:
119 smallist = self.opentags[-3:-1]
121 for prev_listtag in smallist:
122 if prev_listtag in [u'dl', u'ol']:
123 self.indentlevel = self.indentlevel + 4
125 elif prev_listtag == u'ul':
126 self.indentlevel = self.indentlevel + 3
# Flush pending text for whatever is currently open, then push the new tag.
# NOTE(review): original line 132 (the body of the cancontainflow test) is
# missing from this view.
129 if len(self.opentags) > 0:
130 self.handle_curdata()
131 if tag_name not in self.cancontainflow:
133 self.opentags.append(tag_name)
# NOTE(review): the lines between the span test and the listcount read
# (original 136-138, 140-142) are missing; how they relate is not visible.
135 if tag_name == "span":
139 listcount = self.listcount[-1]
# dd directly after dt (or dt directly after dd): flush the pending text of
# the previous definition-list entry.
143 if tag_name == u'dd' and len(self.opentags) > 1 \
144 and self.opentags[-1] == u'dt':
145 self.handle_curdata()
147 elif tag_name == u'dt' and len(self.opentags) > 1 \
148 and self.opentags[-1] == u'dd':
149 self.handle_curdata()
# Links: the href value is remembered in self.urls, a backquote is appended
# to the pending text and 'a' is pushed on the open-tag stack. handle_curdata
# later appends the closing backquote-underscore marker for 'a'.
# NOTE(review): the loop that binds attr (original line 152) is missing.
151 elif tag_name == u'a':
153 if attr[0].lower() == u'href':
154 self.urls.append(attr[1])
155 self.curdata = self.curdata + u'`'
156 self.opentags.append(tag_name)
# Images are delegated to handle_image.
158 elif tag_name == u'img':
159 self.handle_image(attrs)
# NOTE(review): the body of the br branch (original lines 162-164) is missing.
161 elif tag_name == u'br':
165 # we don't know the tag, so lets avoid handling it!
# Parser callback for self-closing tags; only br and img are distinguished.
168 def handle_startendtag(self, tag, attrs):
# NOTE(review): the body of the br branch (original line 170) is missing.
169 if tag.lower() == u'br':
171 elif tag.lower() == u'img':
172 self.handle_image(attrs)
# NOTE(review): original lines 173-175 are missing; the three statements
# below most likely belong to a separate method whose def line is not
# visible in this view.
# They flush pending text, push a pseudo 'br' entry on the open-tag stack and
# flush again, so that handle_curdata sees 'br' as the innermost tag.
176 self.handle_curdata()
177 self.opentags.append(u'br')
178 self.handle_curdata()
# Handles an img tag: reads its alt and src attributes and extends the
# pending text (self.curdata).
# NOTE(review): most of this method is missing from this view (the attribute
# loop, the test for 'alt', and the right-hand sides of the three curdata
# assignments), so the emitted format cannot be confirmed from here.
181 def handle_image(self, attrs):
# Attribute values are decoded from UTF-8 byte strings into unicode.
186 alt = attr[1].decode('utf-8')
187 elif attr[0] == 'src':
188 url = attr[1].decode('utf-8')
190 self.curdata = self.curdata \
194 self.curdata = self.curdata \
198 self.curdata = self.curdata \
201 def handle_curdata(self):
# Flushes the text accumulated in self.curdata into self.text, formatted
# according to the innermost open tag (self.opentags[-1]).
# NOTE(review): indentation and many lines (returns, else branches, arguments
# of join/wrap calls) are missing from this view; the comments below describe
# only what the visible statements show.
203 if len(self.opentags) == 0:
206 tag_thats_done = self.opentags[-1]
208 if len(self.curdata) == 0:
# 'br' entry: make sure the output ends with a newline and set ignorenodata.
211 if tag_thats_done == u'br':
212 if len(self.text) == 0 or self.text[-1] != '\n':
213 self.text = self.text + '\n'
214 self.ignorenodata = True
# Whitespace-only data is tested separately (the branch body is not visible).
217 if len(self.curdata.strip()) == 0:
# Block-level tags: decide whether separating newlines are needed before the
# new block. A pending ignorenodata flag suppresses them once.
220 if tag_thats_done in self.blockleveltags:
221 newlinerequired = self.text != u''
222 if self.ignorenodata:
223 newlinerequired = False
224 self.ignorenodata = False
# dt/dd/li get a single newline when the output does not already end with
# one; otherwise a blank line is added when neither of the last two
# characters is a newline.
226 if tag_thats_done in [u'dt', u'dd', u'li'] \
227 and len(self.text) > 1 \
228 and self.text[-1] != u'\n':
229 self.text = self.text + u'\n'
230 elif len(self.text) > 2 \
231 and self.text[-1] != u'\n' \
232 and self.text[-2] != u'\n':
233 self.text = self.text + u'\n\n'
# Headings: the text is re-joined with a newline-plus-indent separator and an
# underline is appended below it. underlinechar is chosen per heading level
# on lines that are not visible here.
235 if tag_thats_done in ["h1", "h2", "h3", "h4", "h5", "h6"]:
238 headingtext = self.curdata
239 seperator = u'\n' + u' '*self.indentlevel
240 headingtext = seperator.join( \
243 self.textwidth - self.indentlevel \
247 if tag_thats_done == u'h2':
249 elif tag_thats_done != u'h1':
# A heading that spans several lines is underlined across the full remaining
# width; a single-line heading is underlined to its own length.
252 if u'\n' in headingtext:
253 underline = u' ' * self.indentlevel \
254 + underlinechar * (self.textwidth - self.indentlevel)
256 underline = u' ' * self.indentlevel \
257 + underlinechar * len(headingtext)
258 self.text = self.text \
259 + headingtext + u'\n' \
261 elif tag_thats_done in [u'p', u'div']:
# Paragraph-like blocks: stripped text, indented by indentlevel and limited
# to textwidth minus indentlevel. The encode/decode round trip produces a
# unicode copy of the stripped text.
262 paragraph = unicode( \
263 self.curdata.strip().encode("utf-8"), "utf-8")
264 seperator = u'\n' + u' ' * self.indentlevel
265 self.text = self.text \
266 + u' ' * self.indentlevel \
269 paragraph, self.textwidth - self.indentlevel))
# pre: the pending text is appended without stripping or wrapping.
270 elif tag_thats_done == "pre":
271 self.text = self.text + unicode( \
272 self.curdata.encode("utf-8"), "utf-8")
# blockquote: continuation lines are prefixed with the indent plus '> ', and
# two columns are reserved for that prefix in the width calculation.
273 elif tag_thats_done == u'blockquote':
275 self.curdata.encode("utf-8").strip(), "utf-8")
276 seperator = u'\n' + u' ' * self.indentlevel + u'> '
277 if len(self.text) > 0 and self.text[-1] != u'\n':
278 self.text = self.text + u'\n'
279 self.text = self.text \
284 self.textwidth - self.indentlevel - 2 \
288 elif tag_thats_done == "li":
# List items start on their own line. The last four open tags are inspected
# to tell ordered from unordered lists (the loop body is not visible); the
# numbered marker uses and then increments the innermost counter.
289 item = unicode(self.curdata.encode("utf-8").strip(), "utf-8")
290 if len(self.text) > 0 and self.text[-1] != u'\n':
291 self.text = self.text + u'\n'
292 # work out if we're in an ol rather than a ul
293 latesttags = self.opentags[-4:]
296 for thing in latesttags:
310 listmarker = u' %2d. ' %(self.listcount[-1])
311 self.listcount[-1] = self.listcount[-1] + 1
314 + u' ' * self.indentlevel \
316 self.text = self.text \
317 + u' ' * self.indentlevel \
322 self.textwidth - self.indentlevel - listindent \
326 elif tag_thats_done == u'dt':
# Definition term: separated from preceding output by newlines, suffixed with
# '::' and wrapped with textwrap.wrap; continuation lines get one extra
# column of indent.
327 definition = unicode(self.curdata.encode("utf-8").strip(), "utf-8")
328 if len(self.text) > 0 and self.text[-1] != u'\n':
329 self.text = self.text + u'\n\n'
330 elif len(self.text) > 1 and self.text[-2] != u'\n':
331 self.text = self.text + u'\n'
332 definition = u' ' * self.indentlevel + definition + "::"
333 indentstring = u'\n' + u' ' * (self.indentlevel + 1)
334 self.text = self.text \
336 textwrap.wrap(definition, \
337 self.textwidth - self.indentlevel - 1))
# Definition description: non-empty text is emitted on its own line with four
# extra columns of indent.
339 elif tag_thats_done == u'dd':
340 definition = unicode(self.curdata.encode("utf-8").strip(), "utf-8")
341 if len(definition) > 0:
342 if len(self.text) > 0 and self.text[-1] != u'\n':
343 self.text = self.text + u'\n'
344 indentstring = u'\n' + u' ' * (self.indentlevel + 4)
345 self.text = self.text \
346 + u' ' * (self.indentlevel + 4) \
347 + indentstring.join( \
350 self.textwidth - self.indentlevel - 4 \
354 elif tag_thats_done == u'a':
# Links: close the backquote opened by handle_starttag; the text stays in
# curdata rather than being written to self.text here.
355 self.curdata = self.curdata + u'`__'
# NOTE(review): the bodies of the next two tests (original lines 358-359 and
# 361-362) are missing from this view.
357 elif tag_thats_done in self.liststarttags:
360 if tag_thats_done in self.blockleveltags:
363 self.ignorenodata = False
# Parser callback for a closing tag: flushes pending text, unwinds list
# indentation and counters for list containers, and truncates self.opentags
# so that the matched tag becomes the last entry.
# NOTE(review): several lines (likely try/except, pops and returns) are
# missing from this view, so the exact control flow cannot be confirmed.
365 def handle_endtag(self, tag):
366 self.ignorenodata = False
# Find the tag on the open-tag stack. list.index raises ValueError when the
# tag is absent; any handling of that is on lines not visible here.
371 tagindex = self.opentags.index(tag)
376 if tag in [u'br', u'img']:
379 if tag in self.liststarttags:
380 if tag in [u'ol', u'dl', u'ul']:
381 self.handle_curdata()
382 # find if there was a previous list level
383 smalllist = self.opentags[:-1]
# Subtract the indent that handle_starttag adds: 4 for ol/dl, 3 for ul.
385 for prev_listtag in smalllist:
386 if prev_listtag in [u'ol', u'dl']:
387 self.indentlevel = self.indentlevel - 4
389 elif prev_listtag == u'ul':
390 self.indentlevel = self.indentlevel - 3
# Drop the most recent list counter.
394 self.listcount = self.listcount[:-1]
# Advance tagindex while a later occurrence of the same tag exists on the
# stack, i.e. settle on the last occurrence.
396 while tagindex < len(self.opentags) \
397 and tag in self.opentags[tagindex+1:]:
399 tagindex = self.opentags.index(tag, tagindex+1)
401 # well, we don't want to do that then
# The closed tag is not the innermost one: flush what is pending, then cut
# the stack back to the closed tag.
403 if tagindex != len(self.opentags) - 1:
404 # Assuming the data was for the last opened tag first
405 self.handle_curdata()
406 # Now kill the list to be a slice before this tag was opened
407 self.opentags = self.opentags[:tagindex + 1]
409 self.handle_curdata()
# NOTE(review): the body of this test (original line 411) is missing.
410 if self.opentags[-1] == tag:
# Parser callback for text content: appends the text to self.curdata.
413 def handle_data(self, data):
# Text that arrives while no tag is open is treated as an implicit paragraph.
414 if len(self.opentags) == 0:
415 self.opentags.append(u'p')
# data is decoded as UTF-8, so it is expected to be a byte string;
# parse_and_deliver feeds the parser UTF-8 encoded content.
416 self.curdata = self.curdata + data.decode("utf-8")
# Parser callback for entity references: appends replacement text for the
# entity to self.curdata.
418 def handle_entityref(self, name):
# Known names are looked up, lower-cased, in the HTML2Text.entities table.
420 if HTML2Text.entities.has_key(name.lower()):
421 entity = HTML2Text.entities[name.lower()]
# NOTE(review): the conditions guarding the next two assignments are on
# missing lines. The first converts name without its first character to a
# code point; the second re-emits the reference unchanged.
423 entity = unichr(int(name[1:]))
425 entity = "&" + name + ";"
# NOTE(review): unicode(x, "utf-8") expects a byte string; a value produced
# by unichr() is already unicode and would raise TypeError here - confirm.
427 self.curdata = self.curdata + unicode(entity, "utf-8")
# NOTE(review): original lines 428-429 are missing; the statements below most
# likely form a separate method whose def line is not visible in this view
# (parse_and_deliver calls textparser.gettext()).
# Flush pending text and make sure the output ends with a newline.
430 self.handle_curdata()
431 if len(self.text) == 0 or self.text[-1] != u'\n':
432 self.text = self.text + u'\n'
# Collapse any run of trailing newlines to exactly one.
434 if len(self.text) > 0:
435 while len(self.text) > 1 and self.text[-1] == u'\n':
436 self.text = self.text[:-1]
437 self.text = self.text + u'\n'
# Append the link targets collected in self.urls, one per line, each
# prefixed with two underscores and a space.
438 if len(self.urls) > 0:
439 self.text = self.text + u'\n__ ' + u'\n__ '.join(self.urls) + u'\n'
# Issues an HTTP request with the given method for url; the loop runs while
# redirectcount is below 3, so at most three requests are made when the
# server keeps redirecting.
# NOTE(review): lines are missing from this view (counter initialisation,
# port defaulting, the Location assignment and all return statements).
# Presumably the response is returned on status 200 and a failure value
# otherwise - confirm against the full file.
443 def open_url(method, url):
445 while redirectcount < 3:
# Split the URL into scheme, host, port and path with urllib's helpers.
446 (type, rest) = urllib.splittype(url)
447 (host, path) = urllib.splithost(rest)
448 (host, port) = urllib.splitport(host)
# NOTE(review): only httplib.HTTPConnection is visible; whether https URLs
# are handled differently is not shown here.
452 conn = httplib.HTTPConnection("%s:%s" %(host, port))
453 conn.request(method, path)
454 response = conn.getresponse()
# Redirect statuses: look for the location header (its use is not visible).
455 if response.status in [301, 302, 303, 307]:
456 headers = response.getheaders()
457 for header in headers:
458 if header[0] == "location":
460 elif response.status == 200:
464 redirectcount = redirectcount + 1
# Fetches the feed at url and writes one MIME message per new or changed item
# into the Maildir at maildir, keeping state in dbm files under statedir.
#   maildir  - Maildir path; its tmp and new subdirectories are used
#   url      - feed URL, also the key prefix in the state databases
#   statedir - directory holding the "feeds" and "seen" dbm files
# NOTE(review): indentation and many lines (try/except, else, continue,
# open/rename calls) are missing from this view; the comments describe only
# what the visible statements show.
467 def parse_and_deliver(maildir, url, statedir):
470 # first check if we know about this feed already
471 feeddb = dbm.open(os.path.join(statedir, "feeds"), "c")
# Known feed: issue a HEAD request and compare the caching-related headers
# with the values stored for this url on the previous run.
472 if feeddb.has_key(url):
474 data = cgi.parse_qs(data)
475 response = open_url("HEAD", url)
478 headers = response.getheaders()
481 for header in headers:
482 if header[0] == "content-length":
483 if header[1] != data["content-length"][0]:
485 elif header[0] == "etag":
486 if header[1] != data["etag"][0]:
488 elif header[0] == "last-modified":
489 if header[1] != data["last-modified"][0]:
491 elif header[0] == "content-md5":
492 if header[1] != data["content-md5"][0]:
# Fetch the feed body with GET; the response object is handed to feedparser.
497 response = open_url("GET", url)
499 headers = response.getheaders()
500 feedhandle = response
502 sys.stderr.write("Failed to fetch feed: %s\n" %(url))
505 return # don't need to do anything, nothings changed.
# Second GET path; presumably the branch for a feed not yet in feeddb
# (the else line is not visible).
507 response = open_url("GET", url)
509 headers = response.getheaders()
510 feedhandle = response
512 sys.stderr.write("Failed to fetch feed: %s\n" %(url))
515 fp = feedparser.parse(feedhandle)
# Per-item state: keys are url + "|" + guid or link, values are urlencoded
# message-id / created / contentmd5 fields.
516 db = dbm.open(os.path.join(statedir, "seen"), "c")
517 for item in fp["items"]:
518 # have we seen it before?
519 # need to work out what the content is first...
# Prefer the first full content entry; item["summary"] is the other source.
521 if item.has_key("content"):
522 content = item["content"][0]["value"]
524 content = item["summary"]
# The MD5 of the UTF-8 encoded content is used to detect changed items.
526 md5sum = md5.md5(content.encode("utf-8")).hexdigest()
530 # check if there's a guid too - if that exists and we match the md5,
532 if item.has_key("guid"):
533 if db.has_key(url + "|" + item["guid"]):
534 data = db[url + "|" + item["guid"]]
535 data = cgi.parse_qs(data)
536 if data["contentmd5"][0] == md5sum:
# Look the item up by link as well; a stored message-id is kept so the new
# message can reference the earlier one.
539 if db.has_key(url + "|" + item["link"]):
540 data = db[url + "|" + item["link"]]
541 data = cgi.parse_qs(data)
542 if data.has_key("message-id"):
543 prevmessageid = data["message-id"][0]
544 if data["contentmd5"][0] == md5sum:
548 author = item["author"]
552 # create a basic email message
553 msg = MIMEMultipart("alternative")
555 + datetime.datetime.now().strftime("%Y%m%d%H%M") \
559 string.ascii_letters + string.digits \
560 ) for a in range(0,6) \
561 ]) + "@" + socket.gethostname() + ">"
# The Message-ID built above combines a minute-resolution timestamp, random
# letters/digits and the local host name.
562 msg.add_header("Message-ID", messageid)
# From carries the item author; To and the unix-from line carry the feed URL.
563 msg.set_unixfrom("\"%s\" <rss2maildir@localhost>" %(url))
564 msg.add_header("From", "\"%s\" <rss2maildir@localhost>" %(author))
565 msg.add_header("To", "\"%s\" <rss2maildir@localhost>" %(url))
567 msg.add_header("References", prevmessageid)
# Date header: either the current time or the item's updated_parsed fields.
# NOTE(review): %e and %T are not among the strftime directives Python
# documents as portable, and "-0000" is hard-coded although
# datetime.now() returns local time - confirm this is intended.
568 createddate = datetime.datetime.now() \
569 .strftime("%a, %e %b %Y %T -0000")
571 createddate = datetime.datetime(*item["updated_parsed"][0:6]) \
572 .strftime("%a, %e %b %Y %T -0000")
575 msg.add_header("Date", createddate)
576 msg.add_header("Subject", item["title"])
577 msg.set_default_type("text/plain")
# HTML alternative: the item content followed by an "Item URL" paragraph.
579 htmlcontent = content.encode("utf-8")
580 htmlcontent = "%s\n\n<p>Item URL: <a href='%s'>%s</a></p>" %( \
584 htmlpart = MIMEText(htmlcontent.encode("utf-8"), "html", "utf-8")
# Plain-text alternative: the same content rendered through HTML2Text.
585 textparser = HTML2Text()
586 textparser.feed(content.encode("utf-8"))
587 textcontent = textparser.gettext()
588 textcontent = "%s\n\nItem URL: %s" %( \
591 textpart = MIMEText(textcontent.encode("utf-8"), "plain", "utf-8")
595 # start by working out the filename we should be writting to, we do
596 # this following the normal maildir style rules
597 fname = str(os.getpid()) \
598 + "." + socket.gethostname() \
601 string.ascii_letters + string.digits \
602 ) for a in range(0,10) \
604 + datetime.datetime.now().strftime('%s')
# The file name above combines pid, host name, random characters and a '%s'
# timestamp. NOTE(review): '%s' is a platform-specific strftime directive.
# The message is written under tmp first.
605 fn = os.path.join(maildir, "tmp", fname)
607 fh.write(msg.as_string())
609 # now move it in to the new directory
610 newfn = os.path.join(maildir, "new", fname)
614 # now add to the database about the item
# The stored message-id becomes "previous-id current-id" (the guarding
# condition on original line 615 is not visible).
616 messageid = prevmessageid + " " + messageid
# Items whose guid differs from their link are recorded under the guid key;
# the record under the link key is updated or created as well.
617 if item.has_key("guid") and item["guid"] != item["link"]:
618 data = urllib.urlencode(( \
619 ("message-id", messageid), \
620 ("created", createddate), \
621 ("contentmd5", md5sum) \
623 db[url + "|" + item["guid"]] = data
# Existing link record: keep its created/contentmd5 values and replace only
# the message-id.
625 data = db[url + "|" + item["link"]]
626 data = cgi.parse_qs(data)
627 newdata = urllib.urlencode(( \
628 ("message-id", messageid), \
629 ("created", data["created"][0]), \
630 ("contentmd5", data["contentmd5"][0]) \
632 db[url + "|" + item["link"]] = newdata
634 db[url + "|" + item["link"]] = data
# Otherwise a single record under the link key is written.
636 data = urllib.urlencode(( \
637 ("message-id", messageid), \
638 ("created", createddate), \
639 ("contentmd5", md5sum) \
641 db[url + "|" + item["link"]] = data
# After the items: gather the caching-related response headers. The
# statement that stores them is on lines not visible here.
645 for header in headers:
647 ["content-md5", "etag", "last-modified", "content-length"]:
648 data.append((header[0], header[1]))
650 data = urllib.urlencode(data)
# Script entry point: parses options, locates the config file, prepares the
# state directory and Maildir tree, then calls parse_and_deliver once per
# feed section.
# NOTE(review): indentation and many lines (try/except, sys.exit, the
# add_option and stderr.write call heads) are missing from this view.
656 if __name__ == "__main__":
657 # This only gets executed if we really called the program
658 # first off, parse the command line arguments
# Two options are defined: -c/--conf and -s/--statedir.
660 oparser = OptionParser()
662 "-c", "--conf", dest="conf",
663 help="location of config file"
666 "-s", "--statedir", dest="statedir",
667 help="location of directory to store state in"
670 (options, args) = oparser.parse_args()
672 # check for the configfile
# An explicitly given config file must exist (os.stat is the existence test).
676 if options.conf != None:
677 # does the file exist?
679 os.stat(options.conf)
680 configfile = options.conf
682 # should exit here as the specified file doesn't exist
684 "Config file %s does not exist. Exiting.\n" %(options.conf,))
687 # check through the default locations
# Defaults: $HOME/.rss2maildir.conf, then /etc/rss2maildir.conf.
689 os.stat("%s/.rss2maildir.conf" %(os.environ["HOME"],))
690 configfile = "%s/.rss2maildir.conf" %(os.environ["HOME"],)
693 os.stat("/etc/rss2maildir.conf")
694 configfile = "/etc/rss2maildir.conf"
696 sys.stderr.write("No config file found. Exiting.\n")
699 # Right - if we've got this far, we've got a config file, now for the hard
# NOTE(review): the call that reads configfile into scp is not visible here.
702 scp = SafeConfigParser()
# Default Maildir root; overridden below by [general] maildir_root.
705 maildir_root = "RSSMaildir"
# State directory precedence: command-line option first, then the config
# file's [general] state_dir, then the default value.
708 if options.statedir != None:
709 state_dir = options.statedir
711 mode = os.stat(state_dir)[stat.ST_MODE]
712 if not stat.S_ISDIR(mode):
714 "State directory (%s) is not a directory\n" %(state_dir))
717 # try to make the directory
# NOTE(review): unlike the other messages, this one has no trailing newline.
721 sys.stderr.write("Couldn't create statedir %s" %(state_dir))
723 elif scp.has_option("general", "state_dir"):
724 new_state_dir = scp.get("general", "state_dir")
# NOTE(review): the "is not a directory" message in this branch formats
# state_dir although the path being checked is new_state_dir - this looks
# like the wrong variable.
726 mode = os.stat(new_state_dir)[stat.ST_MODE]
727 if not stat.S_ISDIR(mode):
729 "State directory (%s) is not a directory\n" %(state_dir))
732 state_dir = new_state_dir
736 os.mkdir(new_state_dir)
737 state_dir = new_state_dir
740 "Couldn't create state directory %s\n" %(new_state_dir))
# Fallback: validate (or create) whatever state_dir currently holds.
744 mode = os.stat(state_dir)[stat.ST_MODE]
745 if not stat.S_ISDIR(mode):
747 "State directory %s is not a directory\n" %(state_dir))
754 "State directory %s could not be created\n" %(state_dir))
# Maildir root: taken from the config if present, then checked or created.
757 if scp.has_option("general", "maildir_root"):
758 maildir_root = scp.get("general", "maildir_root")
761 mode = os.stat(maildir_root)[stat.ST_MODE]
762 if not stat.S_ISDIR(mode):
764 "Maildir Root %s is not a directory\n" \
769 os.mkdir(maildir_root)
771 sys.stderr.write("Couldn't create Maildir Root %s\n" \
775 feeds = scp.sections()
# Every section except "general" is treated as a feed; the section name is
# passed to parse_and_deliver as the feed URL.
777 feeds.remove("general")
781 for section in feeds:
782 # check if the directory exists
785 maildir = scp.get(section, "maildir")
# URL-quote the maildir name by urlencoding a (section, maildir) pair and
# keeping the part after "=", then place it under maildir_root.
789 maildir = urllib.urlencode(((section, maildir),)).split("=")[1]
790 maildir = os.path.join(maildir_root, maildir)
# Existing maildir: make sure cur, tmp and new exist and are directories.
793 exists = os.stat(maildir)
794 if stat.S_ISDIR(exists[stat.ST_MODE]):
795 # check if there's a new, cur and tmp directory
797 mode = os.stat(os.path.join(maildir, "cur"))[stat.ST_MODE]
799 os.mkdir(os.path.join(maildir, "cur"))
800 if not stat.S_ISDIR(mode):
801 sys.stderr.write("Broken maildir: %s\n" %(maildir))
803 mode = os.stat(os.path.join(maildir, "tmp"))[stat.ST_MODE]
805 os.mkdir(os.path.join(maildir, "tmp"))
806 if not stat.S_ISDIR(mode):
807 sys.stderr.write("Broken maildir: %s\n" %(maildir))
809 mode = os.stat(os.path.join(maildir, "new"))[stat.ST_MODE]
810 if not stat.S_ISDIR(mode):
811 sys.stderr.write("Broken maildir: %s\n" %(maildir))
813 os.mkdir(os.path.join(maildir, "new"))
815 sys.stderr.write("Broken maildir: %s\n" %(maildir))
# Missing maildir: it is created along with its new, cur and tmp
# subdirectories (the mkdir of maildir itself is on lines not visible here).
820 sys.stderr.write("Couldn't create root maildir %s\n" \
824 os.mkdir(os.path.join(maildir, "new"))
825 os.mkdir(os.path.join(maildir, "cur"))
826 os.mkdir(os.path.join(maildir, "tmp"))
829 "Couldn't create required maildir directories for %s\n" \
833 # right - we've got the directories, we've got the section, we know the
# Fetch this feed and deliver its items into the prepared maildir.
836 parse_and_deliver(maildir, section, state_dir)