4 # rss2maildir.py - RSS feeds to Maildir 1 email per item
5 # Copyright (C) 2007 Brett Parker <iDunno@sommitrealweird.co.uk>
7 # This program is free software: you can redistribute it and/or modify
8 # it under the terms of the GNU General Public License as published by
9 # the Free Software Foundation, either version 3 of the License, or
10 # (at your option) any later version.
12 # This program is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 # GNU General Public License for more details.
17 # You should have received a copy of the GNU General Public License
18 # along with this program. If not, see <http://www.gnu.org/licenses/>.
28 from email.MIMEMultipart import MIMEMultipart
29 from email.MIMEText import MIMEText
38 from optparse import OptionParser
39 from ConfigParser import SafeConfigParser
41 from base64 import b64encode
47 from HTMLParser import HTMLParser
# NOTE(review): this SOURCE is a mangled excerpt of Python 2 code — every line
# carries a stray leading number (residue of an external numbering pass) and
# the numbers jump, i.e. many original lines are elided (loop headers,
# try/except, return statements). Code below is left byte-identical; comments
# describe only what the visible fragments establish.
#
# HTML2Text: HTML-to-plain-text converter built on Python 2's HTMLParser.
# Accumulates rendered output in self.text, pending inline character data in
# self.curdata, the stack of currently-open tags in self.opentags, link
# targets in self.urls, and images in self.images; links/images are emitted
# in reStructuredText style (`...`__ references, |alt| substitutions).
49 class HTML2Text(HTMLParser):
# Constructor: textwidth is the wrap column used by the textwrap calls below.
# Most attribute initialisations (opentags, text, curdata, urls, images,
# listcount, indentlevel, ...) are among the elided lines.
93 def __init__(self,textwidth=70):
96 self.textwidth = textwidth
# ignorenodata suppresses the "blank line before new block" rule right after
# a <br> has been flushed (see handle_curdata).
99 self.ignorenodata = False
103 HTMLParser.__init__(self)
# Start-tag handler: dispatches on the lowercased tag name; block-level tags
# flush pending data, list-start tags adjust indentlevel (4 for dl/ol, 3 for
# ul) based on the enclosing list tag.
105 def handle_starttag(self, tag, attrs):
106 tag_name = tag.lower()
107 if tag_name in self.blockleveltags:
108 # handle starting a new block - unless we're in a block element
109 # that can contain other blocks, we'll assume that we want to close
111 if len(self.opentags) > 1 and self.opentags[-1] == u'li':
112 self.handle_curdata()
# ordered lists get their own counter pushed onto listcount
114 if tag_name == u'ol':
115 self.handle_curdata()
116 self.listcount.append(1)
117 self.listlevel = len(self.listcount) - 1
119 if tag_name in self.liststarttags:
120 smallist = self.opentags[-3:-1]
# widen the indent according to the nearest enclosing list kind
122 for prev_listtag in smallist:
123 if prev_listtag in [u'dl', u'ol']:
124 self.indentlevel = self.indentlevel + 4
126 elif prev_listtag == u'ul':
127 self.indentlevel = self.indentlevel + 3
130 if len(self.opentags) > 0:
131 self.handle_curdata()
132 if tag_name not in self.cancontainflow:
134 self.opentags.append(tag_name)
136 if tag_name == "span":
140 listcount = self.listcount[-1]
# dt/dd alternation inside a <dl>: flush the sibling entry first
144 if tag_name == u'dd' and len(self.opentags) > 1 \
145 and self.opentags[-1] == u'dt':
146 self.handle_curdata()
148 elif tag_name == u'dt' and len(self.opentags) > 1 \
149 and self.opentags[-1] == u'dd':
150 self.handle_curdata()
# anchors: remember the href and open a reST inline link (closed with
# `__ in handle_curdata). NOTE(review): the `for attr in attrs:` header
# for this loop is among the elided lines.
152 elif tag_name == u'a':
154 if attr[0].lower() == u'href':
155 self.urls.append(attr[1].decode('utf-8'))
156 self.curdata = self.curdata + u'`'
157 self.opentags.append(tag_name)
159 elif tag_name == u'img':
160 self.handle_image(attrs)
162 elif tag_name == u'br':
166 # we don't know the tag, so lets avoid handling it!
# Self-closing tags (<br/>, <img/>): br is simulated by pushing u'br' and
# flushing around it so handle_curdata emits the newline.
169 def handle_startendtag(self, tag, attrs):
170 if tag.lower() == u'br':
172 elif tag.lower() == u'img':
173 self.handle_image(attrs)
177 self.handle_curdata()
178 self.opentags.append(u'br')
179 self.handle_curdata()
# Image handler: records alt/src in self.images, disambiguating duplicate
# alt texts, and appends a reST substitution reference to curdata (the
# appended text itself is on elided continuation lines).
# NOTE(review): the `for attr in attrs:` header and the alt/no-alt branch
# structure are among the elided lines.
182 def handle_image(self, attrs):
187 alt = attr[1].decode('utf-8')
188 elif attr[0] == 'src':
189 url = attr[1].decode('utf-8')
192 if self.images.has_key(alt):
193 if self.images[alt]["url"] == url:
194 self.curdata = self.curdata \
# same alt, different url: synthesise a fresh alt key
197 while self.images.has_key(alt):
199 self.images[alt]["url"] = url
200 self.curdata = self.curdata \
203 self.images[alt] = {}
204 self.images[alt]["url"] = url
205 self.curdata = self.curdata \
# no alt available: key the image on its url instead
208 if self.images.has_key(url):
209 self.curdata = self.curdata \
212 self.images[url] = {}
213 self.images[url]["url"] =url
214 self.curdata = self.curdata \
# Core renderer: flushes self.curdata into self.text according to the
# innermost open tag — headings get underlines, p/div are re-wrapped via
# textwrap, pre is verbatim, blockquote is '> '-prefixed, li is numbered or
# bulleted, dt/dd render definition-list style, and 'a' closes the reST
# link with `__ .
217 def handle_curdata(self):
219 if len(self.opentags) == 0:
222 tag_thats_done = self.opentags[-1]
224 if len(self.curdata) == 0:
# a bare <br> just guarantees a newline and arms ignorenodata
227 if tag_thats_done == u'br':
228 if len(self.text) == 0 or self.text[-1] != '\n':
229 self.text = self.text + '\n'
230 self.ignorenodata = True
233 if len(self.curdata.strip()) == 0:
# block-level tags are separated by a blank line unless a <br> just fired
236 if tag_thats_done in self.blockleveltags:
237 newlinerequired = self.text != u''
238 if self.ignorenodata:
239 newlinerequired = False
240 self.ignorenodata = False
242 if tag_thats_done in [u'dt', u'dd', u'li'] \
243 and len(self.text) > 1 \
244 and self.text[-1] != u'\n':
245 self.text = self.text + u'\n'
246 elif len(self.text) > 2 \
247 and self.text[-1] != u'\n' \
248 and self.text[-2] != u'\n':
249 self.text = self.text + u'\n\n'
# headings: collapse whitespace, wrap, then underline; the underline
# character depends on the heading level (h1/h2 vs the rest).
251 if tag_thats_done in ["h1", "h2", "h3", "h4", "h5", "h6"]:
254 headingtext = " ".join(self.curdata.split())
255 seperator = u'\n' + u' '*self.indentlevel
256 headingtext = seperator.join( \
259 self.textwidth - self.indentlevel \
263 if tag_thats_done == u'h2':
265 elif tag_thats_done != u'h1':
# multi-line headings get a full-width underline
268 if u'\n' in headingtext:
269 underline = u' ' * self.indentlevel \
270 + underlinechar * (self.textwidth - self.indentlevel)
272 underline = u' ' * self.indentlevel \
273 + underlinechar * len(headingtext)
274 self.text = self.text \
275 + headingtext + u'\n' \
# paragraphs/divs: whitespace-normalised and wrapped at the indent
277 elif tag_thats_done in [u'p', u'div']:
278 paragraph = unicode( \
279 " ".join(self.curdata.strip().encode("utf-8").split()), \
281 seperator = u'\n' + u' ' * self.indentlevel
282 self.text = self.text \
283 + u' ' * self.indentlevel \
286 paragraph, self.textwidth - self.indentlevel))
# preformatted text passes through untouched
287 elif tag_thats_done == "pre":
288 self.text = self.text + unicode( \
289 self.curdata.encode("utf-8"), "utf-8")
# blockquotes: wrapped and prefixed with "> "
290 elif tag_thats_done == u'blockquote':
292 " ".join(self.curdata.encode("utf-8").strip().split()), \
294 seperator = u'\n' + u' ' * self.indentlevel + u'> '
295 if len(self.text) > 0 and self.text[-1] != u'\n':
296 self.text = self.text + u'\n'
297 self.text = self.text \
302 self.textwidth - self.indentlevel - 2 \
# list items: numbered " %2d. " markers for <ol> (counter bumped per
# item); the ol-vs-ul detection loop body is partly elided.
306 elif tag_thats_done == "li":
307 item = unicode(self.curdata.encode("utf-8").strip(), "utf-8")
308 if len(self.text) > 0 and self.text[-1] != u'\n':
309 self.text = self.text + u'\n'
310 # work out if we're in an ol rather than a ul
311 latesttags = self.opentags[-4:]
314 for thing in latesttags:
328 listmarker = u' %2d. ' %(self.listcount[-1])
329 self.listcount[-1] = self.listcount[-1] + 1
332 + u' ' * self.indentlevel \
334 self.text = self.text \
335 + u' ' * self.indentlevel \
340 self.textwidth - self.indentlevel - listindent \
# definition terms: rendered as "term::", wrapped with hanging indent
344 elif tag_thats_done == u'dt':
345 definition = unicode(" ".join( \
346 self.curdata.encode("utf-8").strip().split()), \
348 if len(self.text) > 0 and self.text[-1] != u'\n':
349 self.text = self.text + u'\n\n'
350 elif len(self.text) > 1 and self.text[-2] != u'\n':
351 self.text = self.text + u'\n'
352 definition = u' ' * self.indentlevel + definition + "::"
353 indentstring = u'\n' + u' ' * (self.indentlevel + 1)
354 self.text = self.text \
356 textwrap.wrap(definition, \
357 self.textwidth - self.indentlevel - 1))
# definition bodies: indented four columns past the term
359 elif tag_thats_done == u'dd':
360 definition = unicode(" ".join( \
361 self.curdata.encode("utf-8").strip().split()),
363 if len(definition) > 0:
364 if len(self.text) > 0 and self.text[-1] != u'\n':
365 self.text = self.text + u'\n'
366 indentstring = u'\n' + u' ' * (self.indentlevel + 4)
367 self.text = self.text \
368 + u' ' * (self.indentlevel + 4) \
369 + indentstring.join( \
372 self.textwidth - self.indentlevel - 4 \
# close a reST anonymous link opened in handle_starttag
376 elif tag_thats_done == u'a':
377 self.curdata = self.curdata + u'`__'
379 elif tag_thats_done in self.liststarttags:
382 if tag_thats_done in self.blockleveltags:
385 self.ignorenodata = False
# End-tag handler: finds the matching open tag, flushes data attributed to
# later-opened tags, truncates opentags back past the closed tag, and
# rebalances indentlevel / listcount when a list container closes.
387 def handle_endtag(self, tag):
388 self.ignorenodata = False
393 tagindex = self.opentags.index(tag)
398 if tag in [u'br', u'img']:
401 if tag in self.liststarttags:
402 if tag in [u'ol', u'dl', u'ul']:
403 self.handle_curdata()
404 # find if there was a previous list level
405 smalllist = self.opentags[:-1]
407 for prev_listtag in smalllist:
408 if prev_listtag in [u'ol', u'dl']:
409 self.indentlevel = self.indentlevel - 4
411 elif prev_listtag == u'ul':
412 self.indentlevel = self.indentlevel - 3
# pop this list's counter when an <ol> closes (branch partly elided)
416 self.listcount = self.listcount[:-1]
# when the same tag is nested, close the innermost occurrence
418 while tagindex < len(self.opentags) \
419 and tag in self.opentags[tagindex+1:]:
421 tagindex = self.opentags.index(tag, tagindex+1)
423 # well, we don't want to do that then
425 if tagindex != len(self.opentags) - 1:
426 # Assuming the data was for the last opened tag first
427 self.handle_curdata()
428 # Now kill the list to be a slice before this tag was opened
429 self.opentags = self.opentags[:tagindex + 1]
431 self.handle_curdata()
432 if self.opentags[-1] == tag:
# Character data: buffered (utf-8 decoded) into curdata; bare text outside
# any tag is treated as an implicit paragraph.
435 def handle_data(self, data):
436 if len(self.opentags) == 0:
437 self.opentags.append(u'p')
438 self.curdata = self.curdata + data.decode("utf-8")
# Entity references: resolved via the class-level entities table, with a
# numeric-charref fallback (unichr) and a pass-through for unknown names.
440 def handle_entityref(self, name):
442 if HTML2Text.entities.has_key(name.lower()):
443 entity = HTML2Text.entities[name.lower()]
445 entity = unichr(int(name[1:]))
447 entity = "&" + name + ";"
449 self.curdata = self.curdata + unicode(entity, "utf-8")
# NOTE(review): the def line for the final method (presumably gettext — it
# is called as textparser.gettext() in parse_and_deliver below) is among the
# elided lines. Visible body: flush remaining data, normalise trailing
# newlines, then append reST link targets ("__ url") and image
# substitution definitions (".. |alt| image:: url").
452 self.handle_curdata()
453 if len(self.text) == 0 or self.text[-1] != u'\n':
454 self.text = self.text + u'\n'
456 if len(self.text) > 0:
457 while len(self.text) > 1 and self.text[-1] == u'\n':
458 self.text = self.text[:-1]
459 self.text = self.text + u'\n'
460 if len(self.urls) > 0:
461 self.text = self.text + u'\n__ ' + u'\n__ '.join(self.urls) + u'\n'
463 if len(self.images.keys()) > 0:
464 self.text = self.text + u'\n.. ' \
466 ["|%s| image:: %s" %(a, self.images[a]["url"]) \
467 for a in self.images.keys()]) + u'\n'
# open_url(method, url): issue an HTTP request (method is e.g. "HEAD"/"GET")
# and follow up to 3 redirects (301/302/303/307) by re-reading the Location
# header; on 200 the response is returned to the caller (the return lines
# themselves are among the elided lines, as is the `redirectcount = 0`
# initialisation and the default-port handling before the connect).
# NOTE(review): uses the deprecated Python 2 urllib.splittype/splithost/
# splitport helpers and httplib — confirm Python 2 runtime.
471 def open_url(method, url):
473 while redirectcount < 3:
# NOTE: `type` shadows the builtin — kept byte-identical here.
474 (type, rest) = urllib.splittype(url)
475 (host, path) = urllib.splithost(rest)
476 (host, port) = urllib.splitport(host)
480 conn = httplib.HTTPConnection("%s:%s" %(host, port))
481 conn.request(method, path)
482 response = conn.getresponse()
# redirect: scan headers for the new location (assignment line elided)
483 if response.status in [301, 302, 303, 307]:
484 headers = response.getheaders()
485 for header in headers:
486 if header[0] == "location":
488 elif response.status == 200:
492 redirectcount = redirectcount + 1
# parse_and_deliver(maildir, url, statedir): fetch one feed and deliver each
# unseen item as a multipart/alternative mail into `maildir`.
#
# State kept in two dbm files under statedir:
#   "feeds" — per-feed HTTP validator headers (content-length, etag,
#             last-modified, content-md5), urlencoded, used as a poor-man's
#             conditional GET via a HEAD request;
#   "seen"  — per-item records keyed "url|guid" / "url|link" holding
#             message-id, created date and an md5 of the rendered content,
#             used both for dedup and for In-Reply-To/References threading
#             when an item's content changes.
#
# NOTE(review): many lines are elided (try/except around fetches and file
# ops, `continue` statements, the else-branches, the tmp->new rename); the
# comments below only describe the visible fragments.
495 def parse_and_deliver(maildir, url, statedir):
498 # first check if we know about this feed already
499 feeddb = dbm.open(os.path.join(statedir, "feeds"), "c")
500 if feeddb.has_key(url):
# stored validators come back as a urlencoded string; parse_qs gives
# {name: [value]} (the read of `data` from feeddb is elided)
502 data = cgi.parse_qs(data)
503 response = open_url("HEAD", url)
506 headers = response.getheaders()
# compare each stored validator against the HEAD response; any mismatch
# marks the feed as changed (flag assignment lines elided)
509 for header in headers:
510 if header[0] == "content-length":
511 if header[1] != data["content-length"][0]:
513 elif header[0] == "etag":
514 if header[1] != data["etag"][0]:
516 elif header[0] == "last-modified":
517 if header[1] != data["last-modified"][0]:
519 elif header[0] == "content-md5":
520 if header[1] != data["content-md5"][0]:
# changed: re-fetch the full feed body
525 response = open_url("GET", url)
527 headers = response.getheaders()
528 feedhandle = response
530 sys.stderr.write("Failed to fetch feed: %s\n" %(url))
533 return # don't need to do anything, nothings changed.
# unknown feed: first-time fetch
535 response = open_url("GET", url)
537 headers = response.getheaders()
538 feedhandle = response
540 sys.stderr.write("Failed to fetch feed: %s\n" %(url))
543 fp = feedparser.parse(feedhandle)
544 db = dbm.open(os.path.join(statedir, "seen"), "c")
545 for item in fp["items"]:
546 # have we seen it before?
547 # need to work out what the content is first...
# prefer full content; fall back to the summary
549 if item.has_key("content"):
550 content = item["content"][0]["value"]
552 content = item["summary"]
# md5 of the utf-8 content is the change-detection fingerprint
554 md5sum = md5.md5(content.encode("utf-8")).hexdigest()
558 # check if there's a guid too - if that exists and we match the md5,
560 if item.has_key("guid"):
561 if db.has_key(url + "|" + item["guid"]):
562 data = db[url + "|" + item["guid"]]
563 data = cgi.parse_qs(data)
564 if data["contentmd5"][0] == md5sum:
# fall back to the link as the item key; remember the previous
# message-id so a changed item threads onto the old one
567 if db.has_key(url + "|" + item["link"]):
568 data = db[url + "|" + item["link"]]
569 data = cgi.parse_qs(data)
570 if data.has_key("message-id"):
571 prevmessageid = data["message-id"][0]
572 if data["contentmd5"][0] == md5sum:
576 author = item["author"]
580 # create a basic email message
581 msg = MIMEMultipart("alternative")
# Message-ID: timestamp + 6 random chars @ hostname (the surrounding
# string-building lines are elided)
583 + datetime.datetime.now().strftime("%Y%m%d%H%M") \
587 string.ascii_letters + string.digits \
588 ) for a in range(0,6) \
589 ]) + "@" + socket.gethostname() + ">"
590 msg.add_header("Message-ID", messageid)
591 msg.set_unixfrom("\"%s\" <rss2maildir@localhost>" %(url))
592 msg.add_header("From", "\"%s\" <rss2maildir@localhost>" %(author))
593 msg.add_header("To", "\"%s\" <rss2maildir@localhost>" %(url))
# thread updates onto the previous delivery of this item
595 msg.add_header("References", prevmessageid)
# Date: item's updated_parsed when present, otherwise "now"
596 createddate = datetime.datetime.now() \
597 .strftime("%a, %e %b %Y %T -0000")
599 createddate = datetime.datetime(*item["updated_parsed"][0:6]) \
600 .strftime("%a, %e %b %Y %T -0000")
603 msg.add_header("Date", createddate)
604 msg.add_header("Subject", item["title"])
605 msg.set_default_type("text/plain")
# html part: original content plus an "Item URL" footer
607 htmlcontent = content.encode("utf-8")
608 htmlcontent = "%s\n\n<p>Item URL: <a href='%s'>%s</a></p>" %( \
612 htmlpart = MIMEText(htmlcontent.encode("utf-8"), "html", "utf-8")
# plain part: rendered through HTML2Text above
613 textparser = HTML2Text()
614 textparser.feed(content.encode("utf-8"))
615 textcontent = textparser.gettext()
616 textcontent = "%s\n\nItem URL: %s" %( \
619 textpart = MIMEText(textcontent.encode("utf-8"), "plain", "utf-8")
623 # start by working out the filename we should be writting to, we do
624 # this following the normal maildir style rules
625 fname = str(os.getpid()) \
626 + "." + socket.gethostname() \
629 string.ascii_letters + string.digits \
630 ) for a in range(0,10) \
632 + datetime.datetime.now().strftime('%s')
# maildir delivery: write to tmp/, then move to new/ (the open/close and
# the rename/link call are elided)
633 fn = os.path.join(maildir, "tmp", fname)
635 fh.write(msg.as_string())
637 # now move it in to the new directory
638 newfn = os.path.join(maildir, "new", fname)
642 # now add to the database about the item
# accumulate message-ids so References can chain across updates
644 messageid = prevmessageid + " " + messageid
645 if item.has_key("guid") and item["guid"] != item["link"]:
646 data = urllib.urlencode(( \
647 ("message-id", messageid), \
648 ("created", createddate), \
649 ("contentmd5", md5sum) \
651 db[url + "|" + item["guid"]] = data
# keep the link-keyed record in sync, preserving its original
# created/md5 when one already exists
653 data = db[url + "|" + item["link"]]
654 data = cgi.parse_qs(data)
655 newdata = urllib.urlencode(( \
656 ("message-id", messageid), \
657 ("created", data["created"][0]), \
658 ("contentmd5", data["contentmd5"][0]) \
660 db[url + "|" + item["link"]] = newdata
662 db[url + "|" + item["link"]] = data
664 data = urllib.urlencode(( \
665 ("message-id", messageid), \
666 ("created", createddate), \
667 ("contentmd5", md5sum) \
669 db[url + "|" + item["link"]] = data
# finally refresh the stored validator headers for the next HEAD check
# (the feeddb write and db.close() calls are elided)
673 for header in headers:
675 ["content-md5", "etag", "last-modified", "content-length"]:
676 data.append((header[0], header[1]))
678 data = urllib.urlencode(data)
# Script entry point: parse -c/--conf and -s/--statedir options, locate a
# config file (explicit -c, then ~/.rss2maildir.conf, then
# /etc/rss2maildir.conf), ensure the state directory and maildir root exist,
# create a Maildir (new/cur/tmp) per configured feed section, then run
# parse_and_deliver for each. NOTE(review): the try/except wrappers around
# the os.stat/os.mkdir calls and the sys.exit lines are elided in this
# excerpt.
684 if __name__ == "__main__":
685 # This only gets executed if we really called the program
686 # first off, parse the command line arguments
688 oparser = OptionParser()
# (oparser.add_option call lines partially elided)
690 "-c", "--conf", dest="conf",
691 help="location of config file"
694 "-s", "--statedir", dest="statedir",
695 help="location of directory to store state in"
698 (options, args) = oparser.parse_args()
700 # check for the configfile
704 if options.conf != None:
705 # does the file exist?
707 os.stat(options.conf)
708 configfile = options.conf
710 # should exit here as the specified file doesn't exist
712 "Config file %s does not exist. Exiting.\n" %(options.conf,))
715 # check through the default locations
717 os.stat("%s/.rss2maildir.conf" %(os.environ["HOME"],))
718 configfile = "%s/.rss2maildir.conf" %(os.environ["HOME"],)
721 os.stat("/etc/rss2maildir.conf")
722 configfile = "/etc/rss2maildir.conf"
724 sys.stderr.write("No config file found. Exiting.\n")
727 # Right - if we've got this far, we've got a config file, now for the hard
# (scp.read(configfile) presumably among elided lines — confirm)
730 scp = SafeConfigParser()
733 maildir_root = "RSSMaildir"
# state dir resolution: CLI option wins over [general] state_dir
736 if options.statedir != None:
737 state_dir = options.statedir
739 mode = os.stat(state_dir)[stat.ST_MODE]
740 if not stat.S_ISDIR(mode):
742 "State directory (%s) is not a directory\n" %(state_dir))
745 # try to make the directory
749 sys.stderr.write("Couldn't create statedir %s" %(state_dir))
751 elif scp.has_option("general", "state_dir"):
752 new_state_dir = scp.get("general", "state_dir")
754 mode = os.stat(new_state_dir)[stat.ST_MODE]
755 if not stat.S_ISDIR(mode):
# NOTE(review): error message reports state_dir, not new_state_dir —
# looks like a bug in the original; left byte-identical here.
757 "State directory (%s) is not a directory\n" %(state_dir))
760 state_dir = new_state_dir
764 os.mkdir(new_state_dir)
765 state_dir = new_state_dir
768 "Couldn't create state directory %s\n" %(new_state_dir))
# default state dir path (assignment elided): validate or create it
772 mode = os.stat(state_dir)[stat.ST_MODE]
773 if not stat.S_ISDIR(mode):
775 "State directory %s is not a directory\n" %(state_dir))
782 "State directory %s could not be created\n" %(state_dir))
# maildir root: from [general] maildir_root, else the "RSSMaildir" default
785 if scp.has_option("general", "maildir_root"):
786 maildir_root = scp.get("general", "maildir_root")
789 mode = os.stat(maildir_root)[stat.ST_MODE]
790 if not stat.S_ISDIR(mode):
792 "Maildir Root %s is not a directory\n" \
797 os.mkdir(maildir_root)
799 sys.stderr.write("Couldn't create Maildir Root %s\n" \
# every config section except [general] is a feed
803 feeds = scp.sections()
805 feeds.remove("general")
809 for section in feeds:
810 # check if the directory exists
813 maildir = scp.get(section, "maildir")
# urlencode the maildir name so it is filesystem-safe, then root it
817 maildir = urllib.urlencode(((section, maildir),)).split("=")[1]
818 maildir = os.path.join(maildir_root, maildir)
821 exists = os.stat(maildir)
822 if stat.S_ISDIR(exists[stat.ST_MODE]):
823 # check if there's a new, cur and tmp directory
825 mode = os.stat(os.path.join(maildir, "cur"))[stat.ST_MODE]
827 os.mkdir(os.path.join(maildir, "cur"))
828 if not stat.S_ISDIR(mode):
829 sys.stderr.write("Broken maildir: %s\n" %(maildir))
831 mode = os.stat(os.path.join(maildir, "tmp"))[stat.ST_MODE]
833 os.mkdir(os.path.join(maildir, "tmp"))
834 if not stat.S_ISDIR(mode):
835 sys.stderr.write("Broken maildir: %s\n" %(maildir))
837 mode = os.stat(os.path.join(maildir, "new"))[stat.ST_MODE]
838 if not stat.S_ISDIR(mode):
839 sys.stderr.write("Broken maildir: %s\n" %(maildir))
841 os.mkdir(os.path.join(maildir, "new"))
843 sys.stderr.write("Broken maildir: %s\n" %(maildir))
# maildir didn't exist: create it and its three subdirectories
848 sys.stderr.write("Couldn't create root maildir %s\n" \
852 os.mkdir(os.path.join(maildir, "new"))
853 os.mkdir(os.path.join(maildir, "cur"))
854 os.mkdir(os.path.join(maildir, "tmp"))
857 "Couldn't create required maildir directories for %s\n" \
861 # right - we've got the directories, we've got the section, we know the
# the section name is used as the feed URL
864 parse_and_deliver(maildir, section, state_dir)