4 # rss2maildir.py - RSS feeds to Maildir, one email per item
5 # Copyright (C) 2007 Brett Parker <iDunno@sommitrealweird.co.uk>
7 # This program is free software: you can redistribute it and/or modify
8 # it under the terms of the GNU General Public License as published by
9 # the Free Software Foundation, either version 3 of the License, or
10 # (at your option) any later version.
12 # This program is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 # GNU General Public License for more details.
17 # You should have received a copy of the GNU General Public License
18 # along with this program. If not, see <http://www.gnu.org/licenses/>.
28 from email.MIMEMultipart import MIMEMultipart
29 from email.MIMEText import MIMEText
38 from optparse import OptionParser
39 from ConfigParser import SafeConfigParser
41 from base64 import b64encode
47 from HTMLParser import HTMLParser
49 class HTML2Text(HTMLParser):
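# HTML2Text walks HTML with HTMLParser and accumulates a plain-text
# rendering in self.text: headings get underlined, block elements are
# wrapped to self.textwidth, lists are indented, and link targets are
# collected in self.urls for a trailing reference list.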
90 def __init__(self,textwidth=70):
93 self.textwidth = textwidth
96 self.ignorenodata = False
99 HTMLParser.__init__(self)
101 def handle_starttag(self, tag, attrs):
102 tag_name = tag.lower()
103 if tag_name in self.blockleveltags:
104 # handle starting a new block - unless we're in a block element
105 # that can contain other blocks, we'll assume that we want to close the previous one
107 if len(self.opentags) > 1 and self.opentags[-1] == u'li':
108 self.handle_curdata()
110 if tag_name == u'ol':
111 self.handle_curdata()
112 self.listcount.append(1)
113 self.listlevel = len(self.listcount) - 1
115 if tag_name in self.liststarttags:
116 smallist = self.opentags[-3:-1]
118 for prev_listtag in smallist:
119 if prev_listtag in [u'dl', u'ol']:
120 self.indentlevel = self.indentlevel + 4
122 elif prev_listtag == u'ul':
123 self.indentlevel = self.indentlevel + 3
126 if len(self.opentags) > 0:
127 self.handle_curdata()
128 if tag_name not in self.cancontainflow:
130 self.opentags.append(tag_name)
132 if tag_name == "span":
136 listcount = self.listcount[-1]
140 if tag_name == u'dd' and len(self.opentags) > 1 \
141 and self.opentags[-1] == u'dt':
142 self.handle_curdata()
144 elif tag_name == u'dt' and len(self.opentags) > 1 \
145 and self.opentags[-1] == u'dd':
146 self.handle_curdata()
148 elif tag_name == u'a':
150 if attr[0].lower() == u'href':
151 self.urls.append(attr[1])
152 self.curdata = self.curdata + u'`'
153 self.opentags.append(tag_name)
155 elif tag_name == u'img':
156 self.handle_image(attrs)
158 elif tag_name == u'br':
162 # we don't know the tag, so let's avoid handling it!
165 def handle_startendtag(self, tag, attrs):
166 if tag.lower() == u'br':
168 elif tag.lower() == u'img':
169 self.handle_image(attrs)
173 self.handle_curdata()
174 self.opentags.append(u'br')
175 self.handle_curdata()
178 def handle_image(self, attrs):
183 alt = attr[1].decode('utf-8')
184 elif attr[0] == 'src':
185 url = attr[1].decode('utf-8')
187 self.curdata = self.curdata \
191 self.curdata = self.curdata \
195 self.curdata = self.curdata \
198 def handle_curdata(self):
200 if len(self.opentags) == 0:
203 tag_thats_done = self.opentags[-1]
205 if len(self.curdata) == 0:
208 if tag_thats_done == u'br':
209 if len(self.text) == 0 or self.text[-1] != '\n':
210 self.text = self.text + '\n'
211 self.ignorenodata = True
214 if len(self.curdata.strip()) == 0:
217 if tag_thats_done in self.blockleveltags:
218 newlinerequired = self.text != u''
219 if self.ignorenodata:
220 newlinerequired = False
221 self.ignorenodata = False
223 and len(self.text) > 2 \
224 and self.text[-1] != u'\n' \
225 and self.text[-2] != u'\n':
226 self.text = self.text + u'\n\n'
228 if tag_thats_done in ["h1", "h2", "h3", "h4", "h5", "h6"]:
231 headingtext = unicode( \
232 self.curdata.encode("utf-8").strip(), "utf-8")
233 seperator = u'\n' + u' '*self.indentlevel
234 headingtext = seperator.join( \
237 self.textwidth - self.indentlevel \
241 if tag_thats_done == u'h2':
243 elif tag_thats_done != u'h1':
246 if u'\n' in headingtext:
247 underline = u' ' * self.indentlevel \
248 + underlinechar * (self.textwidth - self.indentlevel)
250 underline = u' ' * self.indentlevel \
251 + underlinechar * len(headingtext)
252 self.text = self.text \
253 + headingtext + u'\n' \
255 elif tag_thats_done in [u'p', u'div']:
256 paragraph = unicode( \
257 self.curdata.strip().encode("utf-8"), "utf-8")
258 seperator = u'\n' + u' ' * self.indentlevel
259 self.text = self.text \
260 + u' ' * self.indentlevel \
263 paragraph, self.textwidth - self.indentlevel))
264 elif tag_thats_done == "pre":
265 self.text = self.text + unicode( \
266 self.curdata.encode("utf-8"), "utf-8")
267 elif tag_thats_done == u'blockquote':
269 self.curdata.encode("utf-8").strip(), "utf-8")
270 seperator = u'\n' + u' ' * self.indentlevel + u'> '
271 if len(self.text) > 0 and self.text[-1] != u'\n':
272 self.text = self.text + u'\n'
273 self.text = self.text \
278 self.textwidth - self.indentlevel - 2 \
282 elif tag_thats_done == "li":
283 item = unicode(self.curdata.encode("utf-8").strip(), "utf-8")
284 if len(self.text) > 0 and self.text[-1] != u'\n':
285 self.text = self.text + u'\n'
286 # work out if we're in an ol rather than a ul
287 latesttags = self.opentags[-4:]
290 for thing in latesttags:
304 listmarker = u' %2d. ' %(self.listcount[-1])
305 self.listcount[-1] = self.listcount[-1] + 1
308 + u' ' * self.indentlevel \
310 self.text = self.text \
311 + u' ' * self.indentlevel \
316 self.textwidth - self.indentlevel - listindent \
320 elif tag_thats_done == u'dt':
321 definition = unicode(self.curdata.encode("utf-8").strip(), "utf-8")
322 if len(self.text) > 0 and self.text[-1] != u'\n':
323 self.text = self.text + u'\n\n'
324 elif len(self.text) > 1 and self.text[-2] != u'\n':
325 self.text = self.text + u'\n'
326 definition = u' ' * self.indentlevel + definition + "::"
327 indentstring = u'\n' + u' ' * (self.indentlevel + 1)
328 self.text = self.text \
330 textwrap.wrap(definition, \
331 self.textwidth - self.indentlevel - 1))
333 elif tag_thats_done == u'dd':
334 definition = unicode(self.curdata.encode("utf-8").strip(), "utf-8")
335 if len(definition) > 0:
336 if len(self.text) > 0 and self.text[-1] != u'\n':
337 self.text = self.text + u'\n'
338 indentstring = u'\n' + u' ' * (self.indentlevel + 4)
339 self.text = self.text \
340 + u' ' * (self.indentlevel + 4) \
341 + indentstring.join( \
344 self.textwidth - self.indentlevel - 4 \
348 elif tag_thats_done == u'a':
349 self.curdata = self.curdata + u'`__'
351 elif tag_thats_done in self.liststarttags:
354 if tag_thats_done in self.blockleveltags:
357 self.ignorenodata = False
359 def handle_endtag(self, tag):
360 self.ignorenodata = False
365 tagindex = self.opentags.index(tag)
370 if tag in [u'br', u'img']:
373 if tag in self.liststarttags:
374 if tag in [u'ol', u'dl', u'ul']:
375 self.handle_curdata()
376 # find if there was a previous list level
377 smalllist = self.opentags[:-1]
379 for prev_listtag in smalllist:
380 if prev_listtag in [u'ol', u'dl']:
381 self.indentlevel = self.indentlevel - 4
383 elif prev_listtag == u'ul':
384 self.indentlevel = self.indentlevel - 3
388 self.listcount = self.listcount[:-1]
390 while tagindex < len(self.opentags) \
391 and tag in self.opentags[tagindex+1:]:
393 tagindex = self.opentags.index(tag, tagindex+1)
395 # well, we don't want to do that then
397 if tagindex != len(self.opentags) - 1:
398 # assume the pending data belongs to the most recently opened tag
399 self.handle_curdata()
400 # then trim the open-tag list so that this tag is the innermost one
401 self.opentags = self.opentags[:tagindex + 1]
403 self.handle_curdata()
404 if self.opentags[-1] == tag:
407 def handle_data(self, data):
408 if len(self.opentags) == 0:
409 self.opentags.append(u'p')
410 self.curdata = self.curdata + unicode(data, "utf-8")
412 def handle_entityref(self, name):
414 if HTML2Text.entities.has_key(name.lower()):
415 entity = HTML2Text.entities[name.lower()]
417 entity = unichr(int(name[1:]))
419 entity = "&" + name + ";"
421 self.curdata = self.curdata + unicode(entity, "utf-8")
424 self.handle_curdata()
425 if len(self.text) == 0 or self.text[-1] != u'\n':
426 self.text = self.text + u'\n'
428 if len(self.text) > 0:
429 while len(self.text) > 1 and self.text[-1] == u'\n':
430 self.text = self.text[:-1]
431 self.text = self.text + u'\n'
432 if len(self.urls) > 0:
433 self.text = self.text + u'\n__ ' + u'\n__ '.join(self.urls) + u'\n'
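# A minimal usage sketch of the class above: feed() takes a UTF-8 byte
# string of HTML and gettext() returns the wrapped plain text, with any
# link targets listed at the end. The helper name and sample markup here
# are purely illustrative.
def _html2text_example():
    converter = HTML2Text(textwidth=70)
    converter.feed("<h1>Example</h1><p>See <a href='http://example.com/'>this page</a>.</p>")
    return converter.gettext()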
437 def open_url(method, url):
439 while redirectcount < 3:
440 (type, rest) = urllib.splittype(url)
441 (host, path) = urllib.splithost(rest)
442 (host, port) = urllib.splitport(host)
446 conn = httplib.HTTPConnection("%s:%s" %(host, port))
447 conn.request(method, path)
448 response = conn.getresponse()
449 if response.status in [301, 302, 303, 307]:
450 headers = response.getheaders()
451 for header in headers:
452 if header[0] == "location":
454 elif response.status == 200:
458 redirectcount = redirectcount + 1
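# open_url() follows up to three redirects (301/302/303/307) and hands
# back the httplib response object once it gets a 200; the callers below
# treat anything else as a failed fetch. A usage sketch, with an
# illustrative URL and helper name:
def _open_url_example():
    response = open_url("GET", "http://example.com/feed.rss")
    if response is not None:
        return response.read()
    return None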
461 def parse_and_deliver(maildir, url, statedir):
464 # first check if we know about this feed already
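# (the "feeds" database maps each feed URL to a urlencoded record of the
# HTTP validator headers - content-length, etag, last-modified and
# content-md5 - from the last fetch, so a cheap HEAD request can tell us
# whether the feed has changed at all)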
465 feeddb = dbm.open(os.path.join(statedir, "feeds"), "c")
466 if feeddb.has_key(url):
468 data = cgi.parse_qs(data)
469 response = open_url("HEAD", url)
472 headers = response.getheaders()
475 for header in headers:
476 if header[0] == "content-length":
477 if header[1] != data["content-length"][0]:
479 elif header[0] == "etag":
480 if header[1] != data["etag"][0]:
482 elif header[0] == "last-modified":
483 if header[1] != data["last-modified"][0]:
485 elif header[0] == "content-md5":
486 if header[1] != data["content-md5"][0]:
491 response = open_url("GET", url)
493 headers = response.getheaders()
494 feedhandle = response
496 sys.stderr.write("Failed to fetch feed: %s\n" %(url))
499 return # don't need to do anything, nothing's changed.
501 response = open_url("GET", url)
503 headers = response.getheaders()
504 feedhandle = response
506 sys.stderr.write("Failed to fetch feed: %s\n" %(url))
509 fp = feedparser.parse(feedhandle)
510 db = dbm.open(os.path.join(statedir, "seen"), "c")
511 for item in fp["items"]:
512 # have we seen it before?
513 # need to work out what the content is first...
515 if item.has_key("content"):
516 content = item["content"][0]["value"]
518 content = item["summary"]
520 md5sum = md5.md5(content.encode("utf-8")).hexdigest()
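# the MD5 of the item's rendered content is the change marker: it is
# compared against what the "seen" database recorded for this guid/link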
524 # check if there's a guid too - if that exists and the md5 matches, we've already seen this exact item
526 if item.has_key("guid"):
527 if db.has_key(url + "|" + item["guid"]):
528 data = db[url + "|" + item["guid"]]
529 data = cgi.parse_qs(data)
530 if data["contentmd5"][0] == md5sum:
533 if db.has_key(url + "|" + item["link"]):
534 data = db[url + "|" + item["link"]]
535 data = cgi.parse_qs(data)
536 if data.has_key("message-id"):
537 prevmessageid = data["message-id"][0]
538 if data["contentmd5"][0] == md5sum:
542 author = item["author"]
546 # create a basic email message
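# (each item becomes a multipart/alternative message carrying a
# plain-text rendering produced by HTML2Text alongside the original HTML)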
547 msg = MIMEMultipart("alternative")
549 + datetime.datetime.now().strftime("%Y%m%d%H%M") \
553 string.ascii_letters + string.digits \
554 ) for a in range(0,6) \
555 ]) + "@" + socket.gethostname() + ">"
556 msg.add_header("Message-ID", messageid)
557 msg.set_unixfrom("\"%s\" <rss2maildir@localhost>" %(url))
558 msg.add_header("From", "\"%s\" <rss2maildir@localhost>" %(author))
559 msg.add_header("To", "\"%s\" <rss2maildir@localhost>" %(url))
561 msg.add_header("References", prevmessageid)
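# an updated item is threaded onto its earlier delivery: the previous
# Message-ID goes into References here and is also prepended to the
# message-id value stored back in the database below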
562 createddate = datetime.datetime.now() \
563 .strftime("%a, %e %b %Y %T -0000")
565 createddate = datetime.datetime(*item["updated_parsed"][0:6]) \
566 .strftime("%a, %e %b %Y %T -0000")
569 msg.add_header("Date", createddate)
570 msg.add_header("Subject", item["title"])
571 msg.set_default_type("text/plain")
573 htmlcontent = content.encode("utf-8")
574 htmlcontent = "%s\n\n<p>Item URL: <a href='%s'>%s</a></p>" %( \
578 htmlpart = MIMEText(htmlcontent.encode("utf-8"), "html", "utf-8")
579 textparser = HTML2Text()
580 textparser.feed(content.encode("utf-8"))
581 textcontent = textparser.gettext()
582 textcontent = "%s\n\nItem URL: %s" %( \
585 textpart = MIMEText(textcontent.encode("utf-8"), "plain", "utf-8")
589 # start by working out the filename we should be writing to; we do
590 # this following the normal maildir style rules
591 fname = str(os.getpid()) \
592 + "." + socket.gethostname() \
595 string.ascii_letters + string.digits \
596 ) for a in range(0,10) \
598 + datetime.datetime.now().strftime('%s')
599 fn = os.path.join(maildir, "tmp", fname)
601 fh.write(msg.as_string())
603 # now move it into the new directory
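# (standard maildir delivery: write the full message under tmp/ first,
# then move it into new/, so a mail reader never sees a partial file)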
604 newfn = os.path.join(maildir, "new", fname)
608 # now record this item in the database
610 messageid = prevmessageid + " " + messageid
611 if item.has_key("guid") and item["guid"] != item["link"]:
612 data = urllib.urlencode(( \
613 ("message-id", messageid), \
614 ("created", createddate), \
615 ("contentmd5", md5sum) \
617 db[url + "|" + item["guid"]] = data
619 data = db[url + "|" + item["link"]]
620 data = cgi.parse_qs(data)
621 newdata = urllib.urlencode(( \
622 ("message-id", messageid), \
623 ("created", data["created"][0]), \
624 ("contentmd5", data["contentmd5"][0]) \
626 db[url + "|" + item["link"]] = newdata
628 db[url + "|" + item["link"]] = data
630 data = urllib.urlencode(( \
631 ("message-id", messageid), \
632 ("created", createddate), \
633 ("contentmd5", md5sum) \
635 db[url + "|" + item["link"]] = data
639 for header in headers:
641 ["content-md5", "etag", "last-modified", "content-length"]:
642 data.append((header[0], header[1]))
644 data = urllib.urlencode(data)
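# this urlencoded header record is what the HEAD check at the top of the
# function compares against on the next run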
650 if __name__ == "__main__":
651 # This only gets executed if the script is run directly
652 # first off, parse the command line arguments
654 oparser = OptionParser()
656 "-c", "--conf", dest="conf",
657 help="location of config file"
660 "-s", "--statedir", dest="statedir",
661 help="location of directory to store state in"
664 (options, args) = oparser.parse_args()
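# typical invocation:  rss2maildir.py --conf ~/.rss2maildir.conf
# (both options are optional; default locations are searched for below)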
666 # check for the configfile
670 if options.conf != None:
671 # does the file exist?
673 os.stat(options.conf)
674 configfile = options.conf
676 # should exit here as the specified file doesn't exist
678 "Config file %s does not exist. Exiting.\n" %(options.conf,))
681 # check through the default locations
683 os.stat("%s/.rss2maildir.conf" %(os.environ["HOME"],))
684 configfile = "%s/.rss2maildir.conf" %(os.environ["HOME"],)
687 os.stat("/etc/rss2maildir.conf")
688 configfile = "/etc/rss2maildir.conf"
690 sys.stderr.write("No config file found. Exiting.\n")
693 # Right - if we've got this far, we've got a config file; now for the hard bit: parsing it
696 scp = SafeConfigParser()
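# The config file is plain INI syntax: a [general] section for state_dir
# and maildir_root, then one section per feed URL which may name the
# maildir to deliver into. A sketch with illustrative paths and URL:
#
#   [general]
#   state_dir = /home/user/.rss2maildir/state
#   maildir_root = /home/user/RSSMaildir
#
#   [http://example.com/feed.rss]
#   maildir = example-feed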
699 maildir_root = "RSSMaildir"
702 if options.statedir != None:
703 state_dir = options.statedir
705 mode = os.stat(state_dir)[stat.ST_MODE]
706 if not stat.S_ISDIR(mode):
708 "State directory (%s) is not a directory\n" %(state_dir))
711 # try to make the directory
715 sys.stderr.write("Couldn't create statedir %s" %(state_dir))
717 elif scp.has_option("general", "state_dir"):
718 new_state_dir = scp.get("general", "state_dir")
720 mode = os.stat(state_dir)[stat.ST_MODE]
721 if not stat.S_ISDIR(mode):
723 "State directory (%s) is not a directory\n" %(state_dir))
728 os.mkdir(new_state_dir)
729 state_dir = new_state_dir
732 "Couldn't create state directory %s\n" %(new_state_dir))
736 mode = os.stat(state_dir)[stat.ST_MODE]
737 if not stat.S_ISDIR(mode):
739 "State directory %s is not a directory\n" %(state_dir))
746 "State directory %s could not be created\n" %(state_dir))
749 if scp.has_option("general", "maildir_root"):
750 maildir_root = scp.get("general", "maildir_root")
753 mode = os.stat(maildir_root)[stat.ST_MODE]
754 if not stat.S_ISDIR(mode):
756 "Maildir Root %s is not a directory\n" \
761 os.mkdir(maildir_root)
763 sys.stderr.write("Couldn't create Maildir Root %s\n" \
767 feeds = scp.sections()
769 feeds.remove("general")
773 for section in feeds:
774 # check if the directory exists
777 maildir = scp.get(section, "maildir")
781 maildir = urllib.urlencode(((section, maildir),)).split("=")[1]
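# (the urlencode() call above percent-encodes the maildir name, which may
# be the feed URL itself, so characters such as '/' are escaped before it
# is joined onto maildir_root as a single directory name)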
782 maildir = os.path.join(maildir_root, maildir)
785 exists = os.stat(maildir)
786 if stat.S_ISDIR(exists[stat.ST_MODE]):
787 # check that the new, cur and tmp subdirectories exist
789 mode = os.stat(os.path.join(maildir, "cur"))[stat.ST_MODE]
791 os.mkdir(os.path.join(maildir, "cur"))
792 if not stat.S_ISDIR(mode):
793 sys.stderr.write("Broken maildir: %s\n" %(maildir))
795 mode = os.stat(os.path.join(maildir, "tmp"))[stat.ST_MODE]
797 os.mkdir(os.path.join(maildir, "tmp"))
798 if not stat.S_ISDIR(mode):
799 sys.stderr.write("Broken maildir: %s\n" %(maildir))
801 mode = os.stat(os.path.join(maildir, "new"))[stat.ST_MODE]
802 if not stat.S_ISDIR(mode):
803 sys.stderr.write("Broken maildir: %s\n" %(maildir))
805 os.mkdir(os.path.join(maildir, "new"))
807 sys.stderr.write("Broken maildir: %s\n" %(maildir))
812 sys.stderr.write("Couldn't create root maildir %s\n" \
816 os.mkdir(os.path.join(maildir, "new"))
817 os.mkdir(os.path.join(maildir, "cur"))
818 os.mkdir(os.path.join(maildir, "tmp"))
821 "Couldn't create required maildir directories for %s\n" \
825 # right - we've got the directories and the section (the feed URL), so fetch and deliver it
828 parse_and_deliver(maildir, section, state_dir)