4 # rss2maildir.py - RSS feeds to Maildir, one email per item
5 # Copyright (C) 2007 Brett Parker <iDunno@sommitrealweird.co.uk>
7 # This program is free software: you can redistribute it and/or modify
8 # it under the terms of the GNU General Public License as published by
9 # the Free Software Foundation, either version 3 of the License, or
10 # (at your option) any later version.
12 # This program is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 # GNU General Public License for more details.
17 # You should have received a copy of the GNU General Public License
18 # along with this program. If not, see <http://www.gnu.org/licenses/>.
28 from email.MIMEMultipart import MIMEMultipart
29 from email.MIMEText import MIMEText
38 from optparse import OptionParser
39 from ConfigParser import SafeConfigParser
41 from base64 import b64encode
47 from HTMLParser import HTMLParser
49 class HTML2Text(HTMLParser):
# HTMLParser subclass (Python 2) that turns the HTML fed to it into wrapped
# plain text. Character data is buffered in self.curdata and flushed into
# self.text by handle_curdata(), formatted according to the innermost open
# tag (self.opentags[-1]). Link targets are collected in self.urls and
# appended to the text as "__ <url>" lines.
# NOTE(review): blockleveltags, liststarttags, cancontainflow, entities and
# the initial values of text/curdata/opentags/urls/listcount/indentlevel are
# referenced below but are not defined in the lines visible here.
93 def __init__(self,textwidth=70):
# textwidth: the column width used when wrapping text in handle_curdata().
96 self.textwidth = textwidth
# ignorenodata is set after a <br> has been emitted (see handle_curdata) so
# that the next block-level element does not add extra blank lines.
99 self.ignorenodata = False
102 HTMLParser.__init__(self)
104 def handle_starttag(self, tag, attrs):
# Called by HTMLParser for every opening tag. Flushes pending character
# data where a new element ends the previous one, records the tag in
# self.opentags, and maintains the list counters and indentation level.
105 tag_name = tag.lower()
106 if tag_name in self.blockleveltags:
107 # handle starting a new block - unless we're in a block element
108 # that can contain other blocks, we'll assume that we want to close
110 if len(self.opentags) > 1 and self.opentags[-1] == u'li':
111 self.handle_curdata()
# Every <ol> gets its own item counter, starting at 1; listlevel is the
# index of the innermost counter.
113 if tag_name == u'ol':
114 self.handle_curdata()
115 self.listcount.append(1)
116 self.listlevel = len(self.listcount) - 1
# A list opened inside another list is indented further: 4 columns for an
# enclosing dl/ol and 3 for an enclosing ul, judged from opentags[-3:-1].
118 if tag_name in self.liststarttags:
119 smallist = self.opentags[-3:-1]
121 for prev_listtag in smallist:
122 if prev_listtag in [u'dl', u'ol']:
123 self.indentlevel = self.indentlevel + 4
125 elif prev_listtag == u'ul':
126 self.indentlevel = self.indentlevel + 3
129 if len(self.opentags) > 0:
130 self.handle_curdata()
131 if tag_name not in self.cancontainflow:
133 self.opentags.append(tag_name)
135 if tag_name == "span":
139 listcount = self.listcount[-1]
# A <dd> arriving while a <dt> is still open (or the reverse) implicitly
# ends the previous element, so its text is flushed first.
143 if tag_name == u'dd' and len(self.opentags) > 1 \
144 and self.opentags[-1] == u'dt':
145 self.handle_curdata()
147 elif tag_name == u'dt' and len(self.opentags) > 1 \
148 and self.opentags[-1] == u'dd':
149 self.handle_curdata()
151 elif tag_name == u'a':
# Links: remember the href target and open a backquote in the buffered
# text; handle_curdata() later closes it with u'`__' (this resembles
# reStructuredText anonymous hyperlink markup).
153 if attr[0].lower() == u'href':
154 self.urls.append(attr[1].decode('utf-8'))
155 self.curdata = self.curdata + u'`'
156 self.opentags.append(tag_name)
158 elif tag_name == u'img':
159 self.handle_image(attrs)
161 elif tag_name == u'br':
165 # we don't know the tag, so lets avoid handling it!
168 def handle_startendtag(self, tag, attrs):
# Called by HTMLParser for XHTML-style empty tags such as <br/> and <img/>.
169 if tag.lower() == u'br':
171 elif tag.lower() == u'img':
172 self.handle_image(attrs)
# NOTE(review): the def line owning the next three statements is not visible
# here. They flush pending text, push a u'br' marker onto opentags and flush
# again, which makes handle_curdata() emit the line break.
176 self.handle_curdata()
177 self.opentags.append(u'br')
178 self.handle_curdata()
181 def handle_image(self, attrs):
# Appends a textual stand-in for an image to self.curdata, built from the
# decoded alt and src attribute values.
# NOTE(review): the exact output format is on lines not visible here.
186 alt = attr[1].decode('utf-8')
187 elif attr[0] == 'src':
188 url = attr[1].decode('utf-8')
190 self.curdata = self.curdata \
194 self.curdata = self.curdata \
198 self.curdata = self.curdata \
201 def handle_curdata(self):
# Flushes the buffered character data (self.curdata) into self.text, laid
# out according to the most recently opened tag.
203 if len(self.opentags) == 0:
206 tag_thats_done = self.opentags[-1]
208 if len(self.curdata) == 0:
# A <br> marker: make sure the text ends with a newline and flag that the
# following block should not add its own blank-line separation.
211 if tag_thats_done == u'br':
212 if len(self.text) == 0 or self.text[-1] != '\n':
213 self.text = self.text + '\n'
214 self.ignorenodata = True
217 if len(self.curdata.strip()) == 0:
# Spacing before a block-level element: dt/dd/li only need the text to end
# with one newline, other blocks get a blank line, unless a <br> was just
# emitted (ignorenodata).
220 if tag_thats_done in self.blockleveltags:
221 newlinerequired = self.text != u''
222 if self.ignorenodata:
223 newlinerequired = False
224 self.ignorenodata = False
226 if tag_thats_done in [u'dt', u'dd', u'li'] \
227 and len(self.text) > 1 \
228 and self.text[-1] != u'\n':
229 self.text = self.text + u'\n'
230 elif len(self.text) > 2 \
231 and self.text[-1] != u'\n' \
232 and self.text[-2] != u'\n':
233 self.text = self.text + u'\n\n'
235 if tag_thats_done in ["h1", "h2", "h3", "h4", "h5", "h6"]:
# Headings: whitespace is collapsed, the text is wrapped and then
# underlined. The underline character depends on the heading level (chosen
# on lines not visible here). A heading that wrapped onto several lines is
# underlined across the full remaining width, otherwise to its own length.
238 headingtext = " ".join(self.curdata.split())
239 seperator = u'\n' + u' '*self.indentlevel
240 headingtext = seperator.join( \
243 self.textwidth - self.indentlevel \
247 if tag_thats_done == u'h2':
249 elif tag_thats_done != u'h1':
252 if u'\n' in headingtext:
253 underline = u' ' * self.indentlevel \
254 + underlinechar * (self.textwidth - self.indentlevel)
256 underline = u' ' * self.indentlevel \
257 + underlinechar * len(headingtext)
258 self.text = self.text \
259 + headingtext + u'\n' \
261 elif tag_thats_done in [u'p', u'div']:
# Paragraphs: collapse runs of whitespace, then wrap at the current indent.
262 paragraph = unicode( \
263 " ".join(self.curdata.strip().encode("utf-8").split()), \
265 seperator = u'\n' + u' ' * self.indentlevel
266 self.text = self.text \
267 + u' ' * self.indentlevel \
270 paragraph, self.textwidth - self.indentlevel))
271 elif tag_thats_done == "pre":
# NOTE(review): whitespace inside <pre> is collapsed by split()/join here,
# so preformatted layout is not preserved - confirm this is intended.
272 self.text = self.text + unicode( \
273 " ".join(self.curdata.encode("utf-8").split()), "utf-8")
274 elif tag_thats_done == u'blockquote':
# Block quotes: each continuation line is joined with a u'> ' prefix at the
# current indent, and the wrap width is reduced by 2 to make room for it.
276 " ".join(self.curdata.encode("utf-8").strip().split()), \
278 seperator = u'\n' + u' ' * self.indentlevel + u'> '
279 if len(self.text) > 0 and self.text[-1] != u'\n':
280 self.text = self.text + u'\n'
281 self.text = self.text \
286 self.textwidth - self.indentlevel - 2 \
290 elif tag_thats_done == "li":
# List items: start on a fresh line. The last four open tags are inspected
# to decide between numbered and bulleted output; numbered items use the
# innermost counter in self.listcount, which is then incremented.
291 item = unicode(self.curdata.encode("utf-8").strip(), "utf-8")
292 if len(self.text) > 0 and self.text[-1] != u'\n':
293 self.text = self.text + u'\n'
294 # work out if we're in an ol rather than a ul
295 latesttags = self.opentags[-4:]
298 for thing in latesttags:
312 listmarker = u' %2d. ' %(self.listcount[-1])
313 self.listcount[-1] = self.listcount[-1] + 1
316 + u' ' * self.indentlevel \
318 self.text = self.text \
319 + u' ' * self.indentlevel \
324 self.textwidth - self.indentlevel - listindent \
328 elif tag_thats_done == u'dt':
# Definition terms: whitespace-collapsed, suffixed with "::" and wrapped;
# continuation lines are indented one extra column.
329 definition = unicode(" ".join( \
330 self.curdata.encode("utf-8").strip().split()), \
332 if len(self.text) > 0 and self.text[-1] != u'\n':
333 self.text = self.text + u'\n\n'
334 elif len(self.text) > 1 and self.text[-2] != u'\n':
335 self.text = self.text + u'\n'
336 definition = u' ' * self.indentlevel + definition + "::"
337 indentstring = u'\n' + u' ' * (self.indentlevel + 1)
338 self.text = self.text \
340 textwrap.wrap(definition, \
341 self.textwidth - self.indentlevel - 1))
343 elif tag_thats_done == u'dd':
# Definition descriptions: emitted only when non-empty, indented four
# columns deeper than the current indent level.
344 definition = unicode(" ".join( \
345 self.curdata.encode("utf-8").strip().split()),
347 if len(definition) > 0:
348 if len(self.text) > 0 and self.text[-1] != u'\n':
349 self.text = self.text + u'\n'
350 indentstring = u'\n' + u' ' * (self.indentlevel + 4)
351 self.text = self.text \
352 + u' ' * (self.indentlevel + 4) \
353 + indentstring.join( \
356 self.textwidth - self.indentlevel - 4 \
360 elif tag_thats_done == u'a':
# Closes the backquote opened in handle_starttag; the link text stays in
# curdata rather than being written to self.text at this point.
361 self.curdata = self.curdata + u'`__'
363 elif tag_thats_done in self.liststarttags:
366 if tag_thats_done in self.blockleveltags:
369 self.ignorenodata = False
371 def handle_endtag(self, tag):
# Called by HTMLParser for every closing tag. Flushes pending text, undoes
# list indentation/counters for list containers, and trims self.opentags.
372 self.ignorenodata = False
# NOTE(review): list.index raises ValueError when the tag was never opened;
# any handler for that is on lines not visible here.
377 tagindex = self.opentags.index(tag)
382 if tag in [u'br', u'img']:
# Closing a list container: flush, then reverse the indentation that
# handle_starttag added for an enclosing list and drop a list counter.
385 if tag in self.liststarttags:
386 if tag in [u'ol', u'dl', u'ul']:
387 self.handle_curdata()
388 # find if there was a previous list level
389 smalllist = self.opentags[:-1]
391 for prev_listtag in smalllist:
392 if prev_listtag in [u'ol', u'dl']:
393 self.indentlevel = self.indentlevel - 4
395 elif prev_listtag == u'ul':
396 self.indentlevel = self.indentlevel - 3
400 self.listcount = self.listcount[:-1]
# Advance tagindex to the last (innermost) occurrence of this tag.
402 while tagindex < len(self.opentags) \
403 and tag in self.opentags[tagindex+1:]:
405 tagindex = self.opentags.index(tag, tagindex+1)
407 # well, we don't want to do that then
409 if tagindex != len(self.opentags) - 1:
410 # Assuming the data was for the last opened tag first
411 self.handle_curdata()
412 # Now kill the list to be a slice before this tag was opened
413 self.opentags = self.opentags[:tagindex + 1]
415 self.handle_curdata()
416 if self.opentags[-1] == tag:
419 def handle_data(self, data):
# Buffers character data. Text that appears outside any open tag is treated
# as if it were inside a <p>.
420 if len(self.opentags) == 0:
421 self.opentags.append(u'p')
422 self.curdata = self.curdata + data.decode("utf-8")
424 def handle_entityref(self, name):
# Called by HTMLParser for &name; references. Known names are looked up
# (lower-cased) in HTML2Text.entities; unknown ones are kept verbatim as
# "&name;".
426 if HTML2Text.entities.has_key(name.lower()):
427 entity = HTML2Text.entities[name.lower()]
# NOTE(review): unichr() returns a unicode object, and in Python 2
# unicode(<unicode>, "utf-8") raises TypeError - verify how this value
# reaches the unicode(entity, "utf-8") call below.
429 entity = unichr(int(name[1:]))
431 entity = "&" + name + ";"
433 self.curdata = self.curdata + unicode(entity, "utf-8")
# NOTE(review): the def line owning the statements below is not visible;
# parse_and_deliver calls HTML2Text().gettext(), so presumably this is
# gettext(). It flushes pending text, reduces trailing newlines to exactly
# one, then appends the collected link targets as "__ <url>" lines.
436 self.handle_curdata()
437 if len(self.text) == 0 or self.text[-1] != u'\n':
438 self.text = self.text + u'\n'
440 if len(self.text) > 0:
441 while len(self.text) > 1 and self.text[-1] == u'\n':
442 self.text = self.text[:-1]
443 self.text = self.text + u'\n'
# One target line per collected link, in the order the links were seen.
444 if len(self.urls) > 0:
445 self.text = self.text + u'\n__ ' + u'\n__ '.join(self.urls) + u'\n'
449 def open_url(method, url):
# Issues an HTTP request for `url` with the given `method` (callers in this
# file pass "HEAD" and "GET") over httplib.HTTPConnection, making at most
# three attempts while following redirects.
# NOTE(review): the redirectcount initialisation, the port default, any
# exception handling and the return statements are on lines not visible
# here. Callers treat the result as a response object with getheaders().
# NOTE(review): the local name `type` shadows the builtin.
451 while redirectcount < 3:
452 (type, rest) = urllib.splittype(url)
453 (host, path) = urllib.splithost(rest)
454 (host, port) = urllib.splitport(host)
458 conn = httplib.HTTPConnection("%s:%s" %(host, port))
459 conn.request(method, path)
460 response = conn.getresponse()
# Redirect statuses: scan the response headers for "location" to find the
# next URL to try.
461 if response.status in [301, 302, 303, 307]:
462 headers = response.getheaders()
463 for header in headers:
464 if header[0] == "location":
466 elif response.status == 200:
470 redirectcount = redirectcount + 1
473 def parse_and_deliver(maildir, url, statedir):
# Fetches the feed at `url`, parses it with feedparser and writes one
# multipart/alternative (HTML + plain text) message per new or changed item.
#   maildir:  path of the target Maildir; file names are built under its
#             tmp/ and new/ subdirectories.
#   url:      feed URL; also the key prefix used in the state databases.
#   statedir: directory holding the dbm databases "feeds" (looked up by
#             feed URL) and "seen" (per-item records keyed "url|guid" or
#             "url|link").
476 # first check if we know about this feed already
477 feeddb = dbm.open(os.path.join(statedir, "feeds"), "c")
478 if feeddb.has_key(url):
480 data = cgi.parse_qs(data)
481 response = open_url("HEAD", url)
484 headers = response.getheaders()
# Compare the cache-related headers of the HEAD response with the values
# stored for this feed.
487 for header in headers:
488 if header[0] == "content-length":
489 if header[1] != data["content-length"][0]:
491 elif header[0] == "etag":
492 if header[1] != data["etag"][0]:
494 elif header[0] == "last-modified":
495 if header[1] != data["last-modified"][0]:
497 elif header[0] == "content-md5":
498 if header[1] != data["content-md5"][0]:
503 response = open_url("GET", url)
505 headers = response.getheaders()
506 feedhandle = response
508 sys.stderr.write("Failed to fetch feed: %s\n" %(url))
511 return # don't need to do anything, nothings changed.
513 response = open_url("GET", url)
515 headers = response.getheaders()
516 feedhandle = response
518 sys.stderr.write("Failed to fetch feed: %s\n" %(url))
# Parse the fetched feed and open the per-item state database.
521 fp = feedparser.parse(feedhandle)
522 db = dbm.open(os.path.join(statedir, "seen"), "c")
523 for item in fp["items"]:
524 # have we seen it before?
525 # need to work out what the content is first...
527 if item.has_key("content"):
528 content = item["content"][0]["value"]
530 content = item["summary"]
# The md5 of the item body is compared with the stored "contentmd5" to tell
# an unchanged item from an updated one.
532 md5sum = md5.md5(content.encode("utf-8")).hexdigest()
536 # check if there's a guid too - if that exists and we match the md5,
538 if item.has_key("guid"):
539 if db.has_key(url + "|" + item["guid"]):
540 data = db[url + "|" + item["guid"]]
541 data = cgi.parse_qs(data)
542 if data["contentmd5"][0] == md5sum:
545 if db.has_key(url + "|" + item["link"]):
546 data = db[url + "|" + item["link"]]
547 data = cgi.parse_qs(data)
548 if data.has_key("message-id"):
549 prevmessageid = data["message-id"][0]
550 if data["contentmd5"][0] == md5sum:
554 author = item["author"]
558 # create a basic email message
559 msg = MIMEMultipart("alternative")
561 + datetime.datetime.now().strftime("%Y%m%d%H%M") \
565 string.ascii_letters + string.digits \
566 ) for a in range(0,6) \
567 ]) + "@" + socket.gethostname() + ">"
# The From header carries the item author; To and the unix-from line carry
# the feed URL.
568 msg.add_header("Message-ID", messageid)
569 msg.set_unixfrom("\"%s\" <rss2maildir@localhost>" %(url))
570 msg.add_header("From", "\"%s\" <rss2maildir@localhost>" %(author))
571 msg.add_header("To", "\"%s\" <rss2maildir@localhost>" %(url))
# A References header pointing at the earlier message id(s) links an updated
# item to what was previously delivered for it.
# NOTE(review): datetime.now() is local time but is formatted with a fixed
# "-0000" offset below; %e and %T (and '%s' further down) are platform
# strftime extensions - confirm both are acceptable.
573 msg.add_header("References", prevmessageid)
574 createddate = datetime.datetime.now() \
575 .strftime("%a, %e %b %Y %T -0000")
577 createddate = datetime.datetime(*item["updated_parsed"][0:6]) \
578 .strftime("%a, %e %b %Y %T -0000")
581 msg.add_header("Date", createddate)
582 msg.add_header("Subject", item["title"])
583 msg.set_default_type("text/plain")
# HTML alternative: the item body followed by a link back to the item.
585 htmlcontent = content.encode("utf-8")
586 htmlcontent = "%s\n\n<p>Item URL: <a href='%s'>%s</a></p>" %( \
590 htmlpart = MIMEText(htmlcontent.encode("utf-8"), "html", "utf-8")
# Plain-text alternative: the same body rendered through HTML2Text.
591 textparser = HTML2Text()
592 textparser.feed(content.encode("utf-8"))
593 textcontent = textparser.gettext()
594 textcontent = "%s\n\nItem URL: %s" %( \
597 textpart = MIMEText(textcontent.encode("utf-8"), "plain", "utf-8")
601 # start by working out the filename we should be writting to, we do
602 # this following the normal maildir style rules
603 fname = str(os.getpid()) \
604 + "." + socket.gethostname() \
607 string.ascii_letters + string.digits \
608 ) for a in range(0,10) \
610 + datetime.datetime.now().strftime('%s')
611 fn = os.path.join(maildir, "tmp", fname)
613 fh.write(msg.as_string())
615 # now move it in to the new directory
616 newfn = os.path.join(maildir, "new", fname)
620 # now add to the database about the item
# The stored message-id accumulates the previous id(s) followed by the new
# one, separated by spaces.
622 messageid = prevmessageid + " " + messageid
623 if item.has_key("guid") and item["guid"] != item["link"]:
624 data = urllib.urlencode(( \
625 ("message-id", messageid), \
626 ("created", createddate), \
627 ("contentmd5", md5sum) \
629 db[url + "|" + item["guid"]] = data
631 data = db[url + "|" + item["link"]]
632 data = cgi.parse_qs(data)
633 newdata = urllib.urlencode(( \
634 ("message-id", messageid), \
635 ("created", data["created"][0]), \
636 ("contentmd5", data["contentmd5"][0]) \
638 db[url + "|" + item["link"]] = newdata
640 db[url + "|" + item["link"]] = data
642 data = urllib.urlencode(( \
643 ("message-id", messageid), \
644 ("created", createddate), \
645 ("contentmd5", md5sum) \
647 db[url + "|" + item["link"]] = data
# Collect the cache-related response headers and urlencode them; these are
# the same header names compared against the HEAD response at the top.
651 for header in headers:
653 ["content-md5", "etag", "last-modified", "content-length"]:
654 data.append((header[0], header[1]))
656 data = urllib.urlencode(data)
662 if __name__ == "__main__":
# Script entry point: parse the command line, locate and read the config
# file, make sure the state directory and the Maildir root exist, then
# process every feed section found in the config.
663 # This only gets executed if we really called the program
664 # first off, parse the command line arguments
666 oparser = OptionParser()
668 "-c", "--conf", dest="conf",
669 help="location of config file"
672 "-s", "--statedir", dest="statedir",
673 help="location of directory to store state in"
676 (options, args) = oparser.parse_args()
678 # check for the configfile
682 if options.conf != None:
683 # does the file exist?
685 os.stat(options.conf)
686 configfile = options.conf
688 # should exit here as the specified file doesn't exist
690 "Config file %s does not exist. Exiting.\n" %(options.conf,))
693 # check through the default locations
695 os.stat("%s/.rss2maildir.conf" %(os.environ["HOME"],))
696 configfile = "%s/.rss2maildir.conf" %(os.environ["HOME"],)
699 os.stat("/etc/rss2maildir.conf")
700 configfile = "/etc/rss2maildir.conf"
702 sys.stderr.write("No config file found. Exiting.\n")
705 # Right - if we've got this far, we've got a config file, now for the hard
708 scp = SafeConfigParser()
# Default Maildir root; replaced below if [general] maildir_root is set.
711 maildir_root = "RSSMaildir"
# State directory precedence: the --statedir option first, then
# [general] state_dir from the config file, then the existing default.
714 if options.statedir != None:
715 state_dir = options.statedir
717 mode = os.stat(state_dir)[stat.ST_MODE]
718 if not stat.S_ISDIR(mode):
720 "State directory (%s) is not a directory\n" %(state_dir))
723 # try to make the directory
# NOTE(review): unlike the other messages, the next one has no trailing
# newline.
727 sys.stderr.write("Couldn't create statedir %s" %(state_dir))
729 elif scp.has_option("general", "state_dir"):
730 new_state_dir = scp.get("general", "state_dir")
732 mode = os.stat(new_state_dir)[stat.ST_MODE]
# NOTE(review): the message below formats state_dir although the path that
# was just tested is new_state_dir - probably the wrong variable.
733 if not stat.S_ISDIR(mode):
735 "State directory (%s) is not a directory\n" %(state_dir))
738 state_dir = new_state_dir
742 os.mkdir(new_state_dir)
743 state_dir = new_state_dir
746 "Couldn't create state directory %s\n" %(new_state_dir))
750 mode = os.stat(state_dir)[stat.ST_MODE]
751 if not stat.S_ISDIR(mode):
753 "State directory %s is not a directory\n" %(state_dir))
760 "State directory %s could not be created\n" %(state_dir))
# Maildir root: taken from the config if present, then checked/created.
763 if scp.has_option("general", "maildir_root"):
764 maildir_root = scp.get("general", "maildir_root")
767 mode = os.stat(maildir_root)[stat.ST_MODE]
768 if not stat.S_ISDIR(mode):
770 "Maildir Root %s is not a directory\n" \
775 os.mkdir(maildir_root)
777 sys.stderr.write("Couldn't create Maildir Root %s\n" \
781 feeds = scp.sections()
# Every section other than "general" names a feed; the section name is
# passed to parse_and_deliver as the feed URL.
783 feeds.remove("general")
787 for section in feeds:
788 # check if the directory exists
791 maildir = scp.get(section, "maildir")
# urlencode() percent-quotes the maildir name (including any "/"), and the
# split keeps only the value half, so the result is a single path component
# under maildir_root.
795 maildir = urllib.urlencode(((section, maildir),)).split("=")[1]
796 maildir = os.path.join(maildir_root, maildir)
799 exists = os.stat(maildir)
800 if stat.S_ISDIR(exists[stat.ST_MODE]):
801 # check if there's a new, cur and tmp directory
803 mode = os.stat(os.path.join(maildir, "cur"))[stat.ST_MODE]
805 os.mkdir(os.path.join(maildir, "cur"))
806 if not stat.S_ISDIR(mode):
807 sys.stderr.write("Broken maildir: %s\n" %(maildir))
809 mode = os.stat(os.path.join(maildir, "tmp"))[stat.ST_MODE]
811 os.mkdir(os.path.join(maildir, "tmp"))
812 if not stat.S_ISDIR(mode):
813 sys.stderr.write("Broken maildir: %s\n" %(maildir))
815 mode = os.stat(os.path.join(maildir, "new"))[stat.ST_MODE]
816 if not stat.S_ISDIR(mode):
817 sys.stderr.write("Broken maildir: %s\n" %(maildir))
819 os.mkdir(os.path.join(maildir, "new"))
821 sys.stderr.write("Broken maildir: %s\n" %(maildir))
826 sys.stderr.write("Couldn't create root maildir %s\n" \
830 os.mkdir(os.path.join(maildir, "new"))
831 os.mkdir(os.path.join(maildir, "cur"))
832 os.mkdir(os.path.join(maildir, "tmp"))
835 "Couldn't create required maildir directories for %s\n" \
839 # right - we've got the directories, we've got the section, we know the
# Fetch this feed and deliver its items into the maildir prepared above.
842 parse_and_deliver(maildir, section, state_dir)