4 # rss2maildir.py - RSS feeds to Maildir - 1 email per item
5 # Copyright (C) 2007 Brett Parker <iDunno@sommitrealweird.co.uk>
7 # This program is free software: you can redistribute it and/or modify
8 # it under the terms of the GNU General Public License as published by
9 # the Free Software Foundation, either version 3 of the License, or
10 # (at your option) any later version.
12 # This program is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 # GNU General Public License for more details.
17 # You should have received a copy of the GNU General Public License
18 # along with this program. If not, see <http://www.gnu.org/licenses/>.
28 from email.MIMEMultipart import MIMEMultipart
29 from email.MIMEText import MIMEText
38 from optparse import OptionParser
39 from ConfigParser import SafeConfigParser
41 from base64 import b64encode
47 from HTMLParser import HTMLParser
49 class HTML2Text(HTMLParser):
174 def __init__(self,textwidth=70):
177 self.textwidth = textwidth
180 self.ignorenodata = False
184 HTMLParser.__init__(self)
186 def handle_starttag(self, tag, attrs):
187 tag_name = tag.lower()
188 if tag_name in self.blockleveltags:
189 # handle starting a new block - unless we're in a block element
190 # that can contain other blocks, we'll assume that we want to close
192 if len(self.opentags) > 1 and self.opentags[-1] == u'li':
193 self.handle_curdata()
195 if tag_name == u'ol':
196 self.handle_curdata()
197 self.listcount.append(1)
198 self.listlevel = len(self.listcount) - 1
200 if tag_name in self.liststarttags:
201 smallist = self.opentags[-3:-1]
203 for prev_listtag in smallist:
204 if prev_listtag in [u'dl', u'ol']:
205 self.indentlevel = self.indentlevel + 4
207 elif prev_listtag == u'ul':
208 self.indentlevel = self.indentlevel + 3
211 if len(self.opentags) > 0:
212 self.handle_curdata()
213 if tag_name not in self.cancontainflow:
215 self.opentags.append(tag_name)
217 if tag_name == "span":
221 listcount = self.listcount[-1]
225 if tag_name == u'dd' and len(self.opentags) > 1 \
226 and self.opentags[-1] == u'dt':
227 self.handle_curdata()
229 elif tag_name == u'dt' and len(self.opentags) > 1 \
230 and self.opentags[-1] == u'dd':
231 self.handle_curdata()
233 elif tag_name == u'a':
235 if attr[0].lower() == u'href':
236 self.urls.append(attr[1].decode('utf-8'))
237 self.curdata = self.curdata + u'`'
238 self.opentags.append(tag_name)
240 elif tag_name == u'img':
241 self.handle_image(attrs)
243 elif tag_name == u'br':
247 # we don't know the tag, so lets avoid handling it!
250 def handle_startendtag(self, tag, attrs):
251 if tag.lower() == u'br':
253 elif tag.lower() == u'img':
254 self.handle_image(attrs)
258 self.handle_curdata()
259 self.opentags.append(u'br')
260 self.handle_curdata()
263 def handle_image(self, attrs):
268 alt = attr[1].decode('utf-8')
269 elif attr[0] == 'src':
270 url = attr[1].decode('utf-8')
273 if self.images.has_key(alt):
274 if self.images[alt]["url"] == url:
275 self.curdata = self.curdata \
278 while self.images.has_key(alt):
280 self.images[alt]["url"] = url
281 self.curdata = self.curdata \
284 self.images[alt] = {}
285 self.images[alt]["url"] = url
286 self.curdata = self.curdata \
289 if self.images.has_key(url):
290 self.curdata = self.curdata \
293 self.images[url] = {}
294 self.images[url]["url"] =url
295 self.curdata = self.curdata \
298 def handle_curdata(self):
300 if len(self.opentags) == 0:
303 tag_thats_done = self.opentags[-1]
305 if len(self.curdata) == 0:
308 if tag_thats_done == u'br':
309 if len(self.text) == 0 or self.text[-1] != '\n':
310 self.text = self.text + '\n'
311 self.ignorenodata = True
314 if len(self.curdata.strip()) == 0:
317 if tag_thats_done in self.blockleveltags:
318 newlinerequired = self.text != u''
319 if self.ignorenodata:
320 newlinerequired = False
321 self.ignorenodata = False
323 if tag_thats_done in [u'dt', u'dd', u'li'] \
324 and len(self.text) > 1 \
325 and self.text[-1] != u'\n':
326 self.text = self.text + u'\n'
327 elif len(self.text) > 2 \
328 and self.text[-1] != u'\n' \
329 and self.text[-2] != u'\n':
330 self.text = self.text + u'\n\n'
332 if tag_thats_done in ["h1", "h2", "h3", "h4", "h5", "h6"]:
335 headingtext = " ".join(self.curdata.split())
336 seperator = u'\n' + u' '*self.indentlevel
337 headingtext = seperator.join( \
340 self.textwidth - self.indentlevel \
344 if tag_thats_done == u'h2':
346 elif tag_thats_done != u'h1':
349 if u'\n' in headingtext:
350 underline = u' ' * self.indentlevel \
351 + underlinechar * (self.textwidth - self.indentlevel)
353 underline = u' ' * self.indentlevel \
354 + underlinechar * len(headingtext)
355 self.text = self.text \
356 + headingtext + u'\n' \
358 elif tag_thats_done in [u'p', u'div']:
359 paragraph = unicode( \
360 " ".join(self.curdata.strip().encode("utf-8").split()), \
362 seperator = u'\n' + u' ' * self.indentlevel
363 self.text = self.text \
364 + u' ' * self.indentlevel \
367 paragraph, self.textwidth - self.indentlevel))
368 elif tag_thats_done == "pre":
369 self.text = self.text + unicode( \
370 self.curdata.encode("utf-8"), "utf-8")
371 elif tag_thats_done == u'blockquote':
373 " ".join(self.curdata.encode("utf-8").strip().split()), \
375 seperator = u'\n' + u' ' * self.indentlevel + u'> '
376 if len(self.text) > 0 and self.text[-1] != u'\n':
377 self.text = self.text + u'\n'
378 self.text = self.text \
383 self.textwidth - self.indentlevel - 2 \
387 elif tag_thats_done == "li":
388 item = unicode(self.curdata.encode("utf-8").strip(), "utf-8")
389 if len(self.text) > 0 and self.text[-1] != u'\n':
390 self.text = self.text + u'\n'
391 # work out if we're in an ol rather than a ul
392 latesttags = self.opentags[-4:]
395 for thing in latesttags:
409 listmarker = u' %2d. ' %(self.listcount[-1])
410 self.listcount[-1] = self.listcount[-1] + 1
413 + u' ' * self.indentlevel \
415 self.text = self.text \
416 + u' ' * self.indentlevel \
421 self.textwidth - self.indentlevel - listindent \
425 elif tag_thats_done == u'dt':
426 definition = unicode(" ".join( \
427 self.curdata.encode("utf-8").strip().split()), \
429 if len(self.text) > 0 and self.text[-1] != u'\n':
430 self.text = self.text + u'\n\n'
431 elif len(self.text) > 1 and self.text[-2] != u'\n':
432 self.text = self.text + u'\n'
433 definition = u' ' * self.indentlevel + definition + "::"
434 indentstring = u'\n' + u' ' * (self.indentlevel + 1)
435 self.text = self.text \
437 textwrap.wrap(definition, \
438 self.textwidth - self.indentlevel - 1))
440 elif tag_thats_done == u'dd':
441 definition = unicode(" ".join( \
442 self.curdata.encode("utf-8").strip().split()),
444 if len(definition) > 0:
445 if len(self.text) > 0 and self.text[-1] != u'\n':
446 self.text = self.text + u'\n'
447 indentstring = u'\n' + u' ' * (self.indentlevel + 4)
448 self.text = self.text \
449 + u' ' * (self.indentlevel + 4) \
450 + indentstring.join( \
453 self.textwidth - self.indentlevel - 4 \
457 elif tag_thats_done == u'a':
458 self.curdata = self.curdata + u'`__'
460 elif tag_thats_done in self.liststarttags:
463 if tag_thats_done in self.blockleveltags:
466 self.ignorenodata = False
468 def handle_endtag(self, tag):
469 self.ignorenodata = False
474 tagindex = self.opentags.index(tag)
479 if tag in [u'br', u'img']:
482 if tag in self.liststarttags:
483 if tag in [u'ol', u'dl', u'ul']:
484 self.handle_curdata()
485 # find if there was a previous list level
486 smalllist = self.opentags[:-1]
488 for prev_listtag in smalllist:
489 if prev_listtag in [u'ol', u'dl']:
490 self.indentlevel = self.indentlevel - 4
492 elif prev_listtag == u'ul':
493 self.indentlevel = self.indentlevel - 3
497 self.listcount = self.listcount[:-1]
499 while tagindex < len(self.opentags) \
500 and tag in self.opentags[tagindex+1:]:
502 tagindex = self.opentags.index(tag, tagindex+1)
504 # well, we don't want to do that then
506 if tagindex != len(self.opentags) - 1:
507 # Assuming the data was for the last opened tag first
508 self.handle_curdata()
509 # Now kill the list to be a slice before this tag was opened
510 self.opentags = self.opentags[:tagindex + 1]
512 self.handle_curdata()
513 if self.opentags[-1] == tag:
516 def handle_data(self, data):
517 if len(self.opentags) == 0:
518 self.opentags.append(u'p')
519 self.curdata = self.curdata + data.decode("utf-8")
521 def handle_entityref(self, name):
523 if HTML2Text.entities.has_key(name.lower()):
524 entity = HTML2Text.entities[name.lower()]
526 entity = unichr(int(name[1:]))
528 entity = "&" + name + ";"
530 self.curdata = self.curdata + unicode(entity, "utf-8")
533 self.handle_curdata()
534 if len(self.text) == 0 or self.text[-1] != u'\n':
535 self.text = self.text + u'\n'
537 if len(self.text) > 0:
538 while len(self.text) > 1 and self.text[-1] == u'\n':
539 self.text = self.text[:-1]
540 self.text = self.text + u'\n'
541 if len(self.urls) > 0:
542 self.text = self.text + u'\n__ ' + u'\n__ '.join(self.urls) + u'\n'
544 if len(self.images.keys()) > 0:
545 self.text = self.text + u'\n.. ' \
547 ["|%s| image:: %s" %(a, self.images[a]["url"]) \
548 for a in self.images.keys()]) + u'\n'
552 def open_url(method, url):
# Performs an HTTP request (method is "HEAD" or "GET") against url using
# Python 2 httplib, following up to 3 redirects (301/302/303/307) and
# handing back the response object on a 200.
# NOTE(review): listing gaps -- the redirectcount initialisation, the
# url reassignment from the "location" header, the 200-path return, any
# try/except around the connection, and the exhausted-redirects return are
# all in missing lines; confirm against the full source.
554 while redirectcount < 3:
# Python 2 urllib helpers: split scheme, then host[:port], then port.
# (Note: 'type' shadows the builtin here.)
555 (type, rest) = urllib.splittype(url)
556 (host, path) = urllib.splithost(rest)
557 (host, port) = urllib.splitport(host)
561 conn = httplib.HTTPConnection("%s:%s" %(host, port))
562 conn.request(method, path)
563 response = conn.getresponse()
# Redirect statuses: scan headers for the new location to retry with.
564 if response.status in [301, 302, 303, 307]:
565 headers = response.getheaders()
566 for header in headers:
567 if header[0] == "location":
569 elif response.status == 200:
573 redirectcount = redirectcount + 1
576 def parse_and_deliver(maildir, url, statedir):
# Fetches the RSS/Atom feed at url, and for each item not already recorded
# in the dbm state under statedir, writes a multipart (text+html) message
# into the given Maildir. Known feeds are first checked with a HEAD request
# against cached validators (content-length/etag/last-modified/content-md5)
# to skip unchanged feeds.
# NOTE(review): listing gaps throughout -- else-branches, try/except frames,
# the tmp-file open and tmp->new rename, and db/feeddb sync+close are in
# missing lines; confirm against the full source before editing.
579 # first check if we know about this feed already
580 feeddb = dbm.open(os.path.join(statedir, "feeds"), "c")
# Known feed: cached headers were stored urlencoded, so parse_qs yields
# {header: [value]}; any mismatch with fresh HEAD headers means "changed".
581 if feeddb.has_key(url):
583 data = cgi.parse_qs(data)
584 response = open_url("HEAD", url)
587 headers = response.getheaders()
590 for header in headers:
591 if header[0] == "content-length":
592 if header[1] != data["content-length"][0]:
594 elif header[0] == "etag":
595 if header[1] != data["etag"][0]:
597 elif header[0] == "last-modified":
598 if header[1] != data["last-modified"][0]:
600 elif header[0] == "content-md5":
601 if header[1] != data["content-md5"][0]:
# Changed feed: re-fetch with GET; the response doubles as the file-like
# handle fed to feedparser below.
606 response = open_url("GET", url)
608 headers = response.getheaders()
609 feedhandle = response
611 sys.stderr.write("Failed to fetch feed: %s\n" %(url))
614 return # don't need to do anything, nothings changed.
# Unknown feed: plain GET.
616 response = open_url("GET", url)
618 headers = response.getheaders()
619 feedhandle = response
621 sys.stderr.write("Failed to fetch feed: %s\n" %(url))
624 fp = feedparser.parse(feedhandle)
# "seen" db maps "feedurl|guid-or-link" -> urlencoded message-id/created/
# contentmd5 record; used for dedup and for threading via References.
625 db = dbm.open(os.path.join(statedir, "seen"), "c")
626 for item in fp["items"]:
627 # have we seen it before?
628 # need to work out what the content is first...
630 if item.has_key("content"):
631 content = item["content"][0]["value"]
633 content = item["summary"]
# md5 of the content detects edits to an already-seen item.
635 md5sum = md5.md5(content.encode("utf-8")).hexdigest()
639 # check if there's a guid too - if that exists and we match the md5,
641 if item.has_key("guid"):
642 if db.has_key(url + "|" + item["guid"]):
643 data = db[url + "|" + item["guid"]]
644 data = cgi.parse_qs(data)
645 if data["contentmd5"][0] == md5sum:
# Fall back to the item link as the dedup key; remember the previous
# message-id so an updated item threads onto the old one.
648 if db.has_key(url + "|" + item["link"]):
649 data = db[url + "|" + item["link"]]
650 data = cgi.parse_qs(data)
651 if data.has_key("message-id"):
652 prevmessageid = data["message-id"][0]
653 if data["contentmd5"][0] == md5sum:
657 author = item["author"]
661 # create a basic email message
662 msg = MIMEMultipart("alternative")
# Message-ID: timestamp + 6 random alphanumerics @ local hostname.
664 + datetime.datetime.now().strftime("%Y%m%d%H%M") \
668 string.ascii_letters + string.digits \
669 ) for a in range(0,6) \
670 ]) + "@" + socket.gethostname() + ">"
671 msg.add_header("Message-ID", messageid)
672 msg.set_unixfrom("\"%s\" <rss2maildir@localhost>" %(url))
673 msg.add_header("From", "\"%s\" <rss2maildir@localhost>" %(author))
674 msg.add_header("To", "\"%s\" <rss2maildir@localhost>" %(url))
# Thread updated items under their previous delivery.
676 msg.add_header("References", prevmessageid)
# Date: item's updated_parsed when present, otherwise "now".
677 createddate = datetime.datetime.now() \
678 .strftime("%a, %e %b %Y %T -0000")
680 createddate = datetime.datetime(*item["updated_parsed"][0:6]) \
681 .strftime("%a, %e %b %Y %T -0000")
684 msg.add_header("Date", createddate)
685 msg.add_header("Subject", item["title"])
686 msg.set_default_type("text/plain")
# Build the two alternative bodies: raw HTML plus the item URL, and an
# HTML2Text plain-text rendering plus the item URL.
688 htmlcontent = content.encode("utf-8")
689 htmlcontent = "%s\n\n<p>Item URL: <a href='%s'>%s</a></p>" %( \
693 htmlpart = MIMEText(htmlcontent.encode("utf-8"), "html", "utf-8")
694 textparser = HTML2Text()
695 textparser.feed(content.encode("utf-8"))
696 textcontent = textparser.gettext()
697 textcontent = "%s\n\nItem URL: %s" %( \
700 textpart = MIMEText(textcontent.encode("utf-8"), "plain", "utf-8")
704 # start by working out the filename we should be writting to, we do
705 # this following the normal maildir style rules
# Maildir unique name: pid.hostname.<10 random chars>.<epoch seconds>;
# written to tmp/ first, then moved to new/ (rename is in missing lines).
706 fname = str(os.getpid()) \
707 + "." + socket.gethostname() \
710 string.ascii_letters + string.digits \
711 ) for a in range(0,10) \
713 + datetime.datetime.now().strftime('%s')
714 fn = os.path.join(maildir, "tmp", fname)
716 fh.write(msg.as_string())
718 # now move it in to the new directory
719 newfn = os.path.join(maildir, "new", fname)
723 # now add to the database about the item
# Accumulate message-ids so later updates keep the full References chain.
725 messageid = prevmessageid + " " + messageid
# Distinct guid: store the full record under the guid key, and keep the
# link key pointing at the original created/md5 data.
726 if item.has_key("guid") and item["guid"] != item["link"]:
727 data = urllib.urlencode(( \
728 ("message-id", messageid), \
729 ("created", createddate), \
730 ("contentmd5", md5sum) \
732 db[url + "|" + item["guid"]] = data
734 data = db[url + "|" + item["link"]]
735 data = cgi.parse_qs(data)
736 newdata = urllib.urlencode(( \
737 ("message-id", messageid), \
738 ("created", data["created"][0]), \
739 ("contentmd5", data["contentmd5"][0]) \
741 db[url + "|" + item["link"]] = newdata
743 db[url + "|" + item["link"]] = data
745 data = urllib.urlencode(( \
746 ("message-id", messageid), \
747 ("created", createddate), \
748 ("contentmd5", md5sum) \
750 db[url + "|" + item["link"]] = data
# Cache the validator headers for the next run's HEAD comparison.
754 for header in headers:
756 ["content-md5", "etag", "last-modified", "content-length"]:
757 data.append((header[0], header[1]))
759 data = urllib.urlencode(data)
765 if __name__ == "__main__":
766 # This only gets executed if we really called the program
767 # first off, parse the command line arguments
# Script entry point: parse -c/--conf and -s/--statedir options, locate a
# config file (explicit path, then ~/.rss2maildir.conf, then
# /etc/rss2maildir.conf), ensure the state dir and maildir root exist, then
# ensure each feed section's maildir (new/cur/tmp) and deliver its items.
# NOTE(review): listing gaps -- the add_option(...) call heads, try/except
# frames around os.stat/os.mkdir, scp.read(configfile), sys.exit calls and
# any polling loop around parse_and_deliver are in missing lines.
769 oparser = OptionParser()
771 "-c", "--conf", dest="conf",
772 help="location of config file"
775 "-s", "--statedir", dest="statedir",
776 help="location of directory to store state in"
779 (options, args) = oparser.parse_args()
781 # check for the configfile
785 if options.conf != None:
786 # does the file exist?
788 os.stat(options.conf)
789 configfile = options.conf
791 # should exit here as the specified file doesn't exist
793 "Config file %s does not exist. Exiting.\n" %(options.conf,))
796 # check through the default locations
798 os.stat("%s/.rss2maildir.conf" %(os.environ["HOME"],))
799 configfile = "%s/.rss2maildir.conf" %(os.environ["HOME"],)
802 os.stat("/etc/rss2maildir.conf")
803 configfile = "/etc/rss2maildir.conf"
805 sys.stderr.write("No config file found. Exiting.\n")
808 # Right - if we've got this far, we've got a config file, now for the hard
811 scp = SafeConfigParser()
# Default maildir root, relative to the working directory, when the config
# doesn't override it via [general] maildir_root.
814 maildir_root = "RSSMaildir"
# State directory resolution: CLI flag wins, then [general] state_dir,
# then (presumably) a default -- the default assignment is in missing lines.
817 if options.statedir != None:
818 state_dir = options.statedir
820 mode = os.stat(state_dir)[stat.ST_MODE]
821 if not stat.S_ISDIR(mode):
823 "State directory (%s) is not a directory\n" %(state_dir))
826 # try to make the directory
830 sys.stderr.write("Couldn't create statedir %s" %(state_dir))
832 elif scp.has_option("general", "state_dir"):
833 new_state_dir = scp.get("general", "state_dir")
835 mode = os.stat(new_state_dir)[stat.ST_MODE]
836 if not stat.S_ISDIR(mode):
# NOTE(review): this message reports state_dir, not new_state_dir, the
# path that was actually checked -- looks like an upstream bug; confirm.
838 "State directory (%s) is not a directory\n" %(state_dir))
841 state_dir = new_state_dir
845 os.mkdir(new_state_dir)
846 state_dir = new_state_dir
849 "Couldn't create state directory %s\n" %(new_state_dir))
853 mode = os.stat(state_dir)[stat.ST_MODE]
854 if not stat.S_ISDIR(mode):
856 "State directory %s is not a directory\n" %(state_dir))
863 "State directory %s could not be created\n" %(state_dir))
# Maildir root: config override, stat it, create it if missing.
866 if scp.has_option("general", "maildir_root"):
867 maildir_root = scp.get("general", "maildir_root")
870 mode = os.stat(maildir_root)[stat.ST_MODE]
871 if not stat.S_ISDIR(mode):
873 "Maildir Root %s is not a directory\n" \
878 os.mkdir(maildir_root)
880 sys.stderr.write("Couldn't create Maildir Root %s\n" \
# Every config section except [general] is a feed URL.
884 feeds = scp.sections()
886 feeds.remove("general")
890 for section in feeds:
891 # check if the directory exists
# Per-feed maildir name: explicit "maildir" option if set, otherwise
# derived from the section URL; urlencode makes it filesystem-safe.
894 maildir = scp.get(section, "maildir")
898 maildir = urllib.urlencode(((section, maildir),)).split("=")[1]
899 maildir = os.path.join(maildir_root, maildir)
902 exists = os.stat(maildir)
903 if stat.S_ISDIR(exists[stat.ST_MODE]):
904 # check if there's a new, cur and tmp directory
# Verify/create the three maildir subdirectories, warning on anything
# that exists but is not a directory.
906 mode = os.stat(os.path.join(maildir, "cur"))[stat.ST_MODE]
908 os.mkdir(os.path.join(maildir, "cur"))
909 if not stat.S_ISDIR(mode):
910 sys.stderr.write("Broken maildir: %s\n" %(maildir))
912 mode = os.stat(os.path.join(maildir, "tmp"))[stat.ST_MODE]
914 os.mkdir(os.path.join(maildir, "tmp"))
915 if not stat.S_ISDIR(mode):
916 sys.stderr.write("Broken maildir: %s\n" %(maildir))
918 mode = os.stat(os.path.join(maildir, "new"))[stat.ST_MODE]
919 if not stat.S_ISDIR(mode):
920 sys.stderr.write("Broken maildir: %s\n" %(maildir))
922 os.mkdir(os.path.join(maildir, "new"))
924 sys.stderr.write("Broken maildir: %s\n" %(maildir))
929 sys.stderr.write("Couldn't create root maildir %s\n" \
# Maildir didn't exist at all: create it plus new/cur/tmp from scratch.
933 os.mkdir(os.path.join(maildir, "new"))
934 os.mkdir(os.path.join(maildir, "cur"))
935 os.mkdir(os.path.join(maildir, "tmp"))
938 "Couldn't create required maildir directories for %s\n" \
942 # right - we've got the directories, we've got the section, we know the
945 parse_and_deliver(maildir, section, state_dir)