X-Git-Url: https://git.sommitrealweird.co.uk/rss2maildir.git/blobdiff_plain/738dfe9c90cc9a357a5236a77281be87c90a13e1..84f14a811430b6a49cd2d9adac6d4bcaa34c3037:/rss2maildir.py?ds=sidebyside diff --git a/rss2maildir.py b/rss2maildir.py index 80f22d5..f6078ff 100755 --- a/rss2maildir.py +++ b/rss2maildir.py @@ -58,175 +58,316 @@ class HTML2Text(HTMLParser): "nbsp": " ", } - def __init__(self): - self.inheadingone = False - self.inheadingtwo = False - self.inotherheading = False - self.inparagraph = True - self.inblockquote = False - self.inlink = False + blockleveltags = [ + "h1", + "h2", + "h3", + "h4", + "h5", + "h6", + "pre", + "p", + "ul", + "ol", + "dl", + "br", + ] + + liststarttags = [ + "ul", + "ol", + "dl", + ] + + cancontainflow = [ + "div", + "li", + "dd", + "blockquote", + ] + + def __init__(self,textwidth=70): self.text = u'' - self.currentparagraph = u'' - self.headingtext = u'' - self.blockquote = u'' - self.inpre = False - self.inul = False - self.initem = False - self.item = u'' + self.curdata = u'' + self.textwidth = textwidth + self.opentags = [] + self.indentlevel = 0 + self.listcount = [] + self.urls = [] HTMLParser.__init__(self) def handle_starttag(self, tag, attrs): - if tag.lower() == "h1": - self.inheadingone = True - self.inparagraph = False - elif tag.lower() == "h2": - self.inheadingtwo = True - self.inparagraph = False - elif tag.lower() in ["h3", "h4", "h5", "h6"]: - self.inotherheading = True - self.inparagraph = False - elif tag.lower() == "a": - self.inlink = True - elif tag.lower() == "br": - self.handle_br() - elif tag.lower() == "blockquote": - self.inblockquote = True - self.text = self.text + u'\n' - elif tag.lower() == "p": - if self.text != "": - self.text = self.text + u'\n\n' - if self.inparagraph: - self.text = self.text \ - + u'\n'.join(textwrap.wrap(self.currentparagraph, 70)) - self.currentparagraph = u'' - self.inparagraph = True - elif tag.lower() == "pre": - self.text = self.text + "\n" - self.inpre = True - self.inparagraph = False - self.inblockquote = False - elif tag.lower() == "ul": - self.item = u'' - self.inul = True - self.text = self.text + "\n" - elif tag.lower() == "li" and self.inul: - if not self.initem: - self.initem = True - self.item = u'' - else: - self.text = self.text \ - + u' * ' \ - + u'\n '.join([a.strip() for a in \ - textwrap.wrap(self.item, 67)]) \ - + u'\n' - self.item = u'' + tag_name = tag.lower() + if tag_name in self.blockleveltags: + # handle starting a new block - unless we're in a block element + # that can contain other blocks, we'll assume that we want to close + # the container + if tag_name == u'br': + self.handle_curdata() + self.opentags.append(tag_name) + self.opentags.pop() + + if len(self.opentags) > 1 and self.opentags[-1] == u'li': + self.handle_curdata() + + if tag_name == u'ol': + self.handle_curdata() + self.listcount.append(1) + self.listlevel = len(self.listcount) - 1 + + if tag_name in self.liststarttags: + smallist = self.opentags[-3:-1] + smallist.reverse() + for prev_listtag in smallist: + if prev_listtag in [u'dl', u'ol']: + self.indentlevel = self.indentlevel + 4 + break + elif prev_listtag == u'ul': + self.indentlevel = self.indentlevel + 3 + break + + if len(self.opentags) > 0: + self.handle_curdata() + if tag_name not in self.cancontainflow: + self.opentags.pop() + self.opentags.append(tag_name) + else: + listcount = 0 + try: + listcount = self.listcount[-1] + except: + pass + + if tag_name == u'dd' and len(self.opentags) > 1 \ + and self.opentags[-1] == u'dt': + self.handle_curdata() + self.opentags.pop() + elif tag_name == u'dt' and len(self.opentags) > 1 \ + and self.opentags[-1] == u'dd': + self.handle_curdata() + self.opentags.pop() + elif tag_name == u'a': + for attr in attrs: + if attr[0].lower() == u'href': + self.urls.append(attr[1]) + self.curdata = self.curdata + u'`' + self.opentags.append(tag_name) + return + + self.handle_curdata() + self.opentags.append(tag_name) def handle_startendtag(self, tag, attrs): - if tag.lower() == "br": - self.handle_br() + if tag.lower() == u'br': + self.opentags.append(u'br') + self.handle_curdata() # just handle the data, don't do anything else + self.opentags.pop() - def handle_br(self): - if self.inparagraph: - self.text = self.text \ - + u'\n'.join( \ - [a \ - for a in textwrap.wrap( \ - self.currentparagraph, 70) \ - ] \ - ) \ - + u'\n' - self.currentparagraph = u'' - elif self.inblockquote: - self.text = self.text \ - + u'\n> ' \ - + u'\n> '.join( \ - [a \ - for a in textwrap.wrap( \ - self.blockquote.encode("utf-8") \ - , 68) \ - ] \ + def handle_curdata(self): + if len(self.opentags) == 0: + return + + if len(self.curdata) == 0: + return + + if len(self.curdata.strip()) == 0: + return + + tag_thats_done = self.opentags[-1] + + if tag_thats_done in self.blockleveltags: + newlinerequired = self.text != u'' + if newlinerequired: + if newlinerequired \ + and len(self.text) > 2 \ + and self.text[-1] != u'\n' \ + and self.text[-2] != u'\n': + self.text = self.text + u'\n\n' + + if tag_thats_done in ["h1", "h2", "h3", "h4", "h5", "h6"]: + underline = u'' + underlinechar = u'=' + headingtext = unicode( \ + self.curdata.encode("utf-8").strip(), "utf-8") + seperator = u'\n' + u' '*self.indentlevel + headingtext = seperator.join( \ + textwrap.wrap( \ + headingtext, \ + self.textwidth - self.indentlevel \ ) \ - + u'\n' - self.blockquote = u'' - else: - self.text = self.text + "\n" + ) - def handle_endtag(self, tag): - if tag.lower() == "h1": - self.inheadingone = False - self.text = self.text \ - + u'\n\n' \ - + self.headingtext.encode("utf-8") \ - + u'\n' \ - + u'=' * len(self.headingtext.encode("utf-8").strip()) - self.headingtext = u'' - elif tag.lower() == "h2": - self.inheadingtwo = False + if tag_thats_done == u'h2': + underlinechar = u'-' + elif tag_thats_done != u'h1': + underlinechar = u'~' + + if u'\n' in headingtext: + underline = u' ' * self.indentlevel \ + + underlinechar * (self.textwidth - self.indentlevel) + else: + underline = u' ' * self.indentlevel \ + + underlinechar * len(headingtext) self.text = self.text \ - + u'\n\n' \ - + self.headingtext.encode("utf-8") \ - + u'\n' \ - + u'-' * len(self.headingtext.encode("utf-8").strip()) - self.headingtext = u'' - elif tag.lower() in ["h3", "h4", "h5", "h6"]: - self.inotherheading = False + + headingtext.encode("utf-8") + u'\n' \ + + underline + elif tag_thats_done == u'p': + paragraph = unicode( \ + self.curdata.strip().encode("utf-8"), "utf-8") + seperator = u'\n' + u' ' * self.indentlevel self.text = self.text \ - + u'\n\n' \ - + self.headingtext.encode("utf-8") \ - + u'\n' \ - + u'~' * len(self.headingtext.encode("utf-8").strip()) - self.headingtext = u'' - elif tag.lower() == "p": + + u' ' * self.indentlevel \ + + seperator.join( \ + textwrap.wrap( \ + paragraph, self.textwidth - self.indentlevel)) + elif tag_thats_done == "pre": + self.text = self.text + unicode( \ + self.curdata.encode("utf-8"), "utf-8") + elif tag_thats_done == "blockquote": + quote = unicode( \ + self.curdata.encode("utf-8").strip(), "utf-8") + seperator = u'\n' + u' ' * self.indentlevel + u'> ' self.text = self.text \ - + u'\n'.join(textwrap.wrap( \ - self.currentparagraph, 70) \ + + u'> ' \ + + seperator.join( \ + textwrap.wrap( \ + quote, \ + self.textwidth - self.indentlevel - 2 \ + ) ) - self.inparagraph = False - self.currentparagraph = u'' - elif tag.lower() == "blockquote": + elif tag_thats_done == "li": + item = unicode(self.curdata.encode("utf-8").strip(), "utf-8") + if len(self.text) > 0 and self.text[-1] != u'\n': + self.text = self.text + u'\n' + # work out if we're in an ol rather than a ul + latesttags = self.opentags[-4:] + latesttags.reverse() + isul = None + for thing in latesttags: + if thing == 'ul': + isul = True + break + elif thing == 'ol': + isul = False + break + + listindent = 3 + if not isul: + listindent = 4 + + listmarker = u' * ' + if isul == False: + listmarker = u' %2d. ' %(self.listcount[-1]) + self.listcount[-1] = self.listcount[-1] + 1 + + seperator = u'\n' \ + + u' ' * self.indentlevel \ + + u' ' * listindent self.text = self.text \ - + u'\n> ' \ - + u'\n> '.join( \ - [a.strip() \ - for a in textwrap.wrap( \ - self.blockquote, 68)] \ + + u' ' * self.indentlevel \ + + listmarker \ + + seperator.join( \ + textwrap.wrap( \ + item, \ + self.textwidth - self.indentlevel - listindent \ ) \ - + u'\n' - self.inblockquote = False - self.blockquote = u'' - elif tag.lower() == "pre": - self.inpre = False - elif tag.lower() == "li": - self.initem = False - if self.item != "": + ) + self.curdata = u'' + elif tag_thats_done == u'dt': + definition = unicode(self.curdata.encode("utf-8").strip(), "utf-8") + if len(self.text) > 0 and self.text[-1] != u'\n': + self.text = self.text + u'\n\n' + elif len(self.text) > 1 and self.text[-2] != u'\n': + self.text = self.text + u'\n' + definition = u' ' * self.indentlevel + definition + "::" + indentstring = u'\n' + u' ' * (self.indentlevel + 1) + self.text = self.text \ + + indentstring.join( + textwrap.wrap(definition, \ + self.textwidth - self.indentlevel - 1)) + self.curdata = u'' + elif tag_thats_done == u'dd': + definition = unicode(self.curdata.encode("utf-8").strip(), "utf-8") + if len(definition) > 0: + if len(self.text) > 0 and self.text[-1] != u'\n': + self.text = self.text + u'\n' + indentstring = u'\n' + u' ' * (self.indentlevel + 4) self.text = self.text \ - + u' * ' \ - + u'\n '.join( \ - [a.strip() for a in textwrap.wrap(self.item, 67)]) \ - + u'\n' - self.item = u'' - elif tag.lower() == "ul": - self.inul = False + + u' ' * (self.indentlevel + 4) \ + + indentstring.join( \ + textwrap.wrap( \ + definition, \ + self.textwidth - self.indentlevel - 4 \ + ) \ + ) + self.curdata = u'' + elif tag_thats_done == u'a': + self.curdata = self.curdata + u'`__' + pass + elif tag_thats_done in self.liststarttags: + pass + else: + # we've got no idea what this tag does, so we'll + # make an assumption that we're not going to know later + if len(self.curdata) > 0: + self.text = self.text \ + + u' ... ' \ + + u'\n ... '.join( \ + textwrap.wrap( \ + unicode( \ + self.curdata.encode("utf-8").strip(), \ + "utf-8"), self.textwidth - 5)) + self.curdata = u'' - def handle_data(self, data): - if self.inheadingone or self.inheadingtwo or self.inotherheading: - self.headingtext = self.headingtext \ - + unicode(data, "utf-8").strip() \ - + u' ' - elif self.inblockquote: - self.blockquote = self.blockquote \ - + unicode(data, "utf-8").strip() \ - + u' ' - elif self.inparagraph: - self.currentparagraph = self.currentparagraph \ - + unicode(data, "utf-8").strip() \ - + u' ' - elif self.inul and self.initem: - self.item = self.item + unicode(data, "utf-8") - elif self.inpre: - self.text = self.text + unicode(data, "utf-8") + if tag_thats_done in self.blockleveltags: + self.curdata = u'' + + def handle_endtag(self, tag): + try: + tagindex = self.opentags.index(tag) + except: + # closing tag we know nothing about. + # err. weird. + tagindex = 0 + + tag = tag.lower() + + if tag in self.liststarttags: + if tag in [u'ol', u'dl', u'ul']: + self.handle_curdata() + # find if there was a previous list level + smalllist = self.opentags[:-1] + smalllist.reverse() + for prev_listtag in smalllist: + if prev_listtag in [u'ol', u'dl']: + self.indentlevel = self.indentlevel - 4 + break + elif prev_listtag == u'ul': + self.indentlevel = self.indentlevel - 3 + break + + if tag == u'ol': + self.listcount = self.listcount[:-1] + + while tagindex < len(self.opentags) \ + and tag in self.opentags[tagindex+1:]: + try: + tagindex = self.opentags.index(tag, tagindex+1) + except: + # well, we don't want to do that then + pass + if tagindex != len(self.opentags) - 1: + # Assuming the data was for the last opened tag first + self.handle_curdata() + # Now kill the list to be a slice before this tag was opened + self.opentags = self.opentags[:tagindex + 1] else: - self.text = self.text + unicode(data, "utf-8").strip() + u' ' + self.handle_curdata() + if self.opentags[-1] == tag: + self.opentags.pop() + + def handle_data(self, data): + self.curdata = self.curdata + unicode(data, "utf-8") def handle_entityref(self, name): entity = name @@ -237,21 +378,21 @@ class HTML2Text(HTMLParser): else: entity = "&" + name + ";" - if self.inparagraph: - self.currentparagraph = self.currentparagraph \ - + unicode(entity, "utf-8") - elif self.inblockquote: - self.blockquote = self.blockquote + unicode(entity, "utf-8") - else: - self.text = self.text + unicode(entity, "utf-8") + self.curdata = self.curdata + unicode(entity, "utf-8") def gettext(self): - data = self.text - if self.inparagraph: - data = data + "\n".join(textwrap.wrap(self.currentparagraph, 70)) - if data[-1] != '\n': - data = data + '\n' - return data + self.handle_curdata() + if len(self.text) == 0 or self.text[-1] != u'\n': + self.text = self.text + u'\n' + self.opentags = [] + if len(self.text) > 0: + while len(self.text) > 1 and self.text[-1] == u'\n': + self.text = self.text[:-1] + self.text = self.text + u'\n' + if len(self.urls) > 0: + self.text = self.text + u'\n__ ' + u'\n__ '.join(self.urls) + u'\n' + self.urls = [] + return self.text def open_url(method, url): redirectcount = 0 @@ -456,7 +597,8 @@ def parse_and_deliver(maildir, url, statedir): if headers: data = [] for header in headers: - if header[0] in ["content-md5", "etag", "last-modified", "content-length"]: + if header[0] in \ + ["content-md5", "etag", "last-modified", "content-length"]: data.append((header[0], header[1])) if len(data) > 0: data = urllib.urlencode(data)