X-Git-Url: https://git.sommitrealweird.co.uk/rss2maildir.git/blobdiff_plain/fb3a7e40f413322d90b5452acc7c3a8f09046661..e606cd7a8357ae41b20bd90a533abc7417c315a6:/rss2maildir.py diff --git a/rss2maildir.py b/rss2maildir.py index 99735a7..322dc57 100755 --- a/rss2maildir.py +++ b/rss2maildir.py @@ -58,7 +58,7 @@ class HTML2Text(HTMLParser): "nbsp": " ", } - def __init__(self): + def __init__(self,textwidth=70): self.inheadingone = False self.inheadingtwo = False self.inotherheading = False @@ -73,6 +73,7 @@ class HTML2Text(HTMLParser): self.inul = False self.initem = False self.item = u'' + self.textwidth = textwidth HTMLParser.__init__(self) def handle_starttag(self, tag, attrs): @@ -97,7 +98,7 @@ class HTML2Text(HTMLParser): self.text = self.text + u'\n\n' if self.inparagraph: self.text = self.text \ - + u'\n'.join(textwrap.wrap(self.currentparagraph, 70)) + + u'\n'.join(textwrap.wrap(self.currentparagraph, self.textwidth)) self.currentparagraph = u'' self.inparagraph = True elif tag.lower() == "pre": @@ -109,7 +110,7 @@ class HTML2Text(HTMLParser): self.item = u'' self.inul = True self.text = self.text + "\n" - elif tag.lower() == "li" and self.inul: + elif tag.lower() == "li": if not self.initem: self.initem = True self.item = u'' @@ -117,9 +118,10 @@ class HTML2Text(HTMLParser): self.text = self.text \ + u' * ' \ + u'\n '.join([a.strip() for a in \ - textwrap.wrap(self.item, 67)]) \ + textwrap.wrap(self.item, self.textwidth - 3)]) \ + u'\n' self.item = u'' + self.initem = True def handle_startendtag(self, tag, attrs): if tag.lower() == "br": @@ -131,7 +133,7 @@ class HTML2Text(HTMLParser): + u'\n'.join( \ [a \ for a in textwrap.wrap( \ - self.currentparagraph, 70) \ + self.currentparagraph, self.textwidth) \ ] \ ) \ + u'\n' @@ -179,7 +181,7 @@ class HTML2Text(HTMLParser): elif tag.lower() == "p": self.text = self.text \ + u'\n'.join(textwrap.wrap( \ - self.currentparagraph, 70) \ + self.currentparagraph, self.textwidth) \ ) self.inparagraph = False self.currentparagraph = u'' @@ -189,7 +191,7 @@ class HTML2Text(HTMLParser): + u'\n> '.join( \ [a.strip() \ for a in textwrap.wrap( \ - self.blockquote, 68)] \ + self.blockquote, self.textwidth - 2)] \ ) \ + u'\n' self.inblockquote = False @@ -198,11 +200,11 @@ class HTML2Text(HTMLParser): self.inpre = False elif tag.lower() == "li": self.initem = False - if self.item != "": + if self.item != u'': self.text = self.text \ + u' * ' \ + u'\n '.join( \ - [a.strip() for a in textwrap.wrap(self.item, 67)]) \ + [a.strip() for a in textwrap.wrap(self.item, self.textwidth - 3)]) \ + u'\n' self.item = u'' elif tag.lower() == "ul": @@ -217,16 +219,18 @@ class HTML2Text(HTMLParser): self.blockquote = self.blockquote \ + unicode(data, "utf-8").strip() \ + u' ' + elif self.initem: + self.item = self.item + unicode(data, "utf-8") elif self.inparagraph: self.currentparagraph = self.currentparagraph \ + unicode(data, "utf-8").strip() \ + u' ' - elif self.inul and self.initem: - self.item = self.item + unicode(data, "utf-8") elif self.inpre: self.text = self.text + unicode(data, "utf-8") else: - self.text = self.text + unicode(data, "utf-8").strip() + u' ' + isallwhitespace = data.strip() == "" + if not isallwhitespace: + self.text = self.text + unicode(data, "utf-8").strip() + u' ' def handle_entityref(self, name): entity = name @@ -248,7 +252,9 @@ class HTML2Text(HTMLParser): def gettext(self): data = self.text if self.inparagraph: - data = data + "\n".join(textwrap.wrap(self.currentparagraph, 70)) + data = data + "\n".join(textwrap.wrap(self.currentparagraph, self.textwidth)) + if data[-1] != '\n': + data = data + '\n' return data def open_url(method, url):