X-Git-Url: https://git.sommitrealweird.co.uk/rss2maildir.git/blobdiff_plain/13a417ff459bfe827f45845f9ca5e04c08889e87..2d1b58f010d4f239f66222f28a7869348112c2b4:/rss2maildir.py?ds=inline diff --git a/rss2maildir.py b/rss2maildir.py index df7236a..dce6c3c 100755 --- a/rss2maildir.py +++ b/rss2maildir.py @@ -197,6 +197,9 @@ class HTML2Text(HTMLParser): self.listcount.append(1) self.listlevel = len(self.listcount) - 1 + if tag_name == u'dl': + self.indentlevel = self.indentlevel + 4 + if tag_name in self.liststarttags: smallist = self.opentags[-3:-1] smallist.reverse() @@ -430,12 +433,12 @@ class HTML2Text(HTMLParser): self.text = self.text + u'\n\n' elif len(self.text) > 1 and self.text[-2] != u'\n': self.text = self.text + u'\n' - definition = u' ' * self.indentlevel + definition + "::" - indentstring = u'\n' + u' ' * (self.indentlevel + 1) + definition = u' ' * (self.indentlevel - 4) + definition + "::" + indentstring = u'\n' + u' ' * (self.indentlevel - 3) self.text = self.text \ + indentstring.join( textwrap.wrap(definition, \ - self.textwidth - self.indentlevel - 1)) + self.textwidth - self.indentlevel - 4)) self.curdata = u'' elif tag_thats_done == u'dd': definition = unicode(" ".join( \ @@ -444,13 +447,13 @@ class HTML2Text(HTMLParser): if len(definition) > 0: if len(self.text) > 0 and self.text[-1] != u'\n': self.text = self.text + u'\n' - indentstring = u'\n' + u' ' * (self.indentlevel + 4) + indentstring = u'\n' + u' ' * self.indentlevel self.text = self.text \ - + u' ' * (self.indentlevel + 4) \ + + indentstring \ + indentstring.join( \ textwrap.wrap( \ definition, \ - self.textwidth - self.indentlevel - 4 \ + self.textwidth - self.indentlevel \ ) \ ) self.curdata = u'' @@ -479,8 +482,11 @@ class HTML2Text(HTMLParser): if tag in [u'br', u'img']: return + if tag == u'dl': + self.indentlevel = self.indentlevel - 4 + if tag in self.liststarttags: - if tag in [u'ol', u'dl', u'ul']: + if tag in [u'ol', u'dl', u'ul', u'dd']: self.handle_curdata() # find if there was a previous list level smalllist = self.opentags[:-1] @@ -520,14 +526,15 @@ class HTML2Text(HTMLParser): def handle_entityref(self, name): entity = name - if HTML2Text.entities.has_key(name.lower()): - entity = HTML2Text.entities[name.lower()] + if HTML2Text.entities.has_key(name): + entity = HTML2Text.entities[name] elif name[0] == "#": entity = unichr(int(name[1:])) else: entity = "&" + name + ";" - self.curdata = self.curdata + unicode(entity, "utf-8") + self.curdata = self.curdata + unicode(entity.encode('utf-8'), \ + "utf-8") def gettext(self): self.handle_curdata() @@ -543,7 +550,7 @@ class HTML2Text(HTMLParser): self.urls = [] if len(self.images.keys()) > 0: self.text = self.text + u'\n.. ' \ - + u'.. '.join( \ + + u'\n.. '.join( \ ["|%s| image:: %s" %(a, self.images[a]["url"]) \ for a in self.images.keys()]) + u'\n' self.images = {} @@ -682,7 +689,9 @@ def parse_and_deliver(maildir, url, statedir): except: pass msg.add_header("Date", createddate) - msg.add_header("Subject", item["title"]) + subj_gen = HTML2Text() + subj_gen.feed(item["title"].encode("utf-8")) + msg.add_header("Subject", subj_gen.gettext()) msg.set_default_type("text/plain") htmlcontent = content.encode("utf-8")