X-Git-Url: https://git.sommitrealweird.co.uk/rss2maildir.git/blobdiff_plain/fcaedcaf414720ca8aac9fff2d12c45edd929a43..8488a0a338597575257a677f34b4db2754c33927:/rss2maildir.py diff --git a/rss2maildir.py b/rss2maildir.py index 59e9361..533e34d 100755 --- a/rss2maildir.py +++ b/rss2maildir.py @@ -44,6 +44,8 @@ import md5 import cgi import dbm +import re + from HTMLParser import HTMLParser class HTML2Text(HTMLParser): @@ -137,6 +139,39 @@ class HTML2Text(HTMLParser): u'THORN': u'Þ', u'eth': u'ð', u'ETH': u'Ð', + u'mdash': u'—', + u'ndash': u'–', + u'sect': u'§', + u'para': u'¶', + u'uarr': u'↑', + u'darr': u'↓', + u'larr': u'←', + u'rarr': u'→', + u'dagger': u'†', + u'Dagger': u'‡', + u'permil': u'‰', + u'prod': u'∏', + u'infin': u'∞', + u'radic': u'√', + u'there4': u'∴', + u'int': u'∫', + u'asymp': u'≈', + u'ne': u'≠', + u'equiv': '≡', + u'le': u'≤', + u'ge': u'≥', + u'loz': u'⋄', + u'sum': u'∑', + u'part': u'∂', + u'prime': u'′', + u'Prime': u'″', + u'harr': u'↔', + u'micro': u'µ', + u'not': u'¬', + u'plusmn': u'±', + u'divide': u'÷', + u'cent': u'¢', + u'euro': u'€', } blockleveltags = [ @@ -155,7 +190,7 @@ class HTML2Text(HTMLParser): u'dt', u'dd', u'div', - #u'blockquote', + u'blockquote', ] liststarttags = [ @@ -197,6 +232,9 @@ class HTML2Text(HTMLParser): self.listcount.append(1) self.listlevel = len(self.listcount) - 1 + if tag_name == u'dl': + self.indentlevel = self.indentlevel + 4 + if tag_name in self.liststarttags: smallist = self.opentags[-3:-1] smallist.reverse() @@ -277,12 +315,11 @@ class HTML2Text(HTMLParser): else: while self.images.has_key(alt): alt = alt + "_" - self.images[alt]["url"] = url + self.images[alt] = {"url": url} self.curdata = self.curdata \ + u'|%s|' %(alt,) else: - self.images[alt] = {} - self.images[alt]["url"] = url + self.images[alt] = {"url": url} self.curdata = self.curdata \ + u'|%s|' %(alt,) else: @@ -372,11 +409,11 @@ class HTML2Text(HTMLParser): quote = unicode( \ " ".join(self.curdata.encode("utf-8").strip().split()), \ "utf-8") - seperator = u'\n' + u' ' * self.indentlevel + u'> ' + seperator = u'\n' + u' ' * self.indentlevel + u' ' if len(self.text) > 0 and self.text[-1] != u'\n': self.text = self.text + u'\n' self.text = self.text \ - + u'> ' \ + + u' ' \ + seperator.join( \ textwrap.wrap( \ quote, \ @@ -430,12 +467,12 @@ class HTML2Text(HTMLParser): self.text = self.text + u'\n\n' elif len(self.text) > 1 and self.text[-2] != u'\n': self.text = self.text + u'\n' - definition = u' ' * self.indentlevel + definition + "::" - indentstring = u'\n' + u' ' * (self.indentlevel + 1) + definition = u' ' * (self.indentlevel - 4) + definition + "::" + indentstring = u'\n' + u' ' * (self.indentlevel - 3) self.text = self.text \ + indentstring.join( textwrap.wrap(definition, \ - self.textwidth - self.indentlevel - 1)) + self.textwidth - self.indentlevel - 4)) self.curdata = u'' elif tag_thats_done == u'dd': definition = unicode(" ".join( \ @@ -444,13 +481,13 @@ class HTML2Text(HTMLParser): if len(definition) > 0: if len(self.text) > 0 and self.text[-1] != u'\n': self.text = self.text + u'\n' - indentstring = u'\n' + u' ' * (self.indentlevel + 4) + indentstring = u'\n' + u' ' * self.indentlevel self.text = self.text \ - + u' ' * (self.indentlevel + 4) \ + + indentstring \ + indentstring.join( \ textwrap.wrap( \ definition, \ - self.textwidth - self.indentlevel - 4 \ + self.textwidth - self.indentlevel \ ) \ ) self.curdata = u'' @@ -479,8 +516,11 @@ class HTML2Text(HTMLParser): if tag in [u'br', u'img']: return + if tag == u'dl': + self.indentlevel = self.indentlevel - 4 + if tag in self.liststarttags: - if tag in [u'ol', u'dl', u'ul']: + if tag in [u'ol', u'dl', u'ul', u'dd']: self.handle_curdata() # find if there was a previous list level smalllist = self.opentags[:-1] @@ -518,12 +558,24 @@ class HTML2Text(HTMLParser): self.opentags.append(u'p') self.curdata = self.curdata + data.decode("utf-8") + def handle_charref(self, name): + try: + entity = unichr(int(name)) + except: + if name[0] == 'x': + try: + entity = unichr(int('0%s' %(name,), 16)) + except: + entity = u'#%s' %(name,) + else: + entity = u'#%s' %(name,) + self.curdata = self.curdata + unicode(entity.encode('utf-8'), \ + "utf-8") + def handle_entityref(self, name): entity = name if HTML2Text.entities.has_key(name): entity = HTML2Text.entities[name] - elif name[0] == "#": - entity = unichr(int(name[1:])) else: entity = "&" + name + ";" @@ -544,7 +596,7 @@ class HTML2Text(HTMLParser): self.urls = [] if len(self.images.keys()) > 0: self.text = self.text + u'\n.. ' \ - + u'.. '.join( \ + + u'\n.. '.join( \ ["|%s| image:: %s" %(a, self.images[a]["url"]) \ for a in self.images.keys()]) + u'\n' self.images = {} @@ -631,23 +683,29 @@ def parse_and_deliver(maildir, url, statedir): if item.has_key("content"): content = item["content"][0]["value"] else: - content = item["summary"] + if item.has_key("description"): + content = item["description"] + else: + content = u'' md5sum = md5.md5(content.encode("utf-8")).hexdigest() prevmessageid = None + db_guid_key = (url + u'|' + item["guid"]).encode("utf-8") + db_link_key = (url + u'|' + item["link"]).encode("utf-8") + # check if there's a guid too - if that exists and we match the md5, # return if item.has_key("guid"): - if db.has_key(url + "|" + item["guid"]): - data = db[url + "|" + item["guid"]] + if db.has_key(db_guid_key): + data = db[db_guid_key] data = cgi.parse_qs(data) if data["contentmd5"][0] == md5sum: continue - if db.has_key(url + "|" + item["link"]): - data = db[url + "|" + item["link"]] + if db.has_key(db_link_key): + data = db[db_link_key] data = cgi.parse_qs(data) if data.has_key("message-id"): prevmessageid = data["message-id"][0] @@ -684,7 +742,10 @@ def parse_and_deliver(maildir, url, statedir): pass msg.add_header("Date", createddate) subj_gen = HTML2Text() - subj_gen.feed(item["title"].encode("utf-8")) + title = item["title"] + title = re.sub(u'<', u'<', title) + title = re.sub(u'>', u'>', title) + subj_gen.feed(title.encode("utf-8")) msg.add_header("Subject", subj_gen.gettext()) msg.set_default_type("text/plain") @@ -732,25 +793,25 @@ def parse_and_deliver(maildir, url, statedir): ("created", createddate), \ ("contentmd5", md5sum) \ )) - db[url + "|" + item["guid"]] = data + db[db_guid_key] = data try: - data = db[url + "|" + item["link"]] + data = db[db_link_key] data = cgi.parse_qs(data) newdata = urllib.urlencode(( \ ("message-id", messageid), \ ("created", data["created"][0]), \ ("contentmd5", data["contentmd5"][0]) \ )) - db[url + "|" + item["link"]] = newdata + db[db_link_key] = newdata except: - db[url + "|" + item["link"]] = data + db[db_link_key] = data else: data = urllib.urlencode(( \ ("message-id", messageid), \ ("created", createddate), \ ("contentmd5", md5sum) \ )) - db[url + "|" + item["link"]] = data + db[db_link_key] = data if headers: data = []