git.sommitrealweird.co.uk Git - rss2maildir.git/blobdiff - rss2maildir.py
Fix for title parsing
[rss2maildir.git] / rss2maildir.py
index 47ba9dc125687576f8759aaeebe0a2e63cca43f4..ff2901c5ffab0414fa563975801c5e6fd76d315f 100755 (executable)
@@ -44,6 +44,8 @@ import md5
 import cgi
 import dbm
 
+import re
+
 from HTMLParser import HTMLParser
 
 class HTML2Text(HTMLParser):
@@ -556,12 +558,15 @@ class HTML2Text(HTMLParser):
             self.opentags.append(u'p')
         self.curdata = self.curdata + data.decode("utf-8")
 
+    def handle_charref(self, name):
+        """Append the character for numeric ref *name* ("65" or hex "x41")."""
+        code = int(name[1:], 16) if name[:1] in ("x", "X") else int(name)
+        self.curdata = self.curdata + unichr(code)
+
     def handle_entityref(self, name):
         entity = name
         if HTML2Text.entities.has_key(name):
             entity = HTML2Text.entities[name]
-        elif name[0] == "#":
-            entity = unichr(int(name[1:]))
         else:
             entity = "&" + name + ";"
 
@@ -722,7 +727,10 @@ def parse_and_deliver(maildir, url, statedir):
             pass
         msg.add_header("Date", createddate)
         subj_gen = HTML2Text()
-        subj_gen.feed(item["title"].encode("utf-8"))
+        title = item["title"].encode("utf-8")
+        title = re.sub(u'<', u'&lt;', title)
+        title = re.sub(u'>', u'&gt;', title)
+        subj_gen.feed(title)
         msg.add_header("Subject", subj_gen.gettext())
         msg.set_default_type("text/plain")