Entity handling fixes
author: Brett Parker <iDunno@sommitrealweird.co.uk>
Sun, 2 Mar 2008 12:27:13 +0000 (12:27 +0000)
committer: Brett Parker <iDunno@sommitrealweird.co.uk>
Sun, 2 Mar 2008 12:27:13 +0000 (12:27 +0000)
* Make entities case sensitive
* Add unittest for simple check of entities
* Add escaping of subject line

rss2maildir.py
tests/expected/entities.txt [new file with mode: 0644]
tests/html/entities.html [new file with mode: 0644]
tests/unittests/EntityTests.py [new file with mode: 0755]
tests/unittests/ParsingTests.py

index df7236a2bcd65421c6ecf5f69a7b429421ceb773..0dfd0f04b0f035224a900ab94d73e400aa2d82e9 100755 (executable)
@@ -520,14 +520,15 @@ class HTML2Text(HTMLParser):
 
     def handle_entityref(self, name):
         entity = name
-        if HTML2Text.entities.has_key(name.lower()):
-            entity = HTML2Text.entities[name.lower()]
+        if HTML2Text.entities.has_key(name):
+            entity = HTML2Text.entities[name]
         elif name[0] == "#":
             entity = unichr(int(name[1:]))
         else:
             entity = "&" + name + ";"
 
-        self.curdata = self.curdata + unicode(entity, "utf-8")
+        self.curdata = self.curdata + unicode(entity.encode('utf-8'), \
+            "utf-8")
 
     def gettext(self):
         self.handle_curdata()
@@ -682,7 +683,9 @@ def parse_and_deliver(maildir, url, statedir):
         except:
             pass
         msg.add_header("Date", createddate)
-        msg.add_header("Subject", item["title"])
+        subj_gen = HTML2Text()
+        subj_gen.feed(item["title"].encode("utf-8"))
+        msg.add_header("Subject", subj_gen.gettext())
         msg.set_default_type("text/plain")
 
         htmlcontent = content.encode("utf-8")
diff --git a/tests/expected/entities.txt b/tests/expected/entities.txt
new file mode 100644 (file)
index 0000000..20e85cd
--- /dev/null
@@ -0,0 +1 @@
+áÞö
diff --git a/tests/html/entities.html b/tests/html/entities.html
new file mode 100644 (file)
index 0000000..77fd2ca
--- /dev/null
@@ -0,0 +1 @@
+<p>&aacute;&THORN;&ouml;</p>
diff --git a/tests/unittests/EntityTests.py b/tests/unittests/EntityTests.py
new file mode 100755 (executable)
index 0000000..b317f4d
--- /dev/null
@@ -0,0 +1,18 @@
+#!/usr/bin/python
+
+import unittest
+import os
+
+import ParsingTests
+
+class EntityTests(ParsingTests.ParsingTest):
+    def testEntities(self):
+        return self.runParsingTest("entities")
+
+def suite():
+    suite = unittest.TestSuite()
+    suite.addTest(EntityTests("testEntities"))
+    return suite
+
+if __name__ == "__main__":
+    unittest.main()
index 20fc521370b8285e0b30e5aba237db403bd47742..fac5ad4d7349a672ffd265b2eb5c2c949b0a1883 100755 (executable)
@@ -18,8 +18,8 @@ class ParsingTest(unittest.TestCase):
             except:
                 self.assert_(False)
         input_path = os.path.sep.join(os.path.dirname(os.path.realpath(__file__)).split(os.path.sep)[0:-1])
-        input = open(os.path.join(input_path, "html", filename + ".html")).read()
-        expectedoutput = open(os.path.join(input_path, "expected", filename + ".txt")).read()
+        input = unicode(open(os.path.join(input_path, "html", filename + ".html")).read(), 'utf-8')
+        expectedoutput = unicode(open(os.path.join(input_path, "expected", filename + ".txt")).read(), 'utf-8')
         parser = HTML2Text()
         parser.feed(input)
         output = parser.gettext()