From b85e3c712d2cc41730c1f0bb619b58663b6b7b48 Mon Sep 17 00:00:00 2001 From: Brett Parker Date: Sun, 2 Mar 2008 12:27:13 +0000 Subject: [PATCH] Entity handling fixes * Make entities case sensitive * Add unittest for simple check of entities * Add escaping of subject line --- rss2maildir.py | 11 +++++++---- tests/expected/entities.txt | 1 + tests/html/entities.html | 1 + tests/unittests/EntityTests.py | 18 ++++++++++++++++++ tests/unittests/ParsingTests.py | 4 ++-- 5 files changed, 29 insertions(+), 6 deletions(-) create mode 100644 tests/expected/entities.txt create mode 100644 tests/html/entities.html create mode 100755 tests/unittests/EntityTests.py diff --git a/rss2maildir.py b/rss2maildir.py index df7236a..0dfd0f0 100755 --- a/rss2maildir.py +++ b/rss2maildir.py @@ -520,14 +520,15 @@ class HTML2Text(HTMLParser): def handle_entityref(self, name): entity = name - if HTML2Text.entities.has_key(name.lower()): - entity = HTML2Text.entities[name.lower()] + if HTML2Text.entities.has_key(name): + entity = HTML2Text.entities[name] elif name[0] == "#": entity = unichr(int(name[1:])) else: entity = "&" + name + ";" - self.curdata = self.curdata + unicode(entity, "utf-8") + self.curdata = self.curdata + unicode(entity.encode('utf-8'), \ + "utf-8") def gettext(self): self.handle_curdata() @@ -682,7 +683,9 @@ def parse_and_deliver(maildir, url, statedir): except: pass msg.add_header("Date", createddate) - msg.add_header("Subject", item["title"]) + subj_gen = HTML2Text() + subj_gen.feed(item["title"].encode("utf-8")) + msg.add_header("Subject", subj_gen.gettext()) msg.set_default_type("text/plain") htmlcontent = content.encode("utf-8") diff --git a/tests/expected/entities.txt b/tests/expected/entities.txt new file mode 100644 index 0000000..20e85cd --- /dev/null +++ b/tests/expected/entities.txt @@ -0,0 +1 @@ +áÞö diff --git a/tests/html/entities.html b/tests/html/entities.html new file mode 100644 index 0000000..77fd2ca --- /dev/null +++ b/tests/html/entities.html @@ -0,0 +1 
@@ +

áÞö

diff --git a/tests/unittests/EntityTests.py b/tests/unittests/EntityTests.py new file mode 100755 index 0000000..b317f4d --- /dev/null +++ b/tests/unittests/EntityTests.py @@ -0,0 +1,18 @@ +#!/usr/bin/python + +import unittest +import os + +import ParsingTests + +class EntityTests(ParsingTests.ParsingTest): + def testEntities(self): + return self.runParsingTest("entities") + +def suite(): + suite = unittest.TestSuite() + suite.addTest(EntityTests("testEntities")) + return suite + +if __name__ == "__main__": + unittest.main() diff --git a/tests/unittests/ParsingTests.py b/tests/unittests/ParsingTests.py index 20fc521..fac5ad4 100755 --- a/tests/unittests/ParsingTests.py +++ b/tests/unittests/ParsingTests.py @@ -18,8 +18,8 @@ class ParsingTest(unittest.TestCase): except: self.assert_(False) input_path = os.path.sep.join(os.path.dirname(os.path.realpath(__file__)).split(os.path.sep)[0:-1]) - input = open(os.path.join(input_path, "html", filename + ".html")).read() - expectedoutput = open(os.path.join(input_path, "expected", filename + ".txt")).read() + input = unicode(open(os.path.join(input_path, "html", filename + ".html")).read(), 'utf-8') + expectedoutput = unicode(open(os.path.join(input_path, "expected", filename + ".txt")).read(), 'utf-8') parser = HTML2Text() parser.feed(input) output = parser.gettext() -- 2.39.5