* Make entities case sensitive
* Add unittest for simple check of entities
* Add escaping of subject line
def handle_entityref(self, name):
entity = name
def handle_entityref(self, name):
entity = name
- if HTML2Text.entities.has_key(name.lower()):
- entity = HTML2Text.entities[name.lower()]
+ if HTML2Text.entities.has_key(name):
+ entity = HTML2Text.entities[name]
elif name[0] == "#":
entity = unichr(int(name[1:]))
else:
entity = "&" + name + ";"
elif name[0] == "#":
entity = unichr(int(name[1:]))
else:
entity = "&" + name + ";"
- self.curdata = self.curdata + unicode(entity, "utf-8")
+ self.curdata = self.curdata + unicode(entity.encode('utf-8'), \
+ "utf-8")
def gettext(self):
self.handle_curdata()
def gettext(self):
self.handle_curdata()
except:
pass
msg.add_header("Date", createddate)
except:
pass
msg.add_header("Date", createddate)
- msg.add_header("Subject", item["title"])
+ # Pass the title through HTML2Text (feed + gettext) and use the resulting
+ # text as the Subject header instead of the raw feed title.
+ subj_gen = HTML2Text()
+ subj_gen.feed(item["title"].encode("utf-8"))
+ msg.add_header("Subject", subj_gen.gettext())
msg.set_default_type("text/plain")
htmlcontent = content.encode("utf-8")
msg.set_default_type("text/plain")
htmlcontent = content.encode("utf-8")
--- /dev/null
+<p>áÞö</p>
--- /dev/null
+#!/usr/bin/python
+"""Unit tests for entity handling, run through the shared parsing test."""
+
+import unittest
+import os
+
+import ParsingTests
+
+
+class EntityTests(ParsingTests.ParsingTest):
+    """Test case for the "entities" parsing test."""
+
+    def testEntities(self):
+        """Run the shared parsing test for the "entities" case."""
+        return self.runParsingTest("entities")
+
+
+def suite():
+    """Return a unittest.TestSuite holding the entity test from this module."""
+    tests = unittest.TestSuite()
+    # The test method lives on EntityTests; SpacingTests is neither defined
+    # nor imported in this module and would raise NameError here.
+    tests.addTest(EntityTests("testEntities"))
+    return tests
+
+
+if __name__ == "__main__":
+    unittest.main()
except:
self.assert_(False)
input_path = os.path.sep.join(os.path.dirname(os.path.realpath(__file__)).split(os.path.sep)[0:-1])
except:
self.assert_(False)
input_path = os.path.sep.join(os.path.dirname(os.path.realpath(__file__)).split(os.path.sep)[0:-1])
- input = open(os.path.join(input_path, "html", filename + ".html")).read()
- expectedoutput = open(os.path.join(input_path, "expected", filename + ".txt")).read()
+ input = unicode(open(os.path.join(input_path, "html", filename + ".html")).read(), 'utf-8')
+ expectedoutput = unicode(open(os.path.join(input_path, "expected", filename + ".txt")).read(), 'utf-8')
parser = HTML2Text()
parser.feed(input)
output = parser.gettext()
parser = HTML2Text()
parser.feed(input)
output = parser.gettext()