From: Brett Parker Date: Sun, 6 Jan 2008 11:43:44 +0000 (+0000) Subject: * Small improvements to the HTML2Text code X-Git-Url: https://git.sommitrealweird.co.uk/rss2maildir.git/commitdiff_plain/f47e451a2ae7cde0b422266b10b131d00d398e87?ds=sidebyside;hp=286e096cd216854c5a9bc8cdc580e5d3ec447f19 * Small improvements to the HTML2Text code * Reorganize unittests for parsing to make it easier to add more tests later --- diff --git a/rss2maildir.py b/rss2maildir.py index 739c1f3..a51209c 100755 --- a/rss2maildir.py +++ b/rss2maildir.py @@ -123,6 +123,9 @@ class HTML2Text(HTMLParser): if len(self.opentags) == 0: return + if len(self.curdata) == 0: + return + tag_thats_done = self.opentags[-1] if tag_thats_done in self.blockleveltags: @@ -169,6 +172,26 @@ class HTML2Text(HTMLParser): + u'\n '.join( \ textwrap.wrap(item, self.textwidth - 3)) self.curdata = u'' + elif tag_thats_done == "dt": + definition = self.curdata.encode("utf-8").strip() + if len(self.text) > 0 and self.text[-1] != u'\n': + self.text = self.text + u'\n\n' + elif len(self.text) > 0 and self.text[-2] != u'\n': + self.text = self.text + u'\n' + definition = definition + "::" + self.text = self.text \ + + '\n '.join( + textwrap.wrap(definition, self.textwidth - 1)) + self.curdata = u'' + elif tag_thats_done == "dd": + definition = self.curdata.encode("utf-8").strip() + if len(self.text) > 0 and self.text[-1] != u'\n': + self.text = self.text + u'\n' + self.text = self.text \ + + ' ' \ + + '\n '.join( \ + textwrap.wrap(definition, self.textwidth - 4)) + self.curdata = u'' elif tag_thats_done in self.liststarttags: pass else: @@ -224,6 +247,10 @@ class HTML2Text(HTMLParser): if len(self.text) == 0 or self.text[-1] != u'\n': self.text = self.text + u'\n' self.opentags = [] + if len(self.text) > 0: + while len(self.text) > 1 and self.text[-1] == u'\n': + self.text = self.text[:-1] + self.text = self.text + u'\n' return self.text def open_url(method, url): diff --git a/tests/expected/definitionlist-badlyformed.txt b/tests/expected/definitionlist-badlyformed.txt new file mode 100644 index 0000000..3d9e83f --- /dev/null +++ b/tests/expected/definitionlist-badlyformed.txt @@ -0,0 +1,6 @@ +An item:: + It's definition + +Another item:: + And it's got a much longer definition because we like to make sure + that we've got the test wrapping right don't we. diff --git a/tests/expected/definitionlist-wellformed.txt b/tests/expected/definitionlist-wellformed.txt index 98beb1b..3d9e83f 100644 --- a/tests/expected/definitionlist-wellformed.txt +++ b/tests/expected/definitionlist-wellformed.txt @@ -1,6 +1,6 @@ -An item +An item:: It's definition -Another item +Another item:: And it's got a much longer definition because we like to make sure that we've got the test wrapping right don't we. diff --git a/tests/html/definitionlist-badlyformed.html b/tests/html/definitionlist-badlyformed.html new file mode 100644 index 0000000..562bad4 --- /dev/null +++ b/tests/html/definitionlist-badlyformed.html @@ -0,0 +1,6 @@ +
+
An item +
It's definition +
Another item +
And it's got a much longer definition because we like to make sure that we've got the test wrapping right don't we. +
diff --git a/tests/unittests/DefinitionListTests.py b/tests/unittests/DefinitionListTests.py index 9058973..a1abae0 100755 --- a/tests/unittests/DefinitionListTests.py +++ b/tests/unittests/DefinitionListTests.py @@ -1,33 +1,21 @@ #!/usr/bin/python import unittest -import sys import os -class DefinitionListTests(unittest.TestCase): - def setUp(self): - self.inputpath = os.path.sep.join(os.path.dirname(os.path.realpath(__file__)).split(os.path.sep)[0:-1]) +import ParsingTests +class DefinitionListTests(ParsingTests.ParsingTest): def testWellFormedDefinitionList(self): - try: - from rss2maildir import HTML2Text - except: - sys.path.append(os.path.sep.join(self.inputpath.split(os.path.sep)[0:-1])) - try: - from rss2maildir import HTML2Text - except: - self.assert_(False) - input_path = os.path.sep.join(os.path.dirname(os.path.realpath(__file__)).split(os.path.sep)[0:-1]) - input = open(os.path.join(input_path, "html", "definitionlist-wellformed.html")).read() - expectedoutput = open(os.path.join(input_path, "expected", "definitionlist-wellformed.txt")).read() - parser = HTML2Text() - parser.feed(input) - output = parser.gettext() - self.assertEqual(output, expectedoutput) + return self.runParsingTest("definitionlist-wellformed") + + def testBadlyFormedDefinitionList(self): + return self.runParsingTest("definitionlist-badlyformed") def suite(): suite = unittest.TestSuite() suite.addTest(DefinitionListTests("testWellFormedDefinitionList")) + suite.addTest(DefinitionListTests("testBadlyFormedDefinitionList")) return suite if __name__ == "__main__": diff --git a/tests/unittests/ParagraphTests.py b/tests/unittests/ParagraphTests.py index 9111ecd..8b98a53 100755 --- a/tests/unittests/ParagraphTests.py +++ b/tests/unittests/ParagraphTests.py @@ -4,26 +4,11 @@ import unittest import sys import os -class ParagraphTests(unittest.TestCase): - def setUp(self): - self.inputpath = os.path.sep.join(os.path.dirname(os.path.realpath(__file__)).split(os.path.sep)[0:-1]) +import ParsingTests +class ParagraphTests(ParsingTests.ParsingTest): def testWellFormedParagraphs(self): - try: - from rss2maildir import HTML2Text - except: - sys.path.append(os.path.sep.join(self.inputpath.split(os.path.sep)[0:-1])) - try: - from rss2maildir import HTML2Text - except: - self.assert_(False) - input_path = os.path.sep.join(os.path.dirname(os.path.realpath(__file__)).split(os.path.sep)[0:-1]) - input = open(os.path.join(input_path, "html", "multiparagraph-wellformed.html")).read() - expectedoutput = open(os.path.join(input_path, "expected", "multiparagraph-wellformed.txt")).read() - parser = HTML2Text() - parser.feed(input) - output = parser.gettext() - self.assertEqual(output, expectedoutput) + return self.runParsingTest("multiparagraph-wellformed") def suite(): suite = unittest.TestSuite() diff --git a/tests/unittests/ParsingTests.py b/tests/unittests/ParsingTests.py new file mode 100755 index 0000000..20fc521 --- /dev/null +++ b/tests/unittests/ParsingTests.py @@ -0,0 +1,26 @@ +#!/usr/bin/python + +import unittest +import sys +import os + +class ParsingTest(unittest.TestCase): + def setUp(self): + self.inputpath = os.path.sep.join(os.path.dirname(os.path.realpath(__file__)).split(os.path.sep)[0:-1]) + + def runParsingTest(self, filename): + try: + from rss2maildir import HTML2Text + except: + sys.path.append(os.path.sep.join(self.inputpath.split(os.path.sep)[0:-1])) + try: + from rss2maildir import HTML2Text + except: + self.assert_(False) + input_path = os.path.sep.join(os.path.dirname(os.path.realpath(__file__)).split(os.path.sep)[0:-1]) + input = open(os.path.join(input_path, "html", filename + ".html")).read() + expectedoutput = open(os.path.join(input_path, "expected", filename + ".txt")).read() + parser = HTML2Text() + parser.feed(input) + output = parser.gettext() + self.assertEqual(output, expectedoutput) diff --git a/tests/unittests/UnorderedListTests.py b/tests/unittests/UnorderedListTests.py index 2224e74..a495461 100755 --- a/tests/unittests/UnorderedListTests.py +++ b/tests/unittests/UnorderedListTests.py @@ -4,44 +4,14 @@ import unittest import sys import os -class UnorderedListTests(unittest.TestCase): - def setUp(self): - self.inputpath = os.path.sep.join(os.path.dirname(os.path.realpath(__file__)).split(os.path.sep)[0:-1]) +import ParsingTests +class UnorderedListTests(ParsingTests.ParsingTest): def testWellFormedList(self): - try: - from rss2maildir import HTML2Text - except: - sys.path.append(os.path.sep.join(self.inputpath.split(os.path.sep)[0:-1])) - try: - from rss2maildir import HTML2Text - except: - self.assert_(False) - input_path = os.path.sep.join(os.path.dirname(os.path.realpath(__file__)).split(os.path.sep)[0:-1]) - input = open(os.path.join(input_path, "html", "unorderedlist-wellformed.html")).read() - expectedoutput = open(os.path.join(input_path, "expected", "unorderedlist-wellformed.txt")).read() - parser = HTML2Text() - parser.feed(input) - output = parser.gettext() - self.assertEqual(output, expectedoutput) + return self.runParsingTest("unordered-wellformed") def testBadlyFormedList(self): - try: - from rss2maildir import HTML2Text - except: - sys.path.append(os.path.sep.join(self.inputpath.split(os.path.sep)[0:-1])) - try: - from rss2maildir import HTML2Text - except: - self.assert_(False) - - input_path = os.path.sep.join(os.path.dirname(os.path.realpath(__file__)).split(os.path.sep)[0:-1]) - input = open(os.path.join(input_path, "html", "unorderedlist-badlyformed.html")).read() - expectedoutput = open(os.path.join(input_path, "expected", "unorderedlist-badlyformed.txt")).read() - parser = HTML2Text() - parser.feed(input) - output = parser.gettext() - self.assertEqual(output, expectedoutput) + return self.runParsingTest("unordered-badlyformed") def suite(): suite = unittest.TestSuite()