From 85df49e4c373d62bd5e9e6b45e1792a6d637ca25 Mon Sep 17 00:00:00 2001 From: Brett Parker Date: Sat, 1 Mar 2008 20:57:10 +0000 Subject: [PATCH 1/1] Normalise spaces where they should be. --- rss2maildir.py | 18 ++++++++++++------ tests/expected/non-normalised-spacing.txt | 4 ++++ tests/html/non-normalised-spacing.html | 7 +++++++ tests/unittests/SpacingTests.py | 18 ++++++++++++++++++ 4 files changed, 41 insertions(+), 6 deletions(-) create mode 100644 tests/expected/non-normalised-spacing.txt create mode 100644 tests/html/non-normalised-spacing.html create mode 100755 tests/unittests/SpacingTests.py diff --git a/rss2maildir.py b/rss2maildir.py index 2af32bc..ce6c342 100755 --- a/rss2maildir.py +++ b/rss2maildir.py @@ -235,7 +235,7 @@ class HTML2Text(HTMLParser): if tag_thats_done in ["h1", "h2", "h3", "h4", "h5", "h6"]: underline = u'' underlinechar = u'=' - headingtext = self.curdata + headingtext = " ".join(self.curdata.split()) seperator = u'\n' + u' '*self.indentlevel headingtext = seperator.join( \ textwrap.wrap( \ @@ -260,7 +260,8 @@ class HTML2Text(HTMLParser): + underline elif tag_thats_done in [u'p', u'div']: paragraph = unicode( \ - self.curdata.strip().encode("utf-8"), "utf-8") + " ".join(self.curdata.strip().encode("utf-8").split()), \ + "utf-8") seperator = u'\n' + u' ' * self.indentlevel self.text = self.text \ + u' ' * self.indentlevel \ @@ -269,10 +270,11 @@ class HTML2Text(HTMLParser): paragraph, self.textwidth - self.indentlevel)) elif tag_thats_done == "pre": self.text = self.text + unicode( \ - self.curdata.encode("utf-8"), "utf-8") + " ".join(self.curdata.encode("utf-8").split()), "utf-8") elif tag_thats_done == u'blockquote': quote = unicode( \ - self.curdata.encode("utf-8").strip(), "utf-8") + " ".join(self.curdata.encode("utf-8").strip().split()), \ + "utf-8") seperator = u'\n' + u' ' * self.indentlevel + u'> ' if len(self.text) > 0 and self.text[-1] != u'\n': self.text = self.text + u'\n' @@ -324,7 +326,9 @@ class HTML2Text(HTMLParser): ) self.curdata = u'' elif tag_thats_done == u'dt': - definition = unicode(self.curdata.encode("utf-8").strip(), "utf-8") + definition = unicode(" ".join( \ + self.curdata.encode("utf-8").strip().split()), \ + "utf-8") if len(self.text) > 0 and self.text[-1] != u'\n': self.text = self.text + u'\n\n' elif len(self.text) > 1 and self.text[-2] != u'\n': @@ -337,7 +341,9 @@ class HTML2Text(HTMLParser): self.textwidth - self.indentlevel - 1)) self.curdata = u'' elif tag_thats_done == u'dd': - definition = unicode(self.curdata.encode("utf-8").strip(), "utf-8") + definition = unicode(" ".join( \ + self.curdata.encode("utf-8").strip().split()), + "utf-8") if len(definition) > 0: if len(self.text) > 0 and self.text[-1] != u'\n': self.text = self.text + u'\n' diff --git a/tests/expected/non-normalised-spacing.txt b/tests/expected/non-normalised-spacing.txt new file mode 100644 index 0000000..50d8fef --- /dev/null +++ b/tests/expected/non-normalised-spacing.txt @@ -0,0 +1,4 @@ +This has some odd spacing +========================= + +It's really great and hopefully shouldn't be too bad over all diff --git a/tests/html/non-normalised-spacing.html b/tests/html/non-normalised-spacing.html new file mode 100644 index 0000000..0c85190 --- /dev/null +++ b/tests/html/non-normalised-spacing.html @@ -0,0 +1,7 @@ +

This has some odd spacing

+

It's really great and hopefully shouldn't +be + +too bad over +all +

diff --git a/tests/unittests/SpacingTests.py b/tests/unittests/SpacingTests.py new file mode 100755 index 0000000..849e6ba --- /dev/null +++ b/tests/unittests/SpacingTests.py @@ -0,0 +1,18 @@ +#!/usr/bin/python + +import unittest +import os + +import ParsingTests + +class SpacingTests(ParsingTests.ParsingTest): + def testNormalisingSpacing(self): + return self.runParsingTest("non-normalised-spacing") + +def suite(): + suite = unittest.TestSuite() + suite.addTest(SpacingTests("testNormalisingSpacing")) + return suite + +if __name__ == "__main__": + unittest.main() -- 2.39.5