From: Brett Parker Date: Sun, 25 Aug 2013 09:26:16 +0000 (+0100) Subject: Make sure that we feed the parser unicode data X-Git-Url: https://git.sommitrealweird.co.uk//gitweb/?p=rss2maildir.git;a=commitdiff_plain;h=11a4af2e943412502f127ff9a00fb7662fdd94b9 Make sure that we feed the parser unicode data - Check if we get a unicode string back from feedparser - If not, re-encode so that it is - Remove explicit casting to utf-8 before it goes to the parser --- diff --git a/rss2maildir.py b/rss2maildir.py index a9abe4d..e327bc3 100755 --- a/rss2maildir.py +++ b/rss2maildir.py @@ -40,6 +40,8 @@ from ConfigParser import SafeConfigParser from base64 import b64encode +import chardet + if sys.version_info[0] == 2 and sys.version_info[1] >= 6: import hashlib as md5 else: @@ -275,7 +277,7 @@ class HTML2Text(HTMLParser): elif tag_name == u'a': for attr in attrs: if attr[0].lower() == u'href': - self.urls.append(attr[1].decode('utf-8')) + self.urls.append(attr[1]) self.curdata = self.curdata + u'`' self.opentags.append(tag_name) return @@ -308,12 +310,12 @@ class HTML2Text(HTMLParser): for attr in attrs: if attr[0] == 'alt': if isinstance(attr[1], str): - alt = u'%s' %(attr[1].decode("utf-8")) + alt = u'%s' %(attr[1]) else: alt = attr[1] elif attr[0] == 'src': if isinstance(attr[1], str): - url = u'%s' %(attr[1].decode("utf-8")) + url = u'%s' %(attr[1]) else: url = attr[1] if url: @@ -566,7 +568,7 @@ class HTML2Text(HTMLParser): def handle_data(self, data): if len(self.opentags) == 0: self.opentags.append(u'p') - self.curdata = self.curdata + data.decode("utf-8") + self.curdata = "%s%s" %(self.curdata, data) def handle_charref(self, name): try: @@ -707,6 +709,13 @@ def parse_and_deliver(maildir, url, statedir): md5sum = md5.md5(content.encode("utf-8")).hexdigest() + # make sure content is unicode encoded + if not isinstance(content, unicode): + cd_res = chardet.detect(content) + chrset = cd_res['encoding'] + print "detected charset %s for item %s" %(chrset, item["link"]) + 
content = content.decode(chrset) + prevmessageid = None db_guid_key = None @@ -765,7 +774,7 @@ def parse_and_deliver(maildir, url, statedir): title = item["title"] title = re.sub(u'<', u'&lt;', title) title = re.sub(u'>', u'&gt;', title) - subj_gen.feed(title.encode("utf-8")) + subj_gen.feed(title) msg.add_header("Subject", subj_gen.gettext()) msg.set_default_type("text/plain") @@ -776,7 +785,7 @@ def parse_and_deliver(maildir, url, statedir): item["link"] ) htmlpart = MIMEText(htmlcontent.encode("utf-8"), "html", "utf-8") textparser = HTML2Text() - textparser.feed(content.encode("utf-8")) + textparser.feed(content) textcontent = textparser.gettext() textcontent = "%s\n\nItem URL: %s" %( \ textcontent, \