from base64 import b64encode
+import chardet
+
if sys.version_info[0] == 2 and sys.version_info[1] >= 6:
import hashlib as md5
else:
elif tag_name == u'a':
for attr in attrs:
if attr[0].lower() == u'href':
- self.urls.append(attr[1].decode('utf-8'))
+ self.urls.append(attr[1])
self.curdata = self.curdata + u'`'
self.opentags.append(tag_name)
return
for attr in attrs:
if attr[0] == 'alt':
if isinstance(attr[1], str):
- alt = u'%s' %(attr[1].decode("utf-8"))
+ alt = u'%s' %(attr[1])
else:
alt = attr[1]
elif attr[0] == 'src':
if isinstance(attr[1], str):
- url = u'%s' %(attr[1].decode("utf-8"))
+ url = u'%s' %(attr[1])
else:
url = attr[1]
if url:
def handle_data(self, data):
+    """Append a run of character data to the text collected so far.
+
+    If no tag is currently open, an implicit u'p' tag is opened first.
+    `data` may be a unicode object or a byte string; byte strings are
+    decoded as UTF-8 before being appended, so non-ASCII bytes are never
+    pushed through Python 2's implicit ASCII codec.
+    """
    if len(self.opentags) == 0:
        self.opentags.append(u'p')
-    self.curdata = self.curdata + data.decode("utf-8")
+    # Decode only byte strings: calling .decode() on a unicode object
+    # would first encode it as ASCII and fail on non-ASCII text.
+    if isinstance(data, str):
+        data = data.decode("utf-8")
+    self.curdata = "%s%s" %(self.curdata, data)
def handle_charref(self, name):
try:
md5sum = md5.md5(content.encode("utf-8")).hexdigest()
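+ # make sure content is a unicode object: byte strings are decoded
+ # using the charset that chardet detects
+ # NOTE(review): chardet.detect() may report an encoding of None when
+ # it cannot guess; confirm that case is handled before decode()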
+ if not isinstance(content, unicode):
+ cd_res = chardet.detect(content)
+ chrset = cd_res['encoding']
+ print "detected charset %s for item %s" %(chrset, item["link"])
+ content = content.decode(chrset)
+
prevmessageid = None
db_guid_key = None
title = item["title"]
title = re.sub(u'<', u'<', title)
title = re.sub(u'>', u'>', title)
- subj_gen.feed(title.encode("utf-8"))
+ subj_gen.feed(title)
msg.add_header("Subject", subj_gen.gettext())
msg.set_default_type("text/plain")
item["link"] )
htmlpart = MIMEText(htmlcontent.encode("utf-8"), "html", "utf-8")
textparser = HTML2Text()
- textparser.feed(content.encode("utf-8"))
+ textparser.feed(content)
textcontent = textparser.gettext()
textcontent = "%s\n\nItem URL: %s" %( \
textcontent, \