X-Git-Url: https://git.sommitrealweird.co.uk/rss2maildir.git/blobdiff_plain/9337881574f8f9370fecabfeb9a52e4341568cd9..11a4af2e943412502f127ff9a00fb7662fdd94b9:/rss2maildir.py?ds=sidebyside diff --git a/rss2maildir.py b/rss2maildir.py index 9473dd0..e327bc3 100755 --- a/rss2maildir.py +++ b/rss2maildir.py @@ -39,11 +39,19 @@ from optparse import OptionParser from ConfigParser import SafeConfigParser from base64 import b64encode -import md5 + +import chardet + +if sys.version_info[0] == 2 and sys.version_info[1] >= 6: + import hashlib as md5 +else: + import md5 import cgi import dbm +import re + from HTMLParser import HTMLParser class HTML2Text(HTMLParser): @@ -269,7 +277,7 @@ class HTML2Text(HTMLParser): elif tag_name == u'a': for attr in attrs: if attr[0].lower() == u'href': - self.urls.append(attr[1].decode('utf-8')) + self.urls.append(attr[1]) self.curdata = self.curdata + u'`' self.opentags.append(tag_name) return @@ -301,9 +309,15 @@ class HTML2Text(HTMLParser): url = u'' for attr in attrs: if attr[0] == 'alt': - alt = attr[1].decode('utf-8') + if isinstance(attr[1], str): + alt = u'%s' %(attr[1]) + else: + alt = attr[1] elif attr[0] == 'src': - url = attr[1].decode('utf-8') + if isinstance(attr[1], str): + url = u'%s' %(attr[1]) + else: + url = attr[1] if url: if alt: if self.images.has_key(alt): @@ -554,10 +568,19 @@ class HTML2Text(HTMLParser): def handle_data(self, data): if len(self.opentags) == 0: self.opentags.append(u'p') - self.curdata = self.curdata + data.decode("utf-8") + self.curdata = "%s%s" %(self.curdata, data) def handle_charref(self, name): - entity = unichr(int(name)) + try: + entity = unichr(int(name)) + except: + if name[0] == 'x': + try: + entity = unichr(int('0%s' %(name,), 16)) + except: + entity = u'#%s' %(name,) + else: + entity = u'#%s' %(name,) self.curdata = self.curdata + unicode(entity.encode('utf-8'), \ "utf-8") @@ -597,10 +620,17 @@ def open_url(method, url): (type, rest) = urllib.splittype(url) (host, path) = urllib.splithost(rest) (host, port) = urllib.splitport(host) - if port == None: + if type == "https": + if port == None: + port = 443 + elif port == None: port = 80 try: - conn = httplib.HTTPConnection("%s:%s" %(host, port)) + conn = None + if type == "http": + conn = httplib.HTTPConnection("%s:%s" %(host, port)) + else: + conn = httplib.HTTPSConnection("%s:%s" %(host, port)) conn.request(method, path) response = conn.getresponse() if response.status in [301, 302, 303, 307]: @@ -672,23 +702,37 @@ def parse_and_deliver(maildir, url, statedir): if item.has_key("content"): content = item["content"][0]["value"] else: - content = item["summary"] + if item.has_key("description"): + content = item["description"] + else: + content = u'' md5sum = md5.md5(content.encode("utf-8")).hexdigest() + # make sure content is unicode encoded + if not isinstance(content, unicode): + cd_res = chardet.detect(content) + chrset = cd_res['encoding'] + print "detected charset %s for item %s" %(chrset, item["link"]) + content = content.decode(chrset) + prevmessageid = None + db_guid_key = None + db_link_key = (url + u'|' + item["link"]).encode("utf-8") + # check if there's a guid too - if that exists and we match the md5, # return if item.has_key("guid"): - if db.has_key(url + "|" + item["guid"]): - data = db[url + "|" + item["guid"]] + db_guid_key = (url + u'|' + item["guid"]).encode("utf-8") + if db.has_key(db_guid_key): + data = db[db_guid_key] data = cgi.parse_qs(data) if data["contentmd5"][0] == md5sum: continue - if db.has_key(url + "|" + item["link"]): - data = db[url + "|" + item["link"]] + if db.has_key(db_link_key): + data = db[db_link_key] data = cgi.parse_qs(data) if data.has_key("message-id"): prevmessageid = data["message-id"][0] @@ -712,8 +756,8 @@ def parse_and_deliver(maildir, url, statedir): ]) + "@" + socket.gethostname() + ">" msg.add_header("Message-ID", messageid) msg.set_unixfrom("\"%s\" " %(url)) - msg.add_header("From", "\"%s\" " %(author)) - msg.add_header("To", "\"%s\" " %(url)) + msg.add_header("From", "\"%s\" " %(author.encode("utf-8"))) + msg.add_header("To", "\"%s\" " %(url.encode("utf-8"))) if prevmessageid: msg.add_header("References", prevmessageid) createddate = datetime.datetime.now() \ @@ -724,8 +768,13 @@ def parse_and_deliver(maildir, url, statedir): except: pass msg.add_header("Date", createddate) + msg.add_header("X-rss2maildir-rundate", datetime.datetime.now() \ + .strftime("%a, %e %b %Y %T -0000")) subj_gen = HTML2Text() - subj_gen.feed(item["title"].encode("utf-8")) + title = item["title"] + title = re.sub(u'<', u'<', title) + title = re.sub(u'>', u'>', title) + subj_gen.feed(title) msg.add_header("Subject", subj_gen.gettext()) msg.set_default_type("text/plain") @@ -736,7 +785,7 @@ def parse_and_deliver(maildir, url, statedir): item["link"] ) htmlpart = MIMEText(htmlcontent.encode("utf-8"), "html", "utf-8") textparser = HTML2Text() - textparser.feed(content.encode("utf-8")) + textparser.feed(content) textcontent = textparser.gettext() textcontent = "%s\n\nItem URL: %s" %( \ textcontent, \ @@ -773,25 +822,25 @@ def parse_and_deliver(maildir, url, statedir): ("created", createddate), \ ("contentmd5", md5sum) \ )) - db[url + "|" + item["guid"]] = data + db[db_guid_key] = data try: - data = db[url + "|" + item["link"]] + data = db[db_link_key] data = cgi.parse_qs(data) newdata = urllib.urlencode(( \ ("message-id", messageid), \ ("created", data["created"][0]), \ ("contentmd5", data["contentmd5"][0]) \ )) - db[url + "|" + item["link"]] = newdata + db[db_link_key] = newdata except: - db[url + "|" + item["link"]] = data + db[db_link_key] = data else: data = urllib.urlencode(( \ ("message-id", messageid), \ ("created", createddate), \ ("contentmd5", md5sum) \ )) - db[url + "|" + item["link"]] = data + db[db_link_key] = data if headers: data = []