projects
/
rss2maildir.git
/ blobdiff
commit
grep
author
committer
pickaxe
?
search:
re
summary
|
shortlog
|
log
|
commit
|
commitdiff
|
tree
raw
|
inline
| side by side
Patch from MJ Ray for items without link
[rss2maildir.git]
/
rss2maildir.py
diff --git
a/rss2maildir.py
b/rss2maildir.py
index b4acdec086d924bb9175c51dca551caf9ab07913..314a9c19ce4b881ea62fcd28bc34899322980240 100755
(executable)
--- a/
rss2maildir.py
+++ b/
rss2maildir.py
@@
-40,6
+40,8
@@
from ConfigParser import SafeConfigParser
from base64 import b64encode
from base64 import b64encode
+import chardet
+
if sys.version_info[0] == 2 and sys.version_info[1] >= 6:
import hashlib as md5
else:
if sys.version_info[0] == 2 and sys.version_info[1] >= 6:
import hashlib as md5
else:
@@
-275,7
+277,7
@@
class HTML2Text(HTMLParser):
elif tag_name == u'a':
for attr in attrs:
if attr[0].lower() == u'href':
elif tag_name == u'a':
for attr in attrs:
if attr[0].lower() == u'href':
- self.urls.append(attr[1]
.decode('utf-8')
)
+ self.urls.append(attr[1])
self.curdata = self.curdata + u'`'
self.opentags.append(tag_name)
return
self.curdata = self.curdata + u'`'
self.opentags.append(tag_name)
return
@@
-307,9
+309,15
@@
class HTML2Text(HTMLParser):
url = u''
for attr in attrs:
if attr[0] == 'alt':
url = u''
for attr in attrs:
if attr[0] == 'alt':
- alt = attr[1].decode('utf-8')
+ if isinstance(attr[1], str):
+ alt = u'%s' %(attr[1])
+ else:
+ alt = attr[1]
elif attr[0] == 'src':
elif attr[0] == 'src':
- url = attr[1].decode('utf-8')
+ if isinstance(attr[1], str):
+ url = u'%s' %(attr[1])
+ else:
+ url = attr[1]
if url:
if alt:
if self.images.has_key(alt):
if url:
if alt:
if self.images.has_key(alt):
@@
-560,7
+568,7
@@
class HTML2Text(HTMLParser):
def handle_data(self, data):
if len(self.opentags) == 0:
self.opentags.append(u'p')
def handle_data(self, data):
if len(self.opentags) == 0:
self.opentags.append(u'p')
- self.curdata =
self.curdata + data.decode("utf-8"
)
+ self.curdata =
"%s%s" %(self.curdata, data
)
def handle_charref(self, name):
try:
def handle_charref(self, name):
try:
@@
-701,9
+709,18
@@
def parse_and_deliver(maildir, url, statedir):
md5sum = md5.md5(content.encode("utf-8")).hexdigest()
md5sum = md5.md5(content.encode("utf-8")).hexdigest()
+ # make sure content is unicode encoded
+ if not isinstance(content, unicode):
+ cd_res = chardet.detect(content)
+ chrset = cd_res['encoding']
+ print "detected charset %s for item %s" %(chrset, item["link"])
+ content = content.decode(chrset)
+
prevmessageid = None
db_guid_key = None
prevmessageid = None
db_guid_key = None
+ if not item.has_key("link"):
+ item["link"] = u'#' + md5sum
db_link_key = (url + u'|' + item["link"]).encode("utf-8")
# check if there's a guid too - if that exists and we match the md5,
db_link_key = (url + u'|' + item["link"]).encode("utf-8")
# check if there's a guid too - if that exists and we match the md5,
@@
-741,8
+758,8
@@
def parse_and_deliver(maildir, url, statedir):
]) + "@" + socket.gethostname() + ">"
msg.add_header("Message-ID", messageid)
msg.set_unixfrom("\"%s\" <rss2maildir@localhost>" %(url))
]) + "@" + socket.gethostname() + ">"
msg.add_header("Message-ID", messageid)
msg.set_unixfrom("\"%s\" <rss2maildir@localhost>" %(url))
- msg.add_header("From", "\"%s\" <rss2maildir@localhost>" %(author))
- msg.add_header("To", "\"%s\" <rss2maildir@localhost>" %(url))
+ msg.add_header("From", "\"%s\" <rss2maildir@localhost>" %(author
.encode("utf-8")
))
+ msg.add_header("To", "\"%s\" <rss2maildir@localhost>" %(url
.encode("utf-8")
))
if prevmessageid:
msg.add_header("References", prevmessageid)
createddate = datetime.datetime.now() \
if prevmessageid:
msg.add_header("References", prevmessageid)
createddate = datetime.datetime.now() \
@@
-759,7
+776,7
@@
def parse_and_deliver(maildir, url, statedir):
title = item["title"]
title = re.sub(u'<', u'&lt;', title)
title = re.sub(u'>', u'&gt;', title)
title = item["title"]
title = re.sub(u'<', u'&lt;', title)
title = re.sub(u'>', u'&gt;', title)
- subj_gen.feed(title
.encode("utf-8")
)
+ subj_gen.feed(title)
msg.add_header("Subject", subj_gen.gettext())
msg.set_default_type("text/plain")
msg.add_header("Subject", subj_gen.gettext())
msg.set_default_type("text/plain")
@@
-770,7
+787,7
@@
def parse_and_deliver(maildir, url, statedir):
item["link"] )
htmlpart = MIMEText(htmlcontent.encode("utf-8"), "html", "utf-8")
textparser = HTML2Text()
item["link"] )
htmlpart = MIMEText(htmlcontent.encode("utf-8"), "html", "utf-8")
textparser = HTML2Text()
- textparser.feed(content
.encode("utf-8")
)
+ textparser.feed(content)
textcontent = textparser.gettext()
textcontent = "%s\n\nItem URL: %s" %( \
textcontent, \
textcontent = textparser.gettext()
textcontent = "%s\n\nItem URL: %s" %( \
textcontent, \