from ConfigParser import SafeConfigParser
from base64 import b64encode
-import md5
+
+import chardet
+
+import sys
+
+if sys.version_info[0] == 2 and sys.version_info[1] >= 6:
+ import hashlib as md5
+else:
+ import md5
import cgi
import dbm
+import re
+
from HTMLParser import HTMLParser
class HTML2Text(HTMLParser):
u'THORN': u'Þ',
u'eth': u'ð',
u'ETH': u'Ð',
+ u'mdash': u'—',
+ u'ndash': u'–',
+ u'sect': u'§',
+ u'para': u'¶',
+ u'uarr': u'↑',
+ u'darr': u'↓',
+ u'larr': u'←',
+ u'rarr': u'→',
+ u'dagger': u'†',
+ u'Dagger': u'‡',
+ u'permil': u'‰',
+ u'prod': u'∏',
+ u'infin': u'∞',
+ u'radic': u'√',
+ u'there4': u'∴',
+ u'int': u'∫',
+ u'asymp': u'≈',
+ u'ne': u'≠',
+ u'equiv': u'≡',
+ u'le': u'≤',
+ u'ge': u'≥',
+ u'loz': u'◊',
+ u'sum': u'∑',
+ u'part': u'∂',
+ u'prime': u'′',
+ u'Prime': u'″',
+ u'harr': u'↔',
+ u'micro': u'µ',
+ u'not': u'¬',
+ u'plusmn': u'±',
+ u'divide': u'÷',
+ u'cent': u'¢',
+ u'euro': u'€',
}
blockleveltags = [
u'dt',
u'dd',
u'div',
- #u'blockquote',
+ u'blockquote',
]
liststarttags = [
elif tag_name == u'a':
for attr in attrs:
if attr[0].lower() == u'href':
- self.urls.append(attr[1].decode('utf-8'))
+ self.urls.append(attr[1])
self.curdata = self.curdata + u'`'
self.opentags.append(tag_name)
return
url = u''
for attr in attrs:
if attr[0] == 'alt':
- alt = attr[1].decode('utf-8')
+ if isinstance(attr[1], str):
+ alt = u'%s' %(attr[1])
+ else:
+ alt = attr[1]
elif attr[0] == 'src':
- url = attr[1].decode('utf-8')
+ if isinstance(attr[1], str):
+ url = u'%s' %(attr[1])
+ else:
+ url = attr[1]
if url:
if alt:
if self.images.has_key(alt):
else:
while self.images.has_key(alt):
alt = alt + "_"
- self.images[alt]["url"] = url
+ self.images[alt] = {"url": url}
self.curdata = self.curdata \
+ u'|%s|' %(alt,)
else:
- self.images[alt] = {}
- self.images[alt]["url"] = url
+ self.images[alt] = {"url": url}
self.curdata = self.curdata \
+ u'|%s|' %(alt,)
else:
quote = unicode( \
" ".join(self.curdata.encode("utf-8").strip().split()), \
"utf-8")
- seperator = u'\n' + u' ' * self.indentlevel + u'> '
+ seperator = u'\n' + u' ' * self.indentlevel + u' '
if len(self.text) > 0 and self.text[-1] != u'\n':
self.text = self.text + u'\n'
self.text = self.text \
- + u'> ' \
+ + u' ' \
+ seperator.join( \
textwrap.wrap( \
quote, \
def handle_data(self, data):
if len(self.opentags) == 0:
self.opentags.append(u'p')
- self.curdata = self.curdata + data.decode("utf-8")
+ self.curdata = "%s%s" %(self.curdata, data)
+
+ def handle_charref(self, name):
+ try:
+ entity = unichr(int(name))
+ except:
+ if name[0] == 'x':
+ try:
+ entity = unichr(int('0%s' %(name,), 16))
+ except:
+ entity = u'#%s' %(name,)
+ else:
+ entity = u'#%s' %(name,)
+ self.curdata = self.curdata + unicode(entity.encode('utf-8'), \
+ "utf-8")
def handle_entityref(self, name):
entity = name
if HTML2Text.entities.has_key(name):
entity = HTML2Text.entities[name]
- elif name[0] == "#":
- entity = unichr(int(name[1:]))
else:
entity = "&" + name + ";"
self.urls = []
if len(self.images.keys()) > 0:
self.text = self.text + u'\n.. ' \
- + u'.. '.join( \
+ + u'\n.. '.join( \
["|%s| image:: %s" %(a, self.images[a]["url"]) \
for a in self.images.keys()]) + u'\n'
self.images = {}
(type, rest) = urllib.splittype(url)
(host, path) = urllib.splithost(rest)
(host, port) = urllib.splitport(host)
- if port == None:
+ if type == "https":
+ if port == None:
+ port = 443
+ elif port == None:
port = 80
try:
- conn = httplib.HTTPConnection("%s:%s" %(host, port))
+ conn = None
+ if type == "http":
+ conn = httplib.HTTPConnection("%s:%s" %(host, port))
+ else:
+ conn = httplib.HTTPSConnection("%s:%s" %(host, port))
conn.request(method, path)
response = conn.getresponse()
if response.status in [301, 302, 303, 307]:
if item.has_key("content"):
content = item["content"][0]["value"]
else:
- content = item["summary"]
+ if item.has_key("description"):
+ content = item["description"]
+ else:
+ content = u''
+ # make sure content is unicode before hashing: in Python 2,
+ # str.encode("utf-8") implicitly ascii-decodes first and raises
+ # UnicodeDecodeError on non-ascii byte content
+ if not isinstance(content, unicode):
+ cd_res = chardet.detect(content)
+ chrset = cd_res['encoding']
+ print "detected charset %s for item %s" %(chrset, item["link"])
+ content = content.decode(chrset)
+
md5sum = md5.md5(content.encode("utf-8")).hexdigest()
prevmessageid = None
+ db_guid_key = None
+ if not item.has_key("link"):
+ item["link"] = u'#' + md5sum
+ db_link_key = (url + u'|' + item["link"]).encode("utf-8")
+
# check if there's a guid too - if that exists and we match the md5,
# return
if item.has_key("guid"):
- if db.has_key(url + "|" + item["guid"]):
- data = db[url + "|" + item["guid"]]
+ db_guid_key = (url + u'|' + item["guid"]).encode("utf-8")
+ if db.has_key(db_guid_key):
+ data = db[db_guid_key]
data = cgi.parse_qs(data)
if data["contentmd5"][0] == md5sum:
continue
- if db.has_key(url + "|" + item["link"]):
- data = db[url + "|" + item["link"]]
+ if db.has_key(db_link_key):
+ data = db[db_link_key]
data = cgi.parse_qs(data)
if data.has_key("message-id"):
prevmessageid = data["message-id"][0]
]) + "@" + socket.gethostname() + ">"
msg.add_header("Message-ID", messageid)
msg.set_unixfrom("\"%s\" <rss2maildir@localhost>" %(url))
- msg.add_header("From", "\"%s\" <rss2maildir@localhost>" %(author))
- msg.add_header("To", "\"%s\" <rss2maildir@localhost>" %(url))
+ msg.add_header("From", "\"%s\" <rss2maildir@localhost>" %(author.encode("utf-8")))
+ msg.add_header("To", "\"%s\" <rss2maildir@localhost>" %(url.encode("utf-8")))
if prevmessageid:
msg.add_header("References", prevmessageid)
createddate = datetime.datetime.now() \
except:
pass
msg.add_header("Date", createddate)
+ msg.add_header("X-rss2maildir-rundate", datetime.datetime.now() \
+ .strftime("%a, %d %b %Y %H:%M:%S -0000"))
subj_gen = HTML2Text()
- subj_gen.feed(item["title"].encode("utf-8"))
+ title = item["title"]
+ title = re.sub(u'<', u'&lt;', title)
+ title = re.sub(u'>', u'&gt;', title)
+ subj_gen.feed(title)
msg.add_header("Subject", subj_gen.gettext())
msg.set_default_type("text/plain")
item["link"] )
htmlpart = MIMEText(htmlcontent.encode("utf-8"), "html", "utf-8")
textparser = HTML2Text()
- textparser.feed(content.encode("utf-8"))
+ textparser.feed(content)
textcontent = textparser.gettext()
textcontent = "%s\n\nItem URL: %s" %( \
textcontent, \
("created", createddate), \
("contentmd5", md5sum) \
))
- db[url + "|" + item["guid"]] = data
+ db[db_guid_key] = data
try:
- data = db[url + "|" + item["link"]]
+ data = db[db_link_key]
data = cgi.parse_qs(data)
newdata = urllib.urlencode(( \
("message-id", messageid), \
("created", data["created"][0]), \
("contentmd5", data["contentmd5"][0]) \
))
- db[url + "|" + item["link"]] = newdata
+ db[db_link_key] = newdata
except:
- db[url + "|" + item["link"]] = data
+ db[db_link_key] = data
else:
data = urllib.urlencode(( \
("message-id", messageid), \
("created", createddate), \
("contentmd5", md5sum) \
))
- db[url + "|" + item["link"]] = data
+ db[db_link_key] = data
if headers:
data = []