projects
/
rss2maildir.git
/ blobdiff
commit
grep
author
committer
pickaxe
?
search:
re
summary
|
shortlog
|
log
|
commit
|
commitdiff
|
tree
raw
|
inline
| side by side
Fix some entity handling
[rss2maildir.git]
/
rss2maildir.py
diff --git
a/rss2maildir.py
b/rss2maildir.py
index df7236a2bcd65421c6ecf5f69a7b429421ceb773..9473dd009e3d5ebead31933d7ba110abd4202162 100755
(executable)
--- a/
rss2maildir.py
+++ b/
rss2maildir.py
@@
-137,6
+137,39
@@
class HTML2Text(HTMLParser):
u'THORN': u'Þ',
u'eth': u'ð',
u'ETH': u'Ð',
u'THORN': u'Þ',
u'eth': u'ð',
u'ETH': u'Ð',
+ u'mdash': u'—',
+ u'ndash': u'–',
+ u'sect': u'§',
+ u'para': u'¶',
+ u'uarr': u'↑',
+ u'darr': u'↓',
+ u'larr': u'←',
+ u'rarr': u'→',
+ u'dagger': u'†',
+ u'Dagger': u'‡',
+ u'permil': u'‰',
+ u'prod': u'∏',
+ u'infin': u'∞',
+ u'radic': u'√',
+ u'there4': u'∴',
+ u'int': u'∫',
+ u'asymp': u'≈',
+ u'ne': u'≠',
+ u'equiv': '≡',
+ u'le': u'≤',
+ u'ge': u'≥',
+ u'loz': u'⋄',
+ u'sum': u'∑',
+ u'part': u'∂',
+ u'prime': u'′',
+ u'Prime': u'″',
+ u'harr': u'↔',
+ u'micro': u'µ',
+ u'not': u'¬',
+ u'plusmn': u'±',
+ u'divide': u'÷',
+ u'cent': u'¢',
+ u'euro': u'€',
}
blockleveltags = [
}
blockleveltags = [
@@
-155,7
+188,7
@@
class HTML2Text(HTMLParser):
u'dt',
u'dd',
u'div',
u'dt',
u'dd',
u'div',
-
#
u'blockquote',
+ u'blockquote',
]
liststarttags = [
]
liststarttags = [
@@
-197,6
+230,9
@@
class HTML2Text(HTMLParser):
self.listcount.append(1)
self.listlevel = len(self.listcount) - 1
self.listcount.append(1)
self.listlevel = len(self.listcount) - 1
+ if tag_name == u'dl':
+ self.indentlevel = self.indentlevel + 4
+
if tag_name in self.liststarttags:
smallist = self.opentags[-3:-1]
smallist.reverse()
if tag_name in self.liststarttags:
smallist = self.opentags[-3:-1]
smallist.reverse()
@@
-277,12
+313,11
@@
class HTML2Text(HTMLParser):
else:
while self.images.has_key(alt):
alt = alt + "_"
else:
while self.images.has_key(alt):
alt = alt + "_"
- self.images[alt]
["url"] = url
+ self.images[alt]
= {"url": url}
self.curdata = self.curdata \
+ u'|%s|' %(alt,)
else:
self.curdata = self.curdata \
+ u'|%s|' %(alt,)
else:
- self.images[alt] = {}
- self.images[alt]["url"] = url
+ self.images[alt] = {"url": url}
self.curdata = self.curdata \
+ u'|%s|' %(alt,)
else:
self.curdata = self.curdata \
+ u'|%s|' %(alt,)
else:
@@
-372,11
+407,11
@@
class HTML2Text(HTMLParser):
quote = unicode( \
" ".join(self.curdata.encode("utf-8").strip().split()), \
"utf-8")
quote = unicode( \
" ".join(self.curdata.encode("utf-8").strip().split()), \
"utf-8")
- seperator = u'\n' + u' ' * self.indentlevel + u'
>
'
+ seperator = u'\n' + u' ' * self.indentlevel + u'
'
if len(self.text) > 0 and self.text[-1] != u'\n':
self.text = self.text + u'\n'
self.text = self.text \
if len(self.text) > 0 and self.text[-1] != u'\n':
self.text = self.text + u'\n'
self.text = self.text \
- + u'
>
' \
+ + u'
' \
+ seperator.join( \
textwrap.wrap( \
quote, \
+ seperator.join( \
textwrap.wrap( \
quote, \
@@
-430,12
+465,12
@@
class HTML2Text(HTMLParser):
self.text = self.text + u'\n\n'
elif len(self.text) > 1 and self.text[-2] != u'\n':
self.text = self.text + u'\n'
self.text = self.text + u'\n\n'
elif len(self.text) > 1 and self.text[-2] != u'\n':
self.text = self.text + u'\n'
- definition = u' ' *
self.indentlevel
+ definition + "::"
- indentstring = u'\n' + u' ' * (self.indentlevel
+ 1
)
+ definition = u' ' *
(self.indentlevel - 4)
+ definition + "::"
+ indentstring = u'\n' + u' ' * (self.indentlevel
- 3
)
self.text = self.text \
+ indentstring.join(
textwrap.wrap(definition, \
self.text = self.text \
+ indentstring.join(
textwrap.wrap(definition, \
- self.textwidth - self.indentlevel -
1
))
+ self.textwidth - self.indentlevel -
4
))
self.curdata = u''
elif tag_thats_done == u'dd':
definition = unicode(" ".join( \
self.curdata = u''
elif tag_thats_done == u'dd':
definition = unicode(" ".join( \
@@
-444,13
+479,13
@@
class HTML2Text(HTMLParser):
if len(definition) > 0:
if len(self.text) > 0 and self.text[-1] != u'\n':
self.text = self.text + u'\n'
if len(definition) > 0:
if len(self.text) > 0 and self.text[-1] != u'\n':
self.text = self.text + u'\n'
- indentstring = u'\n' + u' ' *
(self.indentlevel + 4)
+ indentstring = u'\n' + u' ' *
self.indentlevel
self.text = self.text \
self.text = self.text \
- +
u' ' * (self.indentlevel + 4)
\
+ +
indentstring
\
+ indentstring.join( \
textwrap.wrap( \
definition, \
+ indentstring.join( \
textwrap.wrap( \
definition, \
- self.textwidth - self.indentlevel
- 4
\
+ self.textwidth - self.indentlevel \
) \
)
self.curdata = u''
) \
)
self.curdata = u''
@@
-479,8
+514,11
@@
class HTML2Text(HTMLParser):
if tag in [u'br', u'img']:
return
if tag in [u'br', u'img']:
return
+ if tag == u'dl':
+ self.indentlevel = self.indentlevel - 4
+
if tag in self.liststarttags:
if tag in self.liststarttags:
- if tag in [u'ol', u'dl', u'ul']:
+ if tag in [u'ol', u'dl', u'ul'
, u'dd'
]:
self.handle_curdata()
# find if there was a previous list level
smalllist = self.opentags[:-1]
self.handle_curdata()
# find if there was a previous list level
smalllist = self.opentags[:-1]
@@
-518,16
+556,20
@@
class HTML2Text(HTMLParser):
self.opentags.append(u'p')
self.curdata = self.curdata + data.decode("utf-8")
self.opentags.append(u'p')
self.curdata = self.curdata + data.decode("utf-8")
+ def handle_charref(self, name):
+ entity = unichr(int(name))
+ self.curdata = self.curdata + unicode(entity.encode('utf-8'), \
+ "utf-8")
+
def handle_entityref(self, name):
entity = name
def handle_entityref(self, name):
entity = name
- if HTML2Text.entities.has_key(name.lower()):
- entity = HTML2Text.entities[name.lower()]
- elif name[0] == "#":
- entity = unichr(int(name[1:]))
+ if HTML2Text.entities.has_key(name):
+ entity = HTML2Text.entities[name]
else:
entity = "&" + name + ";"
else:
entity = "&" + name + ";"
- self.curdata = self.curdata + unicode(entity, "utf-8")
+ self.curdata = self.curdata + unicode(entity.encode('utf-8'), \
+ "utf-8")
def gettext(self):
self.handle_curdata()
def gettext(self):
self.handle_curdata()
@@
-543,7
+585,7
@@
class HTML2Text(HTMLParser):
self.urls = []
if len(self.images.keys()) > 0:
self.text = self.text + u'\n.. ' \
self.urls = []
if len(self.images.keys()) > 0:
self.text = self.text + u'\n.. ' \
- + u'.. '.join( \
+ + u'
\n
.. '.join( \
["|%s| image:: %s" %(a, self.images[a]["url"]) \
for a in self.images.keys()]) + u'\n'
self.images = {}
["|%s| image:: %s" %(a, self.images[a]["url"]) \
for a in self.images.keys()]) + u'\n'
self.images = {}
@@
-682,7
+724,9
@@
def parse_and_deliver(maildir, url, statedir):
except:
pass
msg.add_header("Date", createddate)
except:
pass
msg.add_header("Date", createddate)
- msg.add_header("Subject", item["title"])
+ subj_gen = HTML2Text()
+ subj_gen.feed(item["title"].encode("utf-8"))
+ msg.add_header("Subject", subj_gen.gettext())
msg.set_default_type("text/plain")
htmlcontent = content.encode("utf-8")
msg.set_default_type("text/plain")
htmlcontent = content.encode("utf-8")