small fix to put images on separate lines
[rss2maildir.git] / rss2maildir.py
index df7236a2bcd65421c6ecf5f69a7b429421ceb773..dce6c3c7a6c9afb7578887f8296575a0b32bcb42 100755 (executable)
@@ -197,6 +197,9 @@ class HTML2Text(HTMLParser):
                 self.listcount.append(1)
                 self.listlevel = len(self.listcount) - 1
 
+            if tag_name == u'dl':
+                self.indentlevel = self.indentlevel + 4
+
             if tag_name in self.liststarttags:
                 smallist = self.opentags[-3:-1]
                 smallist.reverse()
@@ -430,12 +433,12 @@ class HTML2Text(HTMLParser):
                 self.text = self.text + u'\n\n'
             elif len(self.text) > 1 and self.text[-2] != u'\n':
                 self.text = self.text + u'\n'
-            definition = u' ' * self.indentlevel + definition + "::"
-            indentstring = u'\n' + u' ' * (self.indentlevel + 1)
+            definition = u' ' * (self.indentlevel - 4) + definition + "::"
+            indentstring = u'\n' + u' ' * (self.indentlevel - 3)
             self.text = self.text \
                 + indentstring.join(
                     textwrap.wrap(definition, \
-                        self.textwidth - self.indentlevel - 1))
+                        self.textwidth - self.indentlevel - 4))
             self.curdata = u''
         elif tag_thats_done == u'dd':
             definition = unicode(" ".join( \
@@ -444,13 +447,13 @@ class HTML2Text(HTMLParser):
             if len(definition) > 0:
                 if len(self.text) > 0 and self.text[-1] != u'\n':
                     self.text = self.text + u'\n'
-                indentstring = u'\n' + u' ' * (self.indentlevel + 4)
+                indentstring = u'\n' + u' ' * self.indentlevel
                 self.text = self.text \
-                    + u' ' * (self.indentlevel + 4) \
+                    + indentstring \
                     + indentstring.join( \
                         textwrap.wrap( \
                             definition, \
-                            self.textwidth - self.indentlevel - 4 \
+                            self.textwidth - self.indentlevel \
                             ) \
                         )
                 self.curdata = u''
@@ -479,8 +482,11 @@ class HTML2Text(HTMLParser):
         if tag in [u'br', u'img']:
             return
 
+        if tag == u'dl':
+            self.indentlevel = self.indentlevel - 4
+
         if tag in self.liststarttags:
-            if tag in [u'ol', u'dl', u'ul']:
+            if tag in [u'ol', u'dl', u'ul', u'dd']:
                 self.handle_curdata()
                 # find if there was a previous list level
                 smalllist = self.opentags[:-1]
@@ -520,14 +526,15 @@ class HTML2Text(HTMLParser):
 
     def handle_entityref(self, name):
         entity = name
-        if HTML2Text.entities.has_key(name.lower()):
-            entity = HTML2Text.entities[name.lower()]
+        if HTML2Text.entities.has_key(name):
+            entity = HTML2Text.entities[name]
         elif name[0] == "#":
             entity = unichr(int(name[1:]))
         else:
             entity = "&" + name + ";"
 
-        self.curdata = self.curdata + unicode(entity, "utf-8")
+        self.curdata = self.curdata + unicode(entity.encode('utf-8'), \
+            "utf-8")
 
     def gettext(self):
         self.handle_curdata()
@@ -543,7 +550,7 @@ class HTML2Text(HTMLParser):
             self.urls = []
         if len(self.images.keys()) > 0:
             self.text = self.text + u'\n.. ' \
-                + u'.. '.join( \
+                + u'\n.. '.join( \
                     ["|%s| image:: %s" %(a, self.images[a]["url"]) \
                 for a in self.images.keys()]) + u'\n'
             self.images = {}
@@ -682,7 +689,9 @@ def parse_and_deliver(maildir, url, statedir):
         except:
             pass
         msg.add_header("Date", createddate)
-        msg.add_header("Subject", item["title"])
+        subj_gen = HTML2Text()
+        subj_gen.feed(item["title"].encode("utf-8"))
+        msg.add_header("Subject", subj_gen.gettext())
         msg.set_default_type("text/plain")
 
         htmlcontent = content.encode("utf-8")