- if tag.lower() == "h1":
- self.inheadingone = True
- self.inparagraph = False
- elif tag.lower() == "h2":
- self.inheadingtwo = True
- self.inparagraph = False
- elif tag.lower() in ["h3", "h4", "h5", "h6"]:
- self.inotherheading = True
- self.inparagraph = False
- elif tag.lower() == "a":
- self.inlink = True
- elif tag.lower() == "br":
- if self.inparagraph:
- self.text = self.text \
- + u'\n'.join( \
- textwrap.wrap(self.currentparagraph, 70)) \
- + u'\n'
- self.currentparagraph = ""
- elif self.inblockquote:
- self.text = self.text \
- + u'\n> ' \
- + u'\n> '.join( \
- [a.strip() \
- for a in textwrap.wrap(self.blockquote, 68) \
- ]) \
- + u'\n'
- self.blockquote = u''
- else:
- self.text = self.text + u'\n'
- elif tag.lower() == "blockquote":
- self.inblockquote = True
- self.text = self.text + u'\n'
- elif tag.lower() == "p":
- if self.text != "":
- self.text = self.text + u'\n\n'
- if self.inparagraph:
- self.text = self.text \
- + u'\n'.join(textwrap.wrap(self.currentparagraph, 70))
- self.currentparagraph = u''
- self.inparagraph = True
- elif tag.lower() == "pre":
- self.text = self.text + "\n"
- self.inpre = True
- self.inparagraph = False
- self.inblockquote = False
- elif tag.lower() == "ul":
- self.item = u''
- self.inul = True
- self.text = self.text + "\n"
- elif tag.lower() == "li" and self.inul:
- if not self.initem:
- self.initem = True
- self.item = u''
+ tag_name = tag.lower()
+ if tag_name in self.blockleveltags:
+ # handle starting a new block - unless we're in a block element
+ # that can contain other blocks, we'll assume that we want to close
+ # the container
+ if len(self.opentags) > 1 and self.opentags[-1] == u'li':
+ self.handle_curdata()
+
+ if tag_name == u'ol':
+ self.handle_curdata()
+ self.listcount.append(1)
+ self.listlevel = len(self.listcount) - 1
+
+ if tag_name in self.liststarttags:
+ smallist = self.opentags[-3:-1]
+ smallist.reverse()
+ for prev_listtag in smallist:
+ if prev_listtag in [u'dl', u'ol']:
+ self.indentlevel = self.indentlevel + 4
+ break
+ elif prev_listtag == u'ul':
+ self.indentlevel = self.indentlevel + 3
+ break
+
+ if len(self.opentags) > 0:
+ self.handle_curdata()
+ if tag_name not in self.cancontainflow:
+ self.opentags.pop()
+ self.opentags.append(tag_name)
+ else:
+ if tag_name == "span":
+ return
+ listcount = 0
+ try:
+ listcount = self.listcount[-1]
+ except:
+ pass
+
+ if tag_name == u'dd' and len(self.opentags) > 1 \
+ and self.opentags[-1] == u'dt':
+ self.handle_curdata()
+ self.opentags.pop()
+ elif tag_name == u'dt' and len(self.opentags) > 1 \
+ and self.opentags[-1] == u'dd':
+ self.handle_curdata()
+ self.opentags.pop()
+ elif tag_name == u'a':
+ for attr in attrs:
+ if attr[0].lower() == u'href':
+ self.urls.append(attr[1].decode('utf-8'))
+ self.curdata = self.curdata + u'`'
+ self.opentags.append(tag_name)
+ return
+ elif tag_name == u'img':
+ self.handle_image(attrs)
+ return
+ elif tag_name == u'br':
+ self.handle_br()
+ return