[Zope] Malicious HTML remover and HTML to Text converter

Andy McKay andym@ActiveState.com
Wed, 4 Apr 2001 10:30:01 -0700


Thanks, very useful.
--
  Andy McKay.


----- Original Message -----
From: "Chris Withers" <chrisw@nipltd.com>
To: "Farrell, Troy" <troy.farrell@wilcom.com>; "Steve Drees"
<drees@the-bridge.net>; <zope@zope.org>
Sent: Wednesday, April 04, 2001 9:54 AM
Subject: [Zope] Malicious HTML remover and HTML to Text converter


> Okay,
>
> Coupla people asked for this, so here goes:
>
> Use as follows:
>
> from stripogram import html2text, html2safehtml
>
>    mylumpofdodgyhtml # a lump of dodgy html ;-)
>
>    mylumpofcoolcleancollectedhtml = html2safehtml(mylumpofdodgyhtml,valid_tags=('b', 'a', 'i', 'br', 'p'))
>
>    mylumpoftext = html2text(mylumpofcoolcleancollectedhtml)
>
> cheers,
>
> Chris
>
> PS: Patches to the parser used in html2text gratefully received ;-)


----------------------------------------------------------------------------
----


> __doc__ = """HTML filter thanks to Itamar Shtull-Trauring"""
>
> import sgmllib, string
>
> class HTML2Text(sgmllib.SGMLParser):
>
>     from htmlentitydefs import entitydefs # replace entitydefs from
sgmllib
>
>     def __init__(self):
>         sgmllib.SGMLParser.__init__(self)
>         self.result = ""
>         self.indent = 0
>         self.ol_number = 0
>
>     def add_line(self,text,newline='\n'):
>         self.result = self.result + self.indent*'   ' + text + newline
>
>     def mod_indent(self,i):
>         self.indent = self.indent + i
>         if self.indent < 0:
>             self.indent = 0
>
>     def handle_data(self, data):
>         if data:
>             map(self.add_line,string.split(string.strip(data),'\n'))
>
>     def unknown_starttag(self, tag, attrs):
>         """ Convert HTML to something meaningful in plain text """
>         tag = string.lower(tag)
>
>         if tag[0]=='h' or tag in ['br','pre','p','hr']:
>             # insert a blank line
>             self.add_line('')
>
>         elif tag =='img':
>             # newline, text, newline
>             src = ''
>
>             for k, v in attrs:
>                 if string.lower(k) == 'src':
>                     src = v
>
>             self.add_line('')
>             self.add_line('Image: %s' % src)
>
>         elif tag =='li':
>             self.add_line('')
>             if self.ol_number:
>                 # num - text
>                 self.add_line('%s - ' % self.ol_number,'')
>                 self.ol_number = self.ol_number + 1
>             else:
>                 # - text
>                 self.add_line('- ','')
>
>         elif tag in ['dd','dt']:
>             self.add_line('')
>             # increase indent
>             self.mod_indent(+1)
>
>         elif tag in ['ul','dl','ol']:
>             # blank line
>             #self.add_line('')
>             # increase indent
>             self.mod_indent(+1)
>             if tag=='ol':
>                 self.ol_number = 1
>
>     def unknown_endtag(self, tag):
>         """ Convert HTML to something meaningful in plain text """
>         tag = string.lower(tag)
>
>         if tag[0]=='h' or tag in ['pre']:
>             # newline, text, newline
>             self.add_line('')
>
>         elif tag =='li':
>             #self.add_line('')
>             pass
>
>         elif tag in ['dd','dt']:
>             #self.add_line('')
>             # descrease indent
>             self.mod_indent(-1)
>
>         elif tag in ['ul','dl','ol']:
>             # blank line
>             #self.add_line('')
>             # decrease indent
>             self.mod_indent(-1)
>             self.ol_number = 0
>
> class StrippingParser(sgmllib.SGMLParser):
>
>     from htmlentitydefs import entitydefs # replace entitydefs from
sgmllib
>
>     def __init__(self):
>         sgmllib.SGMLParser.__init__(self)
>         self.result = ""
>         self.endTagList = []
>
>     def handle_data(self, data):
>         if data:
>             self.result = self.result + data
>
>     def handle_charref(self, name):
>         self.result = "%s&#%s;" % (self.result, name)
>
>     def handle_entityref(self, name):
>         if self.entitydefs.has_key(name):
>             x = ';'
>         else:
>             # this breaks unstandard entities that end with ';'
>             x = ''
>         self.result = "%s&%s%s" % (self.result, name, x)
>
>     def unknown_starttag(self, tag, attrs):
>         """ Delete all tags except for legal ones """
>         if string.lower(tag) in self.valid_tags:
>             self.result = self.result + '<' + tag
>             for k, v in attrs:
>                 if string.lower(k[0:2]) != 'on' and string.lower(v[0:10])
!= 'javascript':
>                     self.result = '%s %s="%s"' % (self.result, k, v)
>             endTag = '</%s>' % tag
>             self.endTagList.insert(0,endTag)
>             self.result = self.result + '>'
>
>     def unknown_endtag(self, tag):
>         if string.lower(tag) in self.valid_tags:
>             self.result = "%s</%s>" % (self.result, tag)
>             remTag = '</%s>' % tag
>             self.endTagList.remove(remTag)
>
>     def cleanup(self):
>         """ Append missing closing tags """
>         for j in range(len(self.endTagList)):
>                 self.result = self.result + self.endTagList[j]
>
def html2text(s):
    """Convert the HTML string ``s`` to plain text and return the text.

    This is a module-level function called with the markup only, as in
    ``html2text(markup)``.  It previously declared a stray ``self``
    parameter and looked the parser class up on an undefined name
    ``striphtml``, so it could not be called successfully.
    """
    parser = HTML2Text()
    parser.feed(s)
    # close() makes the parser process any input it is still buffering.
    parser.close()
    return parser.result
>
def html2safehtml(s, valid_tags=('b', 'a', 'i', 'br', 'p')):
    """Return the HTML string ``s`` with every tag not in ``valid_tags`` removed.

    The work is done by a StrippingParser; after parsing, its cleanup()
    step appends closing tags that are still missing.
    """
    stripper = StrippingParser()
    # The whitelist is handed over as a plain attribute on the instance.
    stripper.valid_tags = valid_tags

    stripper.feed(s)
    stripper.close()
    stripper.cleanup()

    safe_html = stripper.result
    return safe_html
>