[Zope-CVS] CVS: Packages/HTMLStructure - Parser.py:1.1 Printer.py:1.1 Validator.py:1.1 __init__.py:1.1

Evan Simpson evan@zope.com
Wed, 23 Jan 2002 10:57:57 -0500


Update of /cvs-repository/Packages/HTMLStructure
In directory cvs.zope.org:/tmp/cvs-serv29501

Added Files:
	Parser.py Printer.py Validator.py __init__.py 
Log Message:
Initial checkin.


=== Added File Packages/HTMLStructure/Parser.py ===
from mx.TextTools import *

# Utility functions

from bisect import bisect
def getLines(text):
    def add(taglist,text,l,r,subtags):
        taglist.append(r)
    pt_lines = (
        (None, AllNotIn, '\n', +1),
        (add, Is + CallTag, '\n', MatchOk, -1),
        )
    return tag(text, pt_lines)[1]

class ParsedText:
    def __init__(self, text, parseTables=None):
        self.text = text
        if parseTables is None:
            parseTables = ParseTables()
        self.parse = tag(text, parseTables.pt_html)
        self.linemap = getLines(text)

    def lineOfChar(self, index):
        '''Convert a character index into a line number.'''
        return bisect(self.linemap, index)

    def elementText(self, element):
        '''Get the text for a parse element.'''
        return self.text[element[1]:element[2]]

    def tagName(self, tag):
        '''Get the name of a tag parse element.'''
        return self.elementText(tag[3][0][3][0]).lower()

    def tagType(self, tag, is_empty=None):
        '''Get the open/close/empty type of a tag.

        Return None for elements that are not tags, or not one of
        these three types of tag.
        '''
        if tag[0] != 'tag':
            return
        sub = tag[3][0]
        # The first sub-element of a tag is the type.
        ttype = sub[0]
        # We only care about opening, closing, and empty tags
        if ttype == 'close':
            return ttype
        if ttype == 'open':
            # Check whether an open tag is empty
            if (sub[3][-1][0] == 'empty' or
                (is_empty and is_empty(self.tagName(tag))) ):
                return 'empty'
            return ttype

# Parse HTML.  Needs to be read from bottom to top.

hexadecimal_set = set(number + 'abcdefABCDEF')

namestart_set = set(alpha + ':_')
name_set = set(alphanumeric + '.-_:')

basic_ws_set = set(' \t\r\n')
opt_ws = (None, AllInSet, basic_ws_set, +1)
req_ws = (None, AllInSet, basic_ws_set)

class ParseTables:
    def __init__(self, handle_map=None):
        '''Transform the parse table tuples so that they are linked,
        and insert specified handlers.

        "handle_map" must be a dictionary with keys of the form
        (tablename, tuplename).  The values will replace the first
        element of the named tuples, and cause them to be used as
        callbacks if they are callable.
        '''
        if handle_map is None:
            handle_map = {}
        todo = {}
        for k, v in self.__class__.__dict__.items():
            if k.startswith('pt_'):
                todo[k] = v
        done = {}
        for k in todo.keys():
            self.process(k, todo, {}, done, handle_map)
            setattr(self, k, done[k])

    def process(self, current, todo, working, done, handle_map):
        if done.get(current) is not None:
            return
        if working.get(current):
            raise ValueError, ('Circular table reference found at %s'
                               % `current`)
        working[current] = 1
        new_tuples = []
        for tup in todo[current]:
            handler = handle_map.get((current, tup[0]))
            extra = 0
            if callable(handler):
                extra = CallTag
            if handler:
                tup = (handler, tup[1] + extra) + tup[2:]
            if (tup[1] & 0xff) in (Table, SubTable):
                table = tup[2]
                if isinstance(table, type('')):
                    self.process(tup[2], todo, working, done, handle_map)
                    tup = tup[:2] + (done[tup[2]],) + tup[3:]
            new_tuples.append(tup)
        done[current] = tuple(new_tuples)
        working[current] = 0
        
    pt_name = (
        (None, IsInSet, namestart_set),
        (None, AllInSet, name_set, +1),
        )

    pt_charref = (
        # Required character entity ref start
        (None, Is, '&'),
    
        # Try numeric
        (None, Is, '#', +4),
        # Is numeric, try Hex
        (None, IsIn, 'xX', +2),
        ('hex', AllInSet, hexadecimal_set, +3, +4),
        # Is numeric, try Decimal
        ('digits', AllInSet, number_set, +2, +3),

        # Try symbolic
        ('name', Table, 'pt_name', +1, +2),

        # Note an error, but don't fail to match
        ('error', Skip, 0),

        # Optional ';' at the end
        (None, Is, ';', +1, MatchOk),
        # If not explicitly terminated, reject name chars
        ('error', IsInSet, name_set, +1),
        )

    pt_attrvalue1 = (
        # Attribute value in double quotes
        ('quote', Is, '"'),
        # Stop at '&' or end of string
        (None, AllNotIn, '&"', +1),
        (None, Is + LookAhead, '&', +2),
        # Handle an entity ref
        ('charref', Table, 'pt_charref', -2, -2),
        (None, Is, '"'),
        )

    pt_attrvalue2 = (
        # Attribute value in single quotes
        ('quote', Is, "'"),
        # Stop at '&' or end of string
        (None, AllNotIn, "&'", +1),
        (None, Is + LookAhead, '&', +2),
        # Handle an entity ref
        ('charref', Table, 'pt_charref', -2, -2),
        (None, Is, "'"),
        )

    pt_attrvalue3 = (
        # Bare attribute value
        ('quote', Skip, 0),
        # Stop at '&' or end of string
        (None, AllIn, alphanumeric + '-./:;+*%?!$()_#=~'),
        (None, Is + LookAhead, '&', MatchOk),
        # Handle an entity ref
        ('charref', Table, 'pt_charref', -2, -2),
        )

    pt_attribute1 = (
        ('name', Table, 'pt_name'),
        opt_ws,
        (None, Is, '='),
        opt_ws,
        # Try double, then single quoted value
        ('value', Table, 'pt_attrvalue1', +1, MatchOk),
        ('value', Table, 'pt_attrvalue2', +1, MatchOk),
        ('value', Table, 'pt_attrvalue3'),
        )

    pt_attribute2 = (
        ('name', Table, 'pt_name'),
        )

    pt_attributes = (
        opt_ws, #should be req_ws, or complain if ws is missing
        # Try attribute with value first, then plain name
        ('attr', Table, 'pt_attribute1', +1, +2),
        ('attr', Table, 'pt_attribute2'),
        # Optional additional attributes
        (None, SubTable, ThisTable, MatchOk),
        (None, Jump, To, -1),
        )

    pt_open_tag = (
        ('name', Table, 'pt_name'),
        ('attrs', Table, 'pt_attributes', +1),
        opt_ws,
        # Empty tag?
        ('empty', Is, '/', +1),
        # Close it.
        (None, Is, '>')
        )

    pt_close_tag = (
        # Required close tag start
        (None, Is, '/'),
        ('name', Table, 'pt_name'),
        opt_ws,
        (None, Is, '>'),
        )

    pt_comment = (
        # Required comment start
        (None, Word, '!--'),
        (None, AllNotIn, '-', +1),
        (None, Is, '-'),
        # After a hyphen, require a non-hyphen or the end
        (None, IsNot, '-', +1, -2),
        (None, Word, '->'),
        )

    pt_directive = (
        # Required directive start
        (None, Is, '!'),
        ('type', AllInSet, A2Z_set),
        # Bogus and incomplete!
        (None, WordEnd, '>'),
        )

    pt_PI = (
        (None, Is, '?'),
        ('target', Table, 'pt_name'),
        (None, WordEnd, '?>'),
        )

    pt_tag = (
        # Required tag start
        (None, Is, '<'),
        # Try open tag
        ('open', Table, 'pt_open_tag', +1, MatchOk),
        # Try close tag
        ('close', Table, 'pt_close_tag', +1, MatchOk),
        # Try comment
        ('comment', Table, 'pt_comment', +1, MatchOk),
        # Try directive
        ('directive', Table, 'pt_directive', +1, MatchOk),
        # Finally, try PI
        ('PI', Table, 'pt_PI'),
        )

    pt_text = (
        # grab plain text
        (None, AllNotIn, '&<', +1),
        # if we hit a tag, we're done.
        (None, Is + LookAhead, '<', +1, MatchOk),
        # We must have an '&', or be done.
        (None, Is + LookAhead, '&', MatchOk),
        ('charref', Table, 'pt_charref', -3, -3)
        )

    # Main tag for HTML fragment parsing.  The resulting parse structure
    # highlights entity references, and all kinds of tags.  Intervening
    # text spans can easily be inferred.

    pt_html = (
        # grab text up to '<' or eof
        (None, SubTable, 'pt_text', +1),
        # If eof, we're done
        (None, EOF, Here, +1, MatchOk),
        # grab a tag and loop, or fail
        ('tag', Table, 'pt_tag', MatchFail, -2),
        )


=== Added File Packages/HTMLStructure/Printer.py ===
from bisect import bisect
from cgi import escape

def printHTML(parsedtext, span_marks):
    out = []
    output = out.append
    linelist = parsedtext.linemap

    #linemap = {}
    #for sm in span_marks:
    #    bline = bisect(linelist, sm[0])
    #    eline = bisect(linelist, sm[1])
    #    if sm[1] == linemap[eline]:
    #        eline -= 1
    #    linemap.setdefault(bline, [])
    #    linemap[bline].append(sm)
    #    if bline != eline:
    #        linemap.setdefault(eline, [])
    #        linemap[eline].append(sm)

    output('<pre class="source">')
    text = parsedtext.text
    lend = 0
    for n in range(len(linelist)):
        output('<a name="%s" class="linenumber">%s</a>' % (n + 1, n + 1))
        lbegin = lend
        lend = linelist[n]
        output(escape(text[lbegin:lend]))
    output('</pre>')
    return ''.join(out)


=== Added File Packages/HTMLStructure/Validator.py ===
from bisect import bisect

class Validator:
    def __init__(self, parsedText):
        self.pt = parsedText
        self.is_empty = IS_EMPTY_HTML_TAG
        self.impclose_enclosing_map = IMPCLOSE_ENCLOSING_MAP
        self.impclose_following_map = IMPCLOSE_FOLLOWING_MAP

    def entities(self):
        '''Check all character entity references'''
        all = []
        errors = []
        self._collect_entities(self.pt.parse[1], all)
        for entity in all:
            sub = entity[3]
            if sub[-1][0] == 'error' or sub[0][2] - sub[0][1] > 32:
                # Malformed or too long
                errors.append(entity)
        return all, errors

    def _collect_entities(self, elements, entities):
        for element in elements:
            sub = element[3]
            if element[0] == 'charref':
                entities.append(element)
            elif sub:
                self._collect_entities(sub, entities)


    def match_tags(self):
        '''Attempt to pair up all tags properly

        We scan the tags backwards, accumulating a stack of closing
        tags and matching them with opening tags.  When a pair of tags
        matches exactly, the 2-tuple (open, close) of the parse
        indexes of the matched tags are added to the "matched" list.

        The parse indexes of unmatched tags are returned in the
        "unclosed" and "unopened" lists.
        '''
        
        # We will return lists of matched and unmatched tags
        matched = []
        unclosed = []
        unopened = []
        
        parse = self.pt.parse[1]
        tagName = self.pt.tagName
        unop_names = []

        # Scan non-empty tags (in reverse, as collect_tags returns them)
        for (i, ttype) in self.collect_tags(empty=0):
            tname = tagName(parse[i])
            if ttype == 'close':
                unopened.append(i)
                unop_names.append(tname)
            elif unopened and unop_names[-1] == tname:
                matched.append((i, unopened.pop()))
                unop_names.pop()
            else:
                unclosed.append(i)
        # Return the lists in forward order
        matched.reverse()
        unopened.reverse()
        unclosed.reverse()
        return matched, unclosed, unopened

    def collect_tags(self, opening=1, closing=1, empty=1):
        '''Return a list of tags (in reverse order).

        Elements of the list are (parse index, tag type) pairs.
        Arguments to the method indicate which types of tag to include.
        '''
        parse = self.pt.parse[1]
        tagName = self.pt.tagName
        tagType = self.pt.tagType
        is_empty = self.is_empty
        octags = []
        # Scan backwards
        i = len(parse)
        while i:
            i = i - 1
            ttype = tagType(parse[i], is_empty)
            # Only bother with open/close/empty tags
            if ttype is None:
                continue
            # Skip tag types that aren't asked for
            if ((opening and ttype == 'open') or
                (closing and ttype == 'close') or
                (empty and ttype == 'empty')):
                octags.append((i, ttype))
        return octags

    def implicit_match_tags(self, matched, unclosed):
        '''Try to find implicit closing tags for unmatched opening tags.

        This should be called with the output of the "match_tags"
        method.  For each unmatched opening tag that can be implicitly
        closed, we scan forward for a tag capable of closing it.

        An implicitly closable tag can only be closed if it has and
        enclosing tag that is 'willing', or there is a subsequent tag
        at the same nesting level 

        '''
        
        # Save effort
        if not unclosed:
            return (), ()
        parse = self.pt.parse[1]
        tagName = self.pt.tagName
        impclose_following_map = self.impclose_following_map
        impclose_enclosing_map = self.impclose_enclosing_map

        new_matched = []
        still_unclosed = []
        for pidx in unclosed:
            tag = parse[pidx]
            tname = tagName(tag)
            enc_closers = impclose_enclosing_map.get(tname)
            if enc_closers is None:
                still_unclosed.append(pidx)
            else:
                tag_iter = Parallel_advance(matched, unclosed, pidx)
                closers = impclose_following_map[tname]
                while 1:
                    idx = tag_iter.next()
                    if idx is None: break
                    ftname = tagName(parse[idx])
                    if ftname in closers:
                        break
                if idx is None:
                    # We didn't find a following closer
                    enc_tag = tag_iter.enc_tag
                    if enc_tag is None:
                        if "" in enc_closers:
                            # Implicit close at EOF
                            idx = -1
                    else:
                        etname = tagName(parse[enc_tag[0]])
                        if etname in enc_closers or "" in enc_closers:
                            # Implicit close at end of enclosing tag
                            idx = enc_tag[1]
                if idx is None:
                    still_unclosed.append(pidx)
                else:
                    new_matched.append((pidx, idx))
        return new_matched, still_unclosed

class Parallel_advance:
    stop = 0
    enc_tag = None
    def __init__(self, matched, unclosed, pidx):
        #import pdb; pdb.set_trace()
        self.matched = matched
        self.unclosed = unclosed
        mpos = bisect(matched, (pidx,))
        while mpos and matched[mpos - 1][1] < pidx:
            mpos -= 1
        if mpos:
            self.enc_tag = matched[mpos - 1]
            self.stop = self.enc_tag[1]
        self.scan = pidx + 1
    def next_unclosed(self):
        unclosed = self.unclosed
        pos = bisect(unclosed, self.scan)
        if pos and unclosed[pos - 1] == self.scan:
            pos -= 1
        if pos < len(unclosed):
            idx = unclosed[pos]
            if not self.stop or idx < self.stop:
                return idx
    def next_matched(self):
        matched = self.matched
        pos = bisect(matched, (self.scan,))
        if pos < len(matched):
            tag = matched[pos]
            if not self.stop or tag[0] < self.stop:
                return tag
    def next(self):
        uidx = self.next_unclosed()
        mtag = self.next_matched()
        if uidx is None and mtag is None:
            return
        if uidx is None or (mtag is not None and mtag[0] < uidx):
            self.scan = mtag[1] + 1
            return mtag[0]
        elif mtag is None or uidx < mtag[0]:
            self.scan = uidx + 1
            return uidx

def nest_tags(matched):
    '''Create a nested tag data structure from a list of pairs.'''
    tags = []
    stack = []
    for span in matched:
        tag = [span]
        # Pop tags off of the stack that we're to the right of.
        while stack:
            top = stack[-1]
            if span[0] > top[0][1]:
                stack.pop()
            else:
                top.append(tag)
                break
        else:
            tags.append(tag)
        stack.append(tag)
    return tags

def listTest(alist):
    m = {}
    for item in alist:
        m[item] = 1
    return m.has_key

EMPTY_HTML_TAGS = [
    # List of HTML tags with an empty content model; these are
    # rendered in minimized form, e.g. <img />.
    # From http://www.w3.org/TR/xhtml1/#dtds
    "base", "meta", "link", "hr", "br", "param", "img", "area",
    "input", "col", "basefont", "isindex", "frame",
    ]
IS_EMPTY_HTML_TAG = listTest(EMPTY_HTML_TAGS)

PARA_LEVEL_HTML_TAGS = [
    # List of HTML elements that close open paragraph-level elements
    # and are themselves paragraph-level.
    "h1", "h2", "h3", "h4", "h5", "h6", "p",
    ]

BLOCK_LEVEL_HTML_TAGS = [
    # List of HTML tags that denote larger sections than paragraphs.
    "blockquote", "table", "tr", "th", "td", "thead", "tfoot", "tbody",
    "noframe", "ul", "ol", "li", "dl", "dt", "dd", "div",
    ]

IS_PARA_OR_BLOCK_LEVEL = listTest(PARA_LEVEL_HTML_TAGS + BLOCK_LEVEL_HTML_TAGS)

IMPCLOSE_ENCLOSING_MAP = {
    # Mapping of implicitly closable HTML elements to enclosing tags
    # that are 'willing' to close them.
    "p": BLOCK_LEVEL_HTML_TAGS + [""],
    "tr": ("table", "thead", "tfoot", "tbody"),
    "td": ("tr",),
    "th": ("tr",),
    "li": ("ul", "ol"),
    "dd": ("dl",),
    "dt": ("dl",),
    }

IMPCLOSE_FOLLOWING_MAP = {
    "p": PARA_LEVEL_HTML_TAGS + BLOCK_LEVEL_HTML_TAGS + [""],
    "tr": ("tr", "thead", "tfoot", "tbody"),
    "td": ("td", "th"),
    "th": ("td", "th"),
    "li": ("li",),
    "dd": ("dd", "dt"),
    "dt": ("dd", "dt"),
    }



=== Added File Packages/HTMLStructure/__init__.py ===
##############################################################################
#
# Copyright (c) 2001 Zope Corporation and Contributors. All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.0 (ZPL).  A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE
#
##############################################################################

'''HTMLStructure package

Contains utilities for parsing, validating, and reformatting HTML
text.  These use mxTextTools for parsing, and follow its model of
tagged text spans.  Unlike the standard HTMLParser, which is
event-driven, this package is data structure-oriented.
'''