[Zope-Checkins] CVS: Zope/lib/python/third_party/docutils/docutils/parsers/rst - __init__.py:1.1.4.1 roles.py:1.1.4.1 states.py:1.1.4.1 tableparser.py:1.1.4.1

Andreas Jung andreas at andreas-jung.com
Fri Oct 29 15:08:23 EDT 2004


Update of /cvs-repository/Zope/lib/python/third_party/docutils/docutils/parsers/rst
In directory cvs.zope.org:/tmp/cvs-serv23727/lib/python/third_party/docutils/docutils/parsers/rst

Added Files:
      Tag: Zope-2_7-branch
	__init__.py roles.py states.py tableparser.py 
Log Message:
moved docutils to lib/python/third_party


=== Added File Zope/lib/python/third_party/docutils/docutils/parsers/rst/__init__.py ===
# Author: David Goodger
# Contact: goodger at users.sourceforge.net
# Revision: $Revision: 1.1.4.1 $
# Date: $Date: 2004/10/29 19:08:22 $
# Copyright: This module has been placed in the public domain.

"""
This is the ``docutils.parsers.rst`` package. It exports a single class,
`Parser`, the reStructuredText parser.


Usage
=====

1. Create a parser::

       parser = docutils.parsers.rst.Parser()

   Several optional arguments may be passed to modify the parser's behavior.
   Please see `Customizing the Parser`_ below for details.

2. Gather input (a multi-line string) by reading a file or the standard
   input::

       input = sys.stdin.read()

3. Create a new empty `docutils.nodes.document` tree::

       document = docutils.utils.new_document(source, settings)

   See `docutils.utils.new_document()` for parameter details.

4. Run the parser, populating the document tree::

       parser.parse(input, document)
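
Putting the steps together, a minimal end-to-end sketch (the settings
object below is assumed to come from `docutils.frontend.OptionParser`;
the source name '<stdin>' is only illustrative)::

    import sys

    import docutils.frontend
    import docutils.parsers.rst
    import docutils.utils

    parser = docutils.parsers.rst.Parser()
    settings = docutils.frontend.OptionParser(
        components=(docutils.parsers.rst.Parser,)).get_default_values()
    input = sys.stdin.read()
    document = docutils.utils.new_document('<stdin>', settings)
    parser.parse(input, document)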


Parser Overview
===============

The reStructuredText parser is implemented as a state machine, examining its
input one line at a time. To understand how the parser works, please first
become familiar with the `docutils.statemachine` module, then see the
`states` module.


Customizing the Parser
----------------------

Anything that isn't already customizable is that way simply because that type
of customizability hasn't been implemented yet.  Patches welcome!

When instantiating an object of the `Parser` class, two parameters may be
passed: ``rfc2822`` and ``inliner``.  Pass ``rfc2822=1`` to enable an initial
RFC-2822 style header block, parsed as a "field_list" element (with "class"
attribute set to "rfc2822").  Currently this is the only body-level element
which is customizable without subclassing.  (Tip: subclass `Parser` and change
its "state_classes" and "initial_state" attributes to refer to new classes.
Contact the author if you need more details.)

The ``inliner`` parameter takes an instance of `states.Inliner` or a subclass.
It handles inline markup recognition.  A common extension is the addition of
further implicit hyperlinks, like "RFC 2822".  This can be done by subclassing
`states.Inliner`, adding a new method for the implicit markup, and adding a
``(pattern, method)`` pair to the "implicit_dispatch" attribute of the
subclass.  See `states.Inliner.implicit_inline()` for details.  Explicit
inline markup can be customized in a `states.Inliner` subclass via the
``patterns.initial`` and ``dispatch`` attributes (and new methods as
appropriate).
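
For illustration only (the class, method, pattern, and tracker URL below
are invented, not part of Docutils), a `states.Inliner` subclass that
turns bare ticket references such as "#1234" into hyperlinks might look
like this::

    import re

    from docutils import nodes
    from docutils.parsers.rst import Parser, states

    class TicketInliner(states.Inliner):

        ticket_pattern = re.compile(r'#(?P<number>[0-9]+)')

        def __init__(self):
            states.Inliner.__init__(self)
            # Register the additional implicit-markup recognizer:
            self.implicit_dispatch.append(
                (self.ticket_pattern, self.ticket_reference))

        def ticket_reference(self, match, lineno):
            uri = 'http://tracker.example.org/' + match.group('number')
            return [nodes.reference(match.group(0), match.group(0),
                                    refuri=uri)]

    parser = Parser(inliner=TicketInliner())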
"""

__docformat__ = 'reStructuredText'


import docutils.parsers
import docutils.statemachine
from docutils.parsers.rst import states
from docutils import frontend


class Parser(docutils.parsers.Parser):

    """The reStructuredText parser."""

    supported = ('restructuredtext', 'rst', 'rest', 'restx', 'rtxt', 'rstx')
    """Aliases this parser supports."""

    settings_spec = (
        'reStructuredText Parser Options',
        None,
        (('Recognize and link to standalone PEP references (like "PEP 258").',
          ['--pep-references'],
          {'action': 'store_true', 'validator': frontend.validate_boolean}),
         ('Base URL for PEP references '
          '(default "http://www.python.org/peps/").',
          ['--pep-base-url'],
          {'metavar': '<URL>', 'default': 'http://www.python.org/peps/',
           'validator': frontend.validate_url_trailing_slash}),
         ('Recognize and link to standalone RFC references (like "RFC 822").',
          ['--rfc-references'],
          {'action': 'store_true', 'validator': frontend.validate_boolean}),
         ('Base URL for RFC references (default "http://www.faqs.org/rfcs/").',
          ['--rfc-base-url'],
          {'metavar': '<URL>', 'default': 'http://www.faqs.org/rfcs/',
           'validator': frontend.validate_url_trailing_slash}),
         ('Set number of spaces for tab expansion (default 8).',
          ['--tab-width'],
          {'metavar': '<width>', 'type': 'int', 'default': 8}),
         ('Remove spaces before footnote references.',
          ['--trim-footnote-reference-space'],
          {'action': 'store_true', 'validator': frontend.validate_boolean}),))

    config_section = 'restructuredtext parser'
    config_section_dependencies = ('parsers',)

    def __init__(self, rfc2822=None, inliner=None):
        if rfc2822:
            self.initial_state = 'RFC2822Body'
        else:
            self.initial_state = 'Body'
        self.state_classes = states.state_classes
        self.inliner = inliner

    def parse(self, inputstring, document):
        """Parse `inputstring` and populate `document`, a document tree."""
        self.setup_parse(inputstring, document)
        debug = document.reporter[''].debug
        self.statemachine = states.RSTStateMachine(
              state_classes=self.state_classes,
              initial_state=self.initial_state,
              debug=debug)
        inputlines = docutils.statemachine.string2lines(
              inputstring, tab_width=document.settings.tab_width,
              convert_whitespace=1)
        self.statemachine.run(inputlines, document, inliner=self.inliner)
        self.finish_parse()


=== Added File Zope/lib/python/third_party/docutils/docutils/parsers/rst/roles.py ===
# Author: Edward Loper
# Contact: edloper at gradient.cis.upenn.edu
# Revision: $Revision: 1.1.4.1 $
# Date: $Date: 2004/10/29 19:08:22 $
# Copyright: This module has been placed in the public domain.

"""
This module defines standard interpreted text role functions, a registry for
interpreted text roles, and an API for adding to and retrieving from the
registry.

The interface for interpreted role functions is as follows::

    def role_fn(name, rawtext, text, lineno, inliner,
                options={}, content=[]):
        code...

    # Set function attributes for customization:
    role_fn.options = ...
    role_fn.content = ...

Parameters:

- ``name`` is the local name of the interpreted text role, the role name
  actually used in the document.

- ``rawtext`` is a string containing the entire interpreted text construct.
  Return it as a ``problematic`` node linked to a system message if there is a
  problem.

- ``text`` is the interpreted text content.

- ``lineno`` is the line number where the interpreted text begins.

- ``inliner`` is the Inliner object that called the role function.
  It defines the following useful attributes: ``reporter``,
  ``problematic``, ``memo``, ``parent``, ``document``.

- ``options``: A dictionary of directive options for customization, to be
  interpreted by the role function.  Used for additional attributes for the
  generated elements and other functionality.

- ``content``: A list of strings, the directive content for customization
  ("role" directive).  To be interpreted by the role function.

Function attributes for customization, interpreted by the "role" directive:

- ``options``: A dictionary, mapping known option names to conversion
  functions such as `int` or `float`.  ``None`` or an empty dict implies no
  options to parse.  Several directive option conversion functions are defined
  in the `directives` module.

  All role functions implicitly support the "class" option, unless disabled
  with an explicit ``{'class': None}``.

- ``content``: A boolean; true if content is allowed.  Client code must handle
  the case where content is required but not supplied (an empty content list
  will be supplied).

Note that unlike directives, the "arguments" function attribute is not
supported for role customization.  Directive arguments are handled by the
"role" directive itself.

Interpreted role functions return a tuple of two values:

- A list of nodes which will be inserted into the document tree at the
  point where the interpreted role was encountered (can be an empty
  list).

- A list of system messages, which will be inserted into the document tree
  immediately after the end of the current inline block (can also be empty).
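
A minimal example tying this together (the role name and the choice of
node are arbitrary; the registration functions are defined later in this
module)::

    from docutils import nodes
    from docutils.parsers.rst import roles

    def example_role(name, rawtext, text, lineno, inliner,
                     options={}, content=[]):
        # Wrap the interpreted text in an "emphasis" node.
        return [nodes.emphasis(rawtext, text, **options)], []

    example_role.options = None    # only the implicit "class" option
    example_role.content = None    # no content allowed

    roles.register_canonical_role('example', example_role)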
"""

__docformat__ = 'reStructuredText'

from docutils import nodes
from docutils.parsers.rst import directives
from docutils.parsers.rst.languages import en as _fallback_language_module

DEFAULT_INTERPRETED_ROLE = 'title-reference'
"""
The canonical name of the default interpreted role.  This role is used
when no role is specified for a piece of interpreted text.
"""

_role_registry = {}
"""Mapping of canonical role names to role functions.  Language-dependent role
names are defined in the ``language`` subpackage."""

_roles = {}
"""Mapping of local or language-dependent interpreted text role names to role
functions."""

def role(role_name, language_module, lineno, reporter):
    """
    Locate and return a role function from its language-dependent name, along
    with a list of system messages.  If the role is not found in the current
    language, check English.  Return a 2-tuple: role function (``None`` if the
    named role cannot be found) and a list of system messages.
    """
    normname = role_name.lower()
    messages = []
    msg_text = []

    if _roles.has_key(normname):
        return _roles[normname], messages

    if role_name:
        canonicalname = None
        try:
            canonicalname = language_module.roles[normname]
        except AttributeError, error:
            msg_text.append('Problem retrieving role entry from language '
                            'module %r: %s.' % (language_module, error))
        except KeyError:
            msg_text.append('No role entry for "%s" in module "%s".'
                            % (role_name, language_module.__name__))
    else:
        canonicalname = DEFAULT_INTERPRETED_ROLE

    # If we didn't find it, try English as a fallback.
    if not canonicalname:
        try:
            canonicalname = _fallback_language_module.roles[normname]
            msg_text.append('Using English fallback for role "%s".'
                            % role_name)
        except KeyError:
            msg_text.append('Trying "%s" as canonical role name.'
                            % role_name)
            # The canonical name should be an English name, but just in case:
            canonicalname = normname

    # Collect any messages that we generated.
    if msg_text:
        message = reporter.info('\n'.join(msg_text), line=lineno)
        messages.append(message)

    # Look the role up in the registry, and return it.
    if _role_registry.has_key(canonicalname):
        role_fn = _role_registry[canonicalname]
        register_local_role(normname, role_fn)
        return role_fn, messages
    else:
        return None, messages # Error message will be generated by caller.

def register_canonical_role(name, role_fn):
    """
    Register an interpreted text role by its canonical name.

    :Parameters:
      - `name`: The canonical name of the interpreted role.
      - `role_fn`: The role function.  See the module docstring.
    """
    set_implicit_options(role_fn)
    _role_registry[name] = role_fn

def register_local_role(name, role_fn):
    """
    Register an interpreted text role by its local or language-dependent name.

    :Parameters:
      - `name`: The local or language-dependent name of the interpreted role.
      - `role_fn`: The role function.  See the module docstring.
    """
    set_implicit_options(role_fn)
    _roles[name] = role_fn

def set_implicit_options(role_fn):
    """
    Add customization options to role functions, unless explicitly set or
    disabled.
    """
    if not hasattr(role_fn, 'options') or role_fn.options is None:
        role_fn.options = {'class': directives.class_option}
    elif not role_fn.options.has_key('class'):
        role_fn.options['class'] = directives.class_option    

def register_generic_role(canonical_name, node_class):
    """For roles which simply wrap a given `node_class` around the text."""
    role = GenericRole(canonical_name, node_class)
    register_canonical_role(canonical_name, role)


class GenericRole:

    """
    Generic interpreted text role, where the interpreted text is simply
    wrapped with the provided node class.
    """

    def __init__(self, role_name, node_class):
        self.name = role_name
        self.node_class = node_class

    def __call__(self, role, rawtext, text, lineno, inliner,
                 options={}, content=[]):
        return [self.node_class(rawtext, text, **options)], []


class CustomRole:

    """
    Wrapper for custom interpreted text roles.
    """

    def __init__(self, role_name, base_role, options={}, content=[]):
        self.name = role_name
        self.base_role = base_role
        self.options = None
        if hasattr(base_role, 'options'):
            self.options = base_role.options
        self.content = None
        if hasattr(base_role, 'content'):
            self.content = base_role.content
        self.supplied_options = options
        self.supplied_content = content

    def __call__(self, role, rawtext, text, lineno, inliner,
                 options={}, content=[]):
        opts = self.supplied_options.copy()
        opts.update(options)
        cont = list(self.supplied_content)
        if cont and content:
            cont += '\n'
        cont.extend(content)
        return self.base_role(role, rawtext, text, lineno, inliner,
                              options=opts, content=cont)


def generic_custom_role(role, rawtext, text, lineno, inliner,
                        options={}, content=[]):
    """"""
    # Once nested inline markup is implemented, this and other methods should
    # recursively call inliner.nested_parse().
    return [nodes.inline(rawtext, text, **options)], []

generic_custom_role.options = {'class': directives.class_option}


######################################################################
# Define and register the standard roles:
######################################################################

register_generic_role('abbreviation', nodes.abbreviation)
register_generic_role('acronym', nodes.acronym)
register_generic_role('emphasis', nodes.emphasis)
register_generic_role('literal', nodes.literal)
register_generic_role('strong', nodes.strong)
register_generic_role('subscript', nodes.subscript)
register_generic_role('superscript', nodes.superscript)
register_generic_role('title-reference', nodes.title_reference)

def pep_reference_role(role, rawtext, text, lineno, inliner,
                       options={}, content=[]):
    try:
        pepnum = int(text)
        if pepnum < 0 or pepnum > 9999:
            raise ValueError
    except ValueError:
        msg = inliner.reporter.error(
            'PEP number must be a number from 0 to 9999; "%s" is invalid.'
            % text, line=lineno)
        prb = inliner.problematic(rawtext, rawtext, msg)
        return [prb], [msg]
    # Base URL mainly used by inliner.pep_reference; so this is correct:
    ref = inliner.document.settings.pep_base_url + inliner.pep_url % pepnum
    return [nodes.reference(rawtext, 'PEP ' + text, refuri=ref, **options)], []

register_canonical_role('pep-reference', pep_reference_role)

def rfc_reference_role(role, rawtext, text, lineno, inliner,
                       options={}, content=[]):
    try:
        rfcnum = int(text)
        if rfcnum <= 0:
            raise ValueError
    except ValueError:
        msg = inliner.reporter.error(
            'RFC number must be a number greater than or equal to 1; '
            '"%s" is invalid.' % text, line=lineno)
        prb = inliner.problematic(rawtext, rawtext, msg)
        return [prb], [msg]
    # Base URL mainly used by inliner.rfc_reference, so this is correct:
    ref = inliner.document.settings.rfc_base_url + inliner.rfc_url % rfcnum
    node = nodes.reference(rawtext, 'RFC ' + text, refuri=ref, **options)
    return [node], []

register_canonical_role('rfc-reference', rfc_reference_role)


######################################################################
# Register roles that are currently unimplemented.
######################################################################

def unimplemented_role(role, rawtext, text, lineno, inliner, attributes={}):
    msg = inliner.reporter.error(
        'Interpreted text role "%s" not implemented.' % role, line=lineno)
    prb = inliner.problematic(rawtext, rawtext, msg)
    return [prb], [msg]

register_canonical_role('index', unimplemented_role)
register_canonical_role('named-reference', unimplemented_role)
register_canonical_role('anonymous-reference', unimplemented_role)
register_canonical_role('uri-reference', unimplemented_role)
register_canonical_role('footnote-reference', unimplemented_role)
register_canonical_role('citation-reference', unimplemented_role)
register_canonical_role('substitution-reference', unimplemented_role)
register_canonical_role('target', unimplemented_role)

# This should remain unimplemented, for testing purposes:
register_canonical_role('restructuredtext-unimplemented-role',
                        unimplemented_role)


=== Added File Zope/lib/python/third_party/docutils/docutils/parsers/rst/states.py === (2484/2884 lines abridged)
# Author: David Goodger
# Contact: goodger at users.sourceforge.net
# Revision: $Revision: 1.1.4.1 $
# Date: $Date: 2004/10/29 19:08:22 $
# Copyright: This module has been placed in the public domain.

"""
This is the ``docutils.parsers.rst.states`` module, the core of
the reStructuredText parser.  It defines the following:

:Classes:
    - `RSTStateMachine`: reStructuredText parser's entry point.
    - `NestedStateMachine`: recursive StateMachine.
    - `RSTState`: reStructuredText State superclass.
    - `Inliner`: For parsing inline markup.
    - `Body`: Generic classifier of the first line of a block.
    - `SpecializedBody`: Superclass for compound element members.
    - `BulletList`: Second and subsequent bullet_list list_items.
    - `DefinitionList`: Second+ definition_list_items.
    - `EnumeratedList`: Second+ enumerated_list list_items.
    - `FieldList`: Second+ fields.
    - `OptionList`: Second+ option_list_items.
    - `RFC2822List`: Second+ RFC2822-style fields.
    - `ExtensionOptions`: Parses directive option fields.
    - `Explicit`: Second+ explicit markup constructs.
    - `SubstitutionDef`: For embedded directives in substitution definitions.
    - `Text`: Classifier of second line of a text block.
    - `SpecializedText`: Superclass for continuation lines of Text-variants.
    - `Definition`: Second line of potential definition_list_item.
    - `Line`: Second line of overlined section title or transition marker.
    - `Struct`: An auxiliary collection class.

:Exception classes:
    - `MarkupError`
    - `ParserError`
    - `MarkupMismatch`

:Functions:
    - `escape2null()`: Return a string, escape-backslashes converted to nulls.
    - `unescape()`: Return a string, nulls removed or restored to backslashes.

:Attributes:
    - `state_classes`: set of State classes used with `RSTStateMachine`.

Parser Overview
===============

The reStructuredText parser is implemented as a recursive state machine,
examining its input one line at a time.  To understand how the parser works,
please first become familiar with the `docutils.statemachine` module.  In the
description below, references are made to classes defined in this module;
please see the individual classes for details.

Parsing proceeds as follows:

1. The state machine examines each line of input, checking each of the
   transition patterns of the state `Body`, in order, looking for a match.
   The implicit transitions (blank lines and indentation) are checked before
   any others.  The 'text' transition is a catch-all (matches anything).

2. The method associated with the matched transition pattern is called.

   A. Some transition methods are self-contained, appending elements to the
      document tree (`Body.doctest` parses a doctest block).  The parser's
      current line index is advanced to the end of the element, and parsing
      continues with step 1.

   B. Other transition methods trigger the creation of a nested state machine,
      whose job is to parse a compound construct ('indent' does a block quote,
      'bullet' does a bullet list, 'overline' does a section [first checking
      for a valid section header], etc.).

      - In the case of lists and explicit markup, a one-off state machine is
        created and run to parse contents of the first item.

      - A new state machine is created and its initial state is set to the
        appropriate specialized state (`BulletList` in the case of the
        'bullet' transition; see `SpecializedBody` for more detail).  This
        state machine is run to parse the compound element (or series of
        explicit markup elements), and returns as soon as a non-member element
        is encountered.  For example, the `BulletList` state machine ends as
        soon as it encounters an element which is not a list item of that
        bullet list.  The optional omission of inter-element blank lines is
        enabled by this nested state machine.

      - The current line index is advanced to the end of the elements parsed,
        and parsing continues with step 1.

   C. The result of the 'text' transition depends on the next line of text.
      The current state is changed to `Text`, under which the second line is
      examined.  If the second line is:

      - Indented: The element is a definition list item, and parsing proceeds
        similarly to step 2.B, using the `DefinitionList` state.

      - A line of uniform punctuation characters: The element is a section
        header; again, parsing proceeds as in step 2.B, and `Body` is still
        used.

      - Anything else: The element is a paragraph, which is examined for
        inline markup and appended to the parent element.  Processing
        continues with step 1.
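
In code, the usual driver of this machinery is `Parser.parse()` in the
package's `__init__.py`.  A condensed sketch of that driver (assuming the
reStructuredText source text in ``rst_source`` and a ``document`` node
already created with `docutils.utils.new_document()`)::

    from docutils import statemachine
    from docutils.parsers.rst import states

    state_machine = states.RSTStateMachine(
        state_classes=states.state_classes,
        initial_state='Body',
        debug=0)
    lines = statemachine.string2lines(
        rst_source, tab_width=8, convert_whitespace=1)
    state_machine.run(lines, document)  # populates `document` in place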
"""

__docformat__ = 'reStructuredText'


import sys
import re
import roman
from types import TupleType
from docutils import nodes, statemachine, utils, urischemes
from docutils import ApplicationError, DataError
from docutils.statemachine import StateMachineWS, StateWS
from docutils.nodes import fully_normalize_name as normalize_name
from docutils.nodes import whitespace_normalize_name
from docutils.parsers.rst import directives, languages, tableparser, roles
from docutils.parsers.rst.languages import en as _fallback_language_module


class MarkupError(DataError): pass
class UnknownInterpretedRoleError(DataError): pass
class InterpretedRoleNotImplementedError(DataError): pass
class ParserError(ApplicationError): pass
class MarkupMismatch(Exception): pass


class Struct:

    """Stores data attributes for dotted-attribute access."""

    def __init__(self, **keywordargs):
        self.__dict__.update(keywordargs)


class RSTStateMachine(StateMachineWS):

    """
    reStructuredText's master StateMachine.

    The entry point to reStructuredText parsing is the `run()` method.
    """

    def run(self, input_lines, document, input_offset=0, match_titles=1,
            inliner=None):
        """
        Parse `input_lines`, populating the given `document` node in place.

        Extend `StateMachineWS.run()`: set up parse-global data, run the
        StateMachine, and check the resulting document.
        """
        self.language = languages.get_language(
            document.settings.language_code)
        self.match_titles = match_titles
        if inliner is None:
            inliner = Inliner()
        inliner.init_customizations(document.settings)
        self.memo = Struct(document=document,
                           reporter=document.reporter,
                           language=self.language,
                           title_styles=[],
                           section_level=0,
                           section_bubble_up_kludge=0,
                           inliner=inliner)
        self.document = document
        self.attach_observer(document.note_source)
        self.reporter = self.memo.reporter
        self.node = document
        results = StateMachineWS.run(self, input_lines, input_offset,
                                     input_source=document['source'])
        assert results == [], 'RSTStateMachine.run() results should be empty!'
        self.check_document()
        self.node = self.memo = None    # remove unneeded references

    def check_document(self):
        """Check for illegal structure: empty document."""
        if len(self.document) == 0:
            error = self.reporter.error(
                'Document empty; must have contents.', line=0)
            self.document += error


class NestedStateMachine(StateMachineWS):

    """
    StateMachine run from within other StateMachine runs, to parse nested
    document structures.
    """

    def run(self, input_lines, input_offset, memo, node, match_titles=1):
        """
        Parse `input_lines` and populate a `docutils.nodes.document` instance.

        Extend `StateMachineWS.run()`: set up document-wide data.
        """
        self.match_titles = match_titles
        self.memo = memo
        self.document = memo.document
        self.attach_observer(self.document.note_source)

[-=- -=- -=- 2484 lines omitted -=- -=- -=-]

        if len(self.parent) == 0:
            msg = self.reporter.error(
                  'Document or section may not begin with a transition.',
                  line=lineno)
            self.parent += msg
        elif isinstance(self.parent[-1], nodes.transition):
            msg = self.reporter.error(
                  'At least one body element must separate transitions; '
                  'adjacent transitions not allowed.',
                  line=lineno)
            self.parent += msg
        self.parent += transition
        return [], 'Body', []

    def text(self, match, context, next_state):
        """Potential over- & underlined title."""
        lineno = self.state_machine.abs_line_number() - 1
        overline = context[0]
        title = match.string
        underline = ''
        try:
            underline = self.state_machine.next_line()
        except EOFError:
            blocktext = overline + '\n' + title
            if len(overline.rstrip()) < 4:
                self.short_overline(context, blocktext, lineno, 2)
            else:
                msg = self.reporter.severe(
                    'Incomplete section title.',
                    nodes.literal_block(blocktext, blocktext), line=lineno)
                self.parent += msg
                return [], 'Body', []
        source = '%s\n%s\n%s' % (overline, title, underline)
        overline = overline.rstrip()
        underline = underline.rstrip()
        if not self.transitions['underline'][0].match(underline):
            blocktext = overline + '\n' + title + '\n' + underline
            if len(overline.rstrip()) < 4:
                self.short_overline(context, blocktext, lineno, 2)
            else:
                msg = self.reporter.severe(
                    'Missing matching underline for section title overline.',
                    nodes.literal_block(source, source), line=lineno)
                self.parent += msg
                return [], 'Body', []
        elif overline != underline:
            blocktext = overline + '\n' + title + '\n' + underline
            if len(overline.rstrip()) < 4:
                self.short_overline(context, blocktext, lineno, 2)
            else:
                msg = self.reporter.severe(
                      'Title overline & underline mismatch.',
                      nodes.literal_block(source, source), line=lineno)
                self.parent += msg
                return [], 'Body', []
        title = title.rstrip()
        messages = []
        if len(title) > len(overline):
            blocktext = overline + '\n' + title + '\n' + underline
            if len(overline.rstrip()) < 4:
                self.short_overline(context, blocktext, lineno, 2)
            else:
                msg = self.reporter.warning(
                      'Title overline too short.',
                      nodes.literal_block(source, source), line=lineno)
                messages.append(msg)
        style = (overline[0], underline[0])
        self.eofcheck = 0               # @@@ not sure this is correct
        self.section(title.lstrip(), source, style, lineno + 1, messages)
        self.eofcheck = 1
        return [], 'Body', []

    indent = text                       # indented title

    def underline(self, match, context, next_state):
        overline = context[0]
        blocktext = overline + '\n' + self.state_machine.line
        lineno = self.state_machine.abs_line_number() - 1
        if len(overline.rstrip()) < 4:
            self.short_overline(context, blocktext, lineno, 1)
        msg = self.reporter.error(
              'Invalid section title or transition marker.',
              nodes.literal_block(blocktext, blocktext), line=lineno)
        self.parent += msg
        return [], 'Body', []

    def short_overline(self, context, blocktext, lineno, lines=1):
        msg = self.reporter.info(
            'Possible incomplete section title.\nTreating the overline as '
            "ordinary text because it's so short.", line=lineno)
        self.parent += msg
        self.state_correction(context, lines)

    def state_correction(self, context, lines=1):
        self.state_machine.previous_line(lines)
        context[:] = []
        raise statemachine.StateCorrection('Body', 'text')


class QuotedLiteralBlock(RSTState):

    """
    Nested parse handler for quoted (unindented) literal blocks.

    Special-purpose.  Not for inclusion in `state_classes`.
    """

    patterns = {'initial_quoted': r'(%(nonalphanum7bit)s)' % Body.pats,
                'text': r''}
    initial_transitions = ('initial_quoted', 'text')

    def __init__(self, state_machine, debug=0):
        RSTState.__init__(self, state_machine, debug)
        self.messages = []
        self.initial_lineno = None

    def blank(self, match, context, next_state):
        if context:
            raise EOFError
        else:
            return context, next_state, []

    def eof(self, context):
        if context:
            text = '\n'.join(context)
            literal_block = nodes.literal_block(text, text)
            literal_block.line = self.initial_lineno
            self.parent += literal_block
        else:
            self.parent += self.reporter.warning(
                'Literal block expected; none found.',
                line=self.state_machine.abs_line_number())
            self.state_machine.previous_line()
        self.parent += self.messages
        return []

    def indent(self, match, context, next_state):
        assert context, ('QuotedLiteralBlock.indent: context should not '
                         'be empty!')
        self.messages.append(
            self.reporter.error('Unexpected indentation.',
                                line=self.state_machine.abs_line_number()))
        self.state_machine.previous_line()
        raise EOFError

    def initial_quoted(self, match, context, next_state):
        """Match arbitrary quote character on the first line only."""
        self.remove_transition('initial_quoted')
        quote = match.string[0]
        pattern = re.compile(re.escape(quote))
        # New transition matches consistent quotes only:
        self.add_transition('quoted',
                            (pattern, self.quoted, self.__class__.__name__))
        self.initial_lineno = self.state_machine.abs_line_number()
        return [match.string], next_state, []

    def quoted(self, match, context, next_state):
        """Match consistent quotes on subsequent lines."""
        context.append(match.string)
        return context, next_state, []

    def text(self, match, context, next_state):
        if context:
            self.messages.append(
                self.reporter.error('Inconsistent literal block quoting.',
                                    line=self.state_machine.abs_line_number()))
            self.state_machine.previous_line()
        raise EOFError


state_classes = (Body, BulletList, DefinitionList, EnumeratedList, FieldList,
                 OptionList, ExtensionOptions, Explicit, Text, Definition,
                 Line, SubstitutionDef, RFC2822Body, RFC2822List)
"""Standard set of State classes used to start `RSTStateMachine`."""


def escape2null(text):
    """Return a string with escape-backslashes converted to nulls."""
    parts = []
    start = 0
    while 1:
        found = text.find('\\', start)
        if found == -1:
            parts.append(text[start:])
            return ''.join(parts)
        parts.append(text[start:found])
        parts.append('\x00' + text[found+1:found+2])
        start = found + 2               # skip character after escape

def unescape(text, restore_backslashes=0):
    """
    Return a string with nulls removed or restored to backslashes.
    Backslash-escaped spaces are also removed.
    """
    if restore_backslashes:
        return text.replace('\x00', '\\')
    else:
        for sep in ['\x00 ', '\x00\n', '\x00']:
            text = ''.join(text.split(sep))
        return text


=== Added File Zope/lib/python/third_party/docutils/docutils/parsers/rst/tableparser.py ===
# Author: David Goodger
# Contact: goodger at users.sourceforge.net
# Revision: $Revision: 1.1.4.1 $
# Date: $Date: 2004/10/29 19:08:22 $
# Copyright: This module has been placed in the public domain.

"""
This module defines table parser classes, which parse plaintext-graphic tables
and produce a well-formed data structure suitable for building a CALS table.

:Classes:
    - `GridTableParser`: Parse fully-formed tables represented with a grid.
    - `SimpleTableParser`: Parse simple tables, delimited by top & bottom
      borders.

:Exception class: `TableMarkupError`

:Function:
    `update_dict_of_lists()`: Merge two dictionaries containing list values.
"""

__docformat__ = 'reStructuredText'


import re
import sys
from docutils import DataError


class TableMarkupError(DataError): pass


class TableParser:

    """
    Abstract superclass for the common parts of the syntax-specific parsers.
    """

    head_body_separator_pat = None
    """Matches the row separator between head rows and body rows."""

    def parse(self, block):
        """
        Analyze the text `block` and return a table data structure.

        Given a plaintext-graphic table in `block` (list of lines of text; no
        whitespace padding), parse the table, then construct and return the
        data necessary to build a CALS table or equivalent.

        Raise `TableMarkupError` if there is any problem with the markup.
        """
        self.setup(block)
        self.find_head_body_sep()
        self.parse_table()
        structure = self.structure_from_cells()
        return structure

    def find_head_body_sep(self):
        """Look for a head/body row separator line; store the line index."""
        for i in range(len(self.block)):
            line = self.block[i]
            if self.head_body_separator_pat.match(line):
                if self.head_body_sep:
                    raise TableMarkupError(
                        'Multiple head/body row separators in table (at line '
                        'offset %s and %s); only one allowed.'
                        % (self.head_body_sep, i))
                else:
                    self.head_body_sep = i
                    self.block[i] = line.replace('=', '-')
        if self.head_body_sep == 0 or self.head_body_sep == (len(self.block)
                                                             - 1):
            raise TableMarkupError('The head/body row separator may not be '
                                   'the first or last line of the table.')


class GridTableParser(TableParser):

    """
    Parse a grid table using `parse()`.

    Here's an example of a grid table::

        +------------------------+------------+----------+----------+
        | Header row, column 1   | Header 2   | Header 3 | Header 4 |
        +========================+============+==========+==========+
        | body row 1, column 1   | column 2   | column 3 | column 4 |
        +------------------------+------------+----------+----------+
        | body row 2             | Cells may span columns.          |
        +------------------------+------------+---------------------+
        | body row 3             | Cells may  | - Table cells       |
        +------------------------+ span rows. | - contain           |
        | body row 4             |            | - body elements.    |
        +------------------------+------------+---------------------+

    Intersections use '+', row separators use '-' (except for one optional
    head/body row separator, which uses '='), and column separators use '|'.

    Passing the above table to the `parse()` method will result in the
    following data structure::

        ([24, 12, 10, 10],
         [[(0, 0, 1, ['Header row, column 1']),
           (0, 0, 1, ['Header 2']),
           (0, 0, 1, ['Header 3']),
           (0, 0, 1, ['Header 4'])]],
         [[(0, 0, 3, ['body row 1, column 1']),
           (0, 0, 3, ['column 2']),
           (0, 0, 3, ['column 3']),
           (0, 0, 3, ['column 4'])],
          [(0, 0, 5, ['body row 2']),
           (0, 2, 5, ['Cells may span columns.']),
           None,
           None],
          [(0, 0, 7, ['body row 3']),
           (1, 0, 7, ['Cells may', 'span rows.', '']),
           (1, 1, 7, ['- Table cells', '- contain', '- body elements.']),
           None],
          [(0, 0, 9, ['body row 4']), None, None, None]])

    The first item is a list containing column widths (colspecs). The second
    item is a list of head rows, and the third is a list of body rows. Each
    row contains a list of cells. Each cell is either None (for a cell unused
    because of another cell's span), or a tuple. A cell tuple contains four
    items: the number of extra rows used by the cell in a vertical span
    (morerows); the number of extra columns used by the cell in a horizontal
    span (morecols); the line offset of the first line of the cell contents;
    and the cell contents, a list of lines of text.
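
    A minimal standalone driver (a sketch: in normal use `states.py` builds
    the input block; the lines are wrapped in a
    `docutils.statemachine.StringList` here because `parse()` relies on its
    `disconnect()` and `get_2D_block()` methods)::

        from docutils.statemachine import StringList
        from docutils.parsers.rst.tableparser import GridTableParser

        lines = ['+-----+-----+',
                 '| A   | B   |',
                 '+=====+=====+',
                 '| 1   | 2   |',
                 '+-----+-----+']
        colspecs, headrows, bodyrows = GridTableParser().parse(
            StringList(lines))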
    """

    head_body_separator_pat = re.compile(r'\+=[=+]+=\+ *$')

    def setup(self, block):
        self.block = block[:]           # make a copy; it may be modified
        self.block.disconnect()         # don't propagate changes to parent
        self.bottom = len(block) - 1
        self.right = len(block[0]) - 1
        self.head_body_sep = None
        self.done = [-1] * len(block[0])
        self.cells = []
        self.rowseps = {0: [0]}
        self.colseps = {0: [0]}

    def parse_table(self):
        """
        Start with a queue of upper-left corners, containing the upper-left
        corner of the table itself. Trace out one rectangular cell, remember
        it, and add its upper-right and lower-left corners to the queue of
        potential upper-left corners of further cells. Process the queue in
        top-to-bottom order, keeping track of how much of each text column has
        been seen.

        We'll end up knowing all the row and column boundaries, cell positions
        and their dimensions.
        """
        corners = [(0, 0)]
        while corners:
            top, left = corners.pop(0)
            if top == self.bottom or left == self.right \
                  or top <= self.done[left]:
                continue
            result = self.scan_cell(top, left)
            if not result:
                continue
            bottom, right, rowseps, colseps = result
            update_dict_of_lists(self.rowseps, rowseps)
            update_dict_of_lists(self.colseps, colseps)
            self.mark_done(top, left, bottom, right)
            cellblock = self.block.get_2D_block(top + 1, left + 1,
                                                bottom, right)
            cellblock.disconnect()      # lines in cell can't sync with parent
            self.cells.append((top, left, bottom, right, cellblock))
            corners.extend([(top, right), (bottom, left)])
            corners.sort()
        if not self.check_parse_complete():
            raise TableMarkupError('Malformed table; parse incomplete.')

    def mark_done(self, top, left, bottom, right):
        """For keeping track of how much of each text column has been seen."""
        before = top - 1
        after = bottom - 1
        for col in range(left, right):
            assert self.done[col] == before
            self.done[col] = after

    def check_parse_complete(self):
        """Each text column should have been completely seen."""
        last = self.bottom - 1
        for col in range(self.right):
            if self.done[col] != last:
                return None
        return 1

    def scan_cell(self, top, left):
        """Starting at the top-left corner, start tracing out a cell."""
        assert self.block[top][left] == '+'
        result = self.scan_right(top, left)
        return result

    def scan_right(self, top, left):
        """
        Look for the top-right corner of the cell, and make note of all column
        boundaries ('+').
        """
        colseps = {}
        line = self.block[top]
        for i in range(left + 1, self.right + 1):
            if line[i] == '+':
                colseps[i] = [top]
                result = self.scan_down(top, left, i)
                if result:
                    bottom, rowseps, newcolseps = result
                    update_dict_of_lists(colseps, newcolseps)
                    return bottom, i, rowseps, colseps
            elif line[i] != '-':
                return None
        return None

    def scan_down(self, top, left, right):
        """
        Look for the bottom-right corner of the cell, making note of all row
        boundaries.
        """
        rowseps = {}
        for i in range(top + 1, self.bottom + 1):
            if self.block[i][right] == '+':
                rowseps[i] = [right]
                result = self.scan_left(top, left, i, right)
                if result:
                    newrowseps, colseps = result
                    update_dict_of_lists(rowseps, newrowseps)
                    return i, rowseps, colseps
            elif self.block[i][right] != '|':
                return None
        return None

    def scan_left(self, top, left, bottom, right):
        """
        Noting column boundaries, look for the bottom-left corner of the cell.
        It must line up with the starting point.
        """
        colseps = {}
        line = self.block[bottom]
        for i in range(right - 1, left, -1):
            if line[i] == '+':
                colseps[i] = [bottom]
            elif line[i] != '-':
                return None
        if line[left] != '+':
            return None
        result = self.scan_up(top, left, bottom, right)
        if result is not None:
            rowseps = result
            return rowseps, colseps
        return None

    def scan_up(self, top, left, bottom, right):
        """
        Noting row boundaries, see if we can return to the starting point.
        """
        rowseps = {}
        for i in range(bottom - 1, top, -1):
            if self.block[i][left] == '+':
                rowseps[i] = [left]
            elif self.block[i][left] != '|':
                return None
        return rowseps

    def structure_from_cells(self):
        """
        From the data collected by `scan_cell()`, convert to the final data
        structure.
        """
        rowseps = self.rowseps.keys()   # list of row boundaries
        rowseps.sort()
        rowindex = {}
        for i in range(len(rowseps)):
            rowindex[rowseps[i]] = i    # row boundary -> row number mapping
        colseps = self.colseps.keys()   # list of column boundaries
        colseps.sort()
        colindex = {}
        for i in range(len(colseps)):
            colindex[colseps[i]] = i    # column boundary -> col number map
        colspecs = [(colseps[i] - colseps[i - 1] - 1)
                    for i in range(1, len(colseps))] # list of column widths
        # prepare an empty table with the correct number of rows & columns
        onerow = [None for i in range(len(colseps) - 1)]
        rows = [onerow[:] for i in range(len(rowseps) - 1)]
        # keep track of # of cells remaining; should reduce to zero
        remaining = (len(rowseps) - 1) * (len(colseps) - 1)
        for top, left, bottom, right, block in self.cells:
            rownum = rowindex[top]
            colnum = colindex[left]
            assert rows[rownum][colnum] is None, (
                  'Cell (row %s, column %s) already used.'
                  % (rownum + 1, colnum + 1))
            morerows = rowindex[bottom] - rownum - 1
            morecols = colindex[right] - colnum - 1
            remaining -= (morerows + 1) * (morecols + 1)
            # write the cell into the table
            rows[rownum][colnum] = (morerows, morecols, top + 1, block)
        assert remaining == 0, 'Unused cells remaining.'
        if self.head_body_sep:          # separate head rows from body rows
            numheadrows = rowindex[self.head_body_sep]
            headrows = rows[:numheadrows]
            bodyrows = rows[numheadrows:]
        else:
            headrows = []
            bodyrows = rows
        return (colspecs, headrows, bodyrows)


class SimpleTableParser(TableParser):

    """
    Parse a simple table using `parse()`.

    Here's an example of a simple table::

        =====  =====
        col 1  col 2
        =====  =====
        1      Second column of row 1.
        2      Second column of row 2.
               Second line of paragraph.
        3      - Second column of row 3.

               - Second item in bullet
                 list (row 3, column 2).
        4 is a span
        ------------
        5
        =====  =====

    Top and bottom borders use '=', column span underlines use '-', and
    column separation is indicated with spaces.

    Passing the above table to the `parse()` method will result in the
    following data structure, whose interpretation is the same as for
    `GridTableParser`::

        ([5, 25],
         [[(0, 0, 1, ['col 1']),
           (0, 0, 1, ['col 2'])]],
         [[(0, 0, 3, ['1']),
           (0, 0, 3, ['Second column of row 1.'])],
          [(0, 0, 4, ['2']),
           (0, 0, 4, ['Second column of row 2.',
                      'Second line of paragraph.'])],
          [(0, 0, 6, ['3']),
           (0, 0, 6, ['- Second column of row 3.',
                      '',
                      '- Second item in bullet',
                      '  list (row 3, column 2).'])],
          [(0, 1, 10, ['4 is a span'])],
          [(0, 0, 12, ['5']),
           (0, 0, 12, [''])]])
    """

    head_body_separator_pat = re.compile('=[ =]*$')
    span_pat = re.compile('-[ -]*$')

    def setup(self, block):
        self.block = block[:]           # make a copy; it will be modified
        self.block.disconnect()         # don't propagate changes to parent
        # Convert top & bottom borders to column span underlines:
        self.block[0] = self.block[0].replace('=', '-')
        self.block[-1] = self.block[-1].replace('=', '-')
        self.head_body_sep = None
        self.columns = []
        self.border_end = None
        self.table = []
        self.done = [-1] * len(block[0])
        self.rowseps = {0: [0]}
        self.colseps = {0: [0]}

    def parse_table(self):
        """
        First determine the column boundaries from the top border, then
        process rows.  Each row may consist of multiple lines; accumulate
        lines until a row is complete.  Call `self.parse_row` to finish the
        job.
        """
        # Top border must fully describe all table columns.
        self.columns = self.parse_columns(self.block[0], 0)
        self.border_end = self.columns[-1][1]
        firststart, firstend = self.columns[0]
        offset = 1                      # skip top border
        start = 1
        text_found = None
        while offset < len(self.block):
            line = self.block[offset]
            if self.span_pat.match(line):
                # Column span underline or border; row is complete.
                self.parse_row(self.block[start:offset], start,
                               (line.rstrip(), offset))
                start = offset + 1
                text_found = None
            elif line[firststart:firstend].strip():
                # First column not blank, therefore it's a new row.
                if text_found and offset != start:
                    self.parse_row(self.block[start:offset], start)
                start = offset
                text_found = 1
            elif not text_found:
                start = offset + 1
            offset += 1

    def parse_columns(self, line, offset):
        """
        Given a column span underline, return a list of (begin, end) pairs.
        """
        cols = []
        end = 0
        while 1:
            begin = line.find('-', end)
            end = line.find(' ', begin)
            if begin < 0:
                break
            if end < 0:
                end = len(line)
            cols.append((begin, end))
        if self.columns:
            if cols[-1][1] != self.border_end:
                raise TableMarkupError('Column span incomplete at line '
                                       'offset %s.' % offset)
            # Allow for an unbounded rightmost column:
            cols[-1] = (cols[-1][0], self.columns[-1][1])
        return cols

    def init_row(self, colspec, offset):
        i = 0
        cells = []
        for start, end in colspec:
            morecols = 0
            try:
                assert start == self.columns[i][0]
                while end != self.columns[i][1]:
                    i += 1
                    morecols += 1
            except (AssertionError, IndexError):
                raise TableMarkupError('Column span alignment problem at '
                                       'line offset %s.' % (offset + 1))
            cells.append([0, morecols, offset, []])
            i += 1
        return cells

    def parse_row(self, lines, start, spanline=None):
        """
        Given the text `lines` of a row, parse it and append to `self.table`.

        The row is parsed according to the current column spec (either
        `spanline` if provided or `self.columns`).  For each column, extract
        text from each line, and check for text in column margins.  Finally,
        adjust for insignificant whitespace.
        """
        if not (lines or spanline):
            # No new row, just blank lines.
            return
        if spanline:
            columns = self.parse_columns(*spanline)
            span_offset = spanline[1]
        else:
            columns = self.columns[:]
            span_offset = start
        self.check_columns(lines, start, columns)
        row = self.init_row(columns, start)
        for i in range(len(columns)):
            start, end = columns[i]
            cellblock = lines.get_2D_block(0, start, len(lines), end)
            cellblock.disconnect()      # lines in cell can't sync with parent
            row[i][3] = cellblock
        self.table.append(row)

    def check_columns(self, lines, first_line, columns):
        """
        Check for text in column margins and text overflow in the last column.
        Raise TableMarkupError if anything but whitespace is in column margins.
        Adjust the end value for the last column if there is text overflow.
        """
        # "Infinite" value for a dummy last column's beginning, used to
        # check for text overflow:
        columns.append((sys.maxint, None))
        lastcol = len(columns) - 2
        for i in range(len(columns) - 1):
            start, end = columns[i]
            nextstart = columns[i+1][0]
            offset = 0
            for line in lines:
                if i == lastcol and line[end:].strip():
                    text = line[start:].rstrip()
                    new_end = start + len(text)
                    columns[i] = (start, new_end)
                    main_start, main_end = self.columns[-1]
                    if new_end > main_end:
                        self.columns[-1] = (main_start, new_end)
                elif line[end:nextstart].strip():
                    raise TableMarkupError('Text in column margin at line '
                                           'offset %s.' % (first_line + offset))
                offset += 1
        columns.pop()

    def structure_from_cells(self):
        colspecs = [end - start for start, end in self.columns]
        first_body_row = 0
        if self.head_body_sep:
            for i in range(len(self.table)):
                if self.table[i][0][2] > self.head_body_sep:
                    first_body_row = i
                    break
        return (colspecs, self.table[:first_body_row],
                self.table[first_body_row:])


def update_dict_of_lists(master, newdata):
    """
    Extend the list values of `master` with those from `newdata`.

    Both parameters must be dictionaries containing list values.
    """
    for key, values in newdata.items():
        master.setdefault(key, []).extend(values)


