[CMF-checkins] CVS: CMF/CMFCollector - util.py:1.10

Ken Manheimer klm@zope.com
Sat, 3 Nov 2001 00:16:03 -0500


Update of /cvs-repository/CMF/CMFCollector
In directory cvs.zope.org:/tmp/cvs-serv21683

Modified Files:
	util.py 
Log Message:
format_webtext(): simple and more suitable text format for issues.

Characteristics:

 - No html.
 - flow lines beginning at left margin, but break them at carriage returns.
 - Recognize as "literal" lines, whose whitespace is preserved:
   indented lines, lines cited with leading '>', and lines contained
   within <pre> / </pre> pairs
 - Linkify URLs that are not on literal lines.

What's wrong with structured text, and/or HTML, or plain preformatted?

 - Structured text structuring is unnecessary and often *undesirable*.
   Also, structured text flows literal text, not good for source code,
   and most people don't know the secret to prevent that.

 - HTML (and structured text with HTML) interprets and other '<'/'>'
   bracketed text.  Like stx, HTML flows literal text, ...

 - Preformatted requires that the user break lines, or you get an awfully
   wide page.  Using a hard-wrap entry box is *too* restrictive - it
   prevents longer lines, eg in source code, tracebacks, etc.

We can now get rid of process_comment(), unprocess_comments(),
link_candidates(), and list_search_hits()!


=== CMF/CMFCollector/util.py 1.9 => 1.10 ===
 import string, re
 from Products.CMFCore.utils import getToolByName
-
-preexp = re.compile(r'<pre>')
-unpreexp = re.compile(r'</pre>')
-citedexp = re.compile(r'^\s*>')
-# Match group 1 is citation prefix, group 2 is leading whitespace:
-cite_prefixexp = re.compile('([\s>]*>)?([\s]*)')
+from DocumentTemplate.DT_Var import html_quote
 
 def users_for_local_role(object, userids, role):
     """Give only designated userids specified local role.
@@ -86,17 +81,88 @@
     except AttributeError:
         # Some don't support getProperty:
         return getattr(userobj, property, default)
-        
 
-def cited_text(text, rfind=string.rfind, strip=string.strip):
+##############################
+# WebText processing utilities
+preexp = re.compile(r'&lt;pre&gt;')
+unpreexp = re.compile(r'&lt;/pre&gt;')
+urlchars  = (r'[A-Za-z0-9/:@_%~#=&\.\-\?]+')
+nonpuncurlchars  = (r'[A-Za-z0-9/@_%~#=&\-]')
+url       = (r'["=]?((http|https|ftp|mailto|file|about):%s%s)'
+             % (urlchars, nonpuncurlchars))
+urlexp=re.compile(url)
+
+def format_webtext(text,
+                   presearch=preexp.search, presplit=preexp.split,
+                   unpresearch=unpreexp.search, unpresplit=unpreexp.split,
+                   urlexpsub=urlexp.sub):
+    """Transform web text for browser presentation.
+
+    - HTML quote everything
+    - Terminate all lines with <br>s.
+    - Whitespace-quote indented and '>' cited lines
+    - Whitespace-quote lines within <pre>/</pre> pairs
+    - Turn URLs recognized outside of literal regions into links."""
+
+    # Definitions:
+    #
+    # - "in_literal" exemptions: Lines starting with whitespace or '>'
+    # - "in_pre" exemptions: Lines residing within (non-exempted) <pre> tag
+    #
+    # Nuances:
+    #
+    # - Neither exemption can toggle while the other applies - each renders
+    #   the cues for the other mostly ineffective, except...
+    # - in_pre cannot deactivate on a literal-exemption qualifying line, so
+    #   pre tags can be used to contain cited text with (ineffective) </pre>s.
+    # - We mostly don't handle pre tag nesting, except balanced within a line
+
+    in_pre = at_literal = 0
+    got = []
+    for l in text.split('\n'):
+
+        if not l:
+            got.append(l)
+            continue
+
+        l = l.expandtabs()
+
+        at_literal = (l.startswith(" ") or l.startswith("&gt;"))
+
+        if at_literal:
+            # Can't open or close <pre> in literal - since it's cited/escaped.
+            got.append(l.replace(" ", "&nbsp;"))
+                       
+        elif in_pre:
+            got.append(l.replace(" ", "&nbsp;"))
+            # Check for closing pre:
+            x = unpresplit(l)
+            if len(x) > 1 and not presearch(x[-1]):
+                in_pre = 0
+
+        else:
+            # Non-literal:
+            got.append(urlexpsub(r'<a href="\1">\1</a>', l))
+            # Check for opening pre:
+            x = presplit(l)
+            if len(x) > 1 and not unpresearch(x[-1]):
+                # The line has a prevailing <pre>.
+                in_pre = 1
+            
+    return "<br>\n".join(got)
+
+# Match group 1 is citation prefix, group 2 is leading whitespace:
+cite_prefixexp = re.compile('([\s>]*>)?([\s]*)')
+
+def cited_text(text, cite_prefixexp=cite_prefixexp):
     """Quote text for use in literal citations.
 
     We prepend '>' to each line, splitting long lines (propagating
     existing citation and leading whitespace) when necessary."""
-    # (Over?) elaborate stuff snarfed from my wiki commenting provisions.
+    # Over elaborate stuff snarfed from my wiki commenting provisions.
 
     got = []
-    for line in string.split(text, '\n'):
+    for line in text.split('\n'):
         pref = '> '
         if len(line) < 79:
             got.append(pref + line)
@@ -128,15 +194,15 @@
                 break
             else:
                 lastcurlen = curlen
-            splitpoint = max(rfind(line[:78-lenpref], ' '),
-                             rfind(line[:78-lenpref], '\t'))
+            splitpoint = max(line[:78-lenpref].rfind(' '),
+                             line[:78-lenpref].rfind('\t'))
             if not splitpoint or splitpoint == -1:
-                if strip(line):
+                if line.strip():
                     got.append((pref % continuation_padding) +
                                line)
                 line = ''
             else:
-                if strip(line[:splitpoint]):
+                if line[:splitpoint].strip():
                     got.append((pref % continuation_padding) +
                                line[:splitpoint])
                 line = line[splitpoint+1:]
@@ -146,129 +212,7 @@
                 continuation_padding = '  '
     return string.join(got, '\n')
 
-def process_comment(comment, strip=string.strip):
-    """Return formatted comment, escaping cited text."""
-    # More elaborate stuff snarfed from my wiki commenting provisions.
-    # Process the comment:
-    # - Strip leading whitespace,
-    # - cause all cited text to be preformatted.
-
-    inpre = incited = atcited = 0
-    presearch = preexp.search
-    presplit = preexp.split
-    unpresearch = unpreexp.search
-    unpresplit = unpreexp.split
-    citedsearch = citedexp.search
-    got = []
-    for i in string.split('\n' + string.rstrip(comment), '\n') + ['']:
-        atcited = citedsearch(i)
-        if not atcited:
-            if incited:
-                # Departing cited section.
-                incited = 0
-                if inpre:
-                    # Close <pre> that we prepended.
-                    got.append("</pre collector:deleteme>")
-                    inpre = 0
-
-            # Check line for toggling of inpre.
-            # XXX We don't deal well with way imbalanced pres on a
-            # single line.  Feh, we're working too hard, already.
-            if not inpre:
-                x = presplit(i)
-                if len(x) > 1 and not unprexpsearch(x[-1]):
-                    # The line has a <pre> without subsequent </pre>
-                    inpre = 1
-            else:                   # in <pre>
-                x = unpresplit(i)
-                if len(x) > 1 and not prexpsearch(x[-1]):
-                    # The line has a </pre> without subsequent <pre>
-                    inpre = 0
-
-        else:
-            # Quote the minimal set of chars, to reduce raw text
-            # ugliness. Do the '&' *before* any others that include '&'s!
-            if '&' in i and ';' in i: i = string.replace(i, '&', '&amp;')
-            if '<' in i: i = string.replace(i, '<', '&lt;')
-            if not incited:
-                incited = 1
-                if not inpre:
-                    got.append("<pre collector:deleteme>")
-                    inpre = 1
-        got.append(i)
-    return string.strip(string.join(got, '\n'))
-
-
-def unprocess_comments(text):
-    """Invert the process_comment transformations to yield literal text.
-
-    Specifically, remove (special) <pre>/</pre> and turn the small set of
-    character entities back to characters."""
-    
-    if text.find("<pre collector:deleteme>\n"):
-        text = text.replace("<pre collector:deleteme>\n", '')
-    if text.find("</pre collector:deleteme>\n"):
-        text = text.replace("</pre collector:deleteme>\n", '')
-    if text.find('&amp;'):
-        text = text.replace('&amp;', '&')
-    if text.find('&lt;'):
-        text = text.replace('&lt;', '<')
-    return text
-
 def sorted(l):
     x = list(l[:])
     x.sort()
     return x
-
-urlchars  = (r'[A-Za-z0-9/:@_%~#=&\.\-\?]+')
-nonpuncurlchars  = (r'[A-Za-z0-9/:@_%~#=&\-]')
-url       = (r'["=]?((http|https|ftp|mailto|file|about):%s%s)'
-             % (urlchars, nonpuncurlchars))
-urlexp    = re.compile(url)
-UPLOAD_PREFIX = "Uploaded: "
-uploadexp = re.compile('(%s)([^<,\n]*)([<,\n])' % UPLOAD_PREFIX, re.MULTILINE)
-excludeexpr=re.compile(r'(<a [^>]*href=[^>]+>[^<>]*</a>|<img [^>]*src=[^>]*>)')
-
-def link_candidates(text):
-    """Return match objs for URLS, excluding targets contained in tags.
-
-    (See the excludeexpr for the exact excluded tag contexts.)"""
-
-    excludes = list_search_hits(text, excludeexpr)
-    candidates = list_search_hits(text, urlexp)
-    got = []
-    for c in candidates:
-        cstart, cend = c.start(), c.end()
-        good = 1
-        while excludes:
-            e = excludes[0]
-            if e.end() < cend:
-                # Ditch exclude_match that's prior to remaining candidates.
-                del excludes[0]
-                continue
-            elif e.start() > cstart:
-                # Remaining segments are after candidate - it passes.
-                good = 1
-                break
-            else:
-                # Candidate is contained in segment - baad.
-                good = 0
-                break
-
-        if good:
-            got.append(c)
-
-    return got
-    
-def list_search_hits(text, exprobj):
-    """Return a list of match objects for non-overlapping text hits."""
-    cursor = 0
-    got = []
-    while 1:
-        hit = exprobj.search(text, cursor)
-        if hit:
-            cursor = hit.end()
-            got.append(hit)
-        else:
-            break
-    return got