[CMF-checkins] CVS: CMF/CMFDefault - utils.py:1.16.4.2

Mon, 3 Feb 2003 13:34:03 -0500

Update of /cvs-repository/CMF/CMFDefault
In directory cvs.zope.org:/tmp/cvs-serv11296/CMFDefault

Modified Files:
      Tag: yuppie-collector041-branch
	utils.py 
Log Message:
third step:
- added tests for xhtml content
- modified tests for bodyfinder
- made new tests pass

=== CMF/CMFDefault/utils.py 1.16.4.1 => 1.16.4.2 ===

--- CMF/CMFDefault/utils.py:1.16.4.1	Sun Feb  2 17:09:27 2003
+++ CMF/CMFDefault/utils.py	Mon Feb  3 13:34:01 2003
@@ -374,41 +374,38 @@
     else:
         return 1
 
-security.declarePrivate('_bodyre')
-_bodyre = re.compile( r'^\s*<html.*<body.*?>', re.DOTALL | re.I )
-
-security.declarePrivate('_endbodyre')
-_endbodyre = re.compile( r'</body', re.DOTALL | re.I )
-
 security.declarePublic('bodyfinder')
-def bodyfinder( text ):
+def bodyfinder(text):
+    """ Return the body, or the unchanged text if no body tags are found.
 
-    bod = _bodyre.search( text )
-    if not bod:
+    Always use html_headcheck() first.
+    """
+    lowertext = text.lower()
+    bodystart = lowertext.find('<body')
+    if bodystart == -1:
         return text
-
-    end = _endbodyre.search( text )
-    if not end:
+    bodystart = lowertext.find('>', bodystart) + 1
+    if bodystart == 0:
         return text
-    else:
-        return text[bod.end():end.start()]
+    bodyend = lowertext.rfind('</body>', bodystart)
+    if bodyend == -1:
+        return text
+    return text[bodystart:bodyend]
+
+security.declarePrivate('_htfinder')
+_htfinder = re.compile(r'(\s|(<[^<>]*?>))*<html.*<body.*?>.*</body>',
+                       re.DOTALL)
 
 security.declarePublic('html_headcheck')
-def html_headcheck( html ):
+def html_headcheck(html):
     """ Return 'true' if document looks HTML-ish enough.
+
+    If true, bodyfinder() will be able to find the HTML body.
     """
     lowerhtml = html.lower()
     if lowerhtml.find('<html') == -1:
         return 0
-
-    lines = re.split(r'[\n\r]+?', html)
-
-    for line in lines:
-        line = line.strip()
-
-        if not line:
-            continue
-        elif line.lower().startswith( '<html' ):
-            return 1
-        elif line[0] != '<':
-            return 0
+    elif _htfinder.match(lowerhtml):
+        return 1
+    else:
+        return 0