[CMF-checkins] CVS: CMF/CMFDefault - utils.py:1.17

Yvo Schubbe schubbe@web.de
Wed, 5 Feb 2003 12:51:41 -0500


Update of /cvs-repository/CMF/CMFDefault
In directory cvs.zope.org:/tmp/cvs-serv4163/CMFDefault

Modified Files:
	utils.py 
Log Message:
Merged yuppie-collector041-branch:
- Changed behavior of bodyfinder and html_headcheck.
- Fixed header stripping in edit and PUT. (Collector #41)

=== CMF/CMFDefault/utils.py 1.16 => 1.17 ===
--- CMF/CMFDefault/utils.py:1.16	Thu Dec 19 00:34:55 2002
+++ CMF/CMFDefault/utils.py	Wed Feb  5 12:51:39 2003
@@ -1,6 +1,21 @@
+##############################################################################
+#
+# Copyright (c) 2001-2003 Zope Corporation and Contributors.
+# All Rights Reserved.
+#
+# This software is subject to the provisions of the Zope Public License,
+# Version 2.0 (ZPL).  A copy of the ZPL should accompany this distribution.
+# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
+# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
+# FOR A PARTICULAR PURPOSE
+#
+##############################################################################
+""" Utility functions.
+
+$Id$
 """
-    Utility functions.
-"""
+
 from sgmllib import SGMLParser
 import re
 import os
@@ -359,44 +374,38 @@
     else:
         return 1
 
-security.declarePrivate('_bodyre')
-_bodyre = re.compile( r'^\s*<html.*<body.*?>', re.DOTALL | re.I )
-
-security.declarePrivate('_endbodyre')
-_endbodyre = re.compile( r'</body', re.DOTALL | re.I )
-
 security.declarePublic('bodyfinder')
-def bodyfinder( text ):
+def bodyfinder(text):
+    """ Return body or unchanged text if no body tags found.
 
-    bod = _bodyre.search( text )
-    if not bod:
+    Always use html_headcheck() first.
+    """
+    lowertext = text.lower()
+    bodystart = lowertext.find('<body')
+    if bodystart == -1:
         return text
-
-    end = _endbodyre.search( text )
-    if not end:
+    bodystart = lowertext.find('>', bodystart) + 1
+    if bodystart == 0:
         return text
-    else:
-        return text[bod.end():end.start()]
+    bodyend = lowertext.rfind('</body>', bodystart)
+    if bodyend == -1:
+        return text
+    return text[bodystart:bodyend]
 
 security.declarePrivate('_htfinder')
-_htfinder = re.compile( r'<html', re.DOTALL | re.I )
+_htfinder = re.compile(r'(\s|(<[^<>]*?>))*<html.*<body.*?>.*</body>',
+                       re.DOTALL)
 
 security.declarePublic('html_headcheck')
-def html_headcheck( html ):
-
+def html_headcheck(html):
     """ Return 'true' if document looks HTML-ish enough.
+
+    If true bodyfinder() will be able to find the HTML body.
     """
-    if not _htfinder.search(html):
+    lowerhtml = html.lower()
+    if lowerhtml.find('<html') == -1:
+        return 0
+    elif _htfinder.match(lowerhtml):
+        return 1
+    else:
         return 0
-
-    lines = re.split(r'[\n\r]+?', html)
-
-    for line in lines:
-        line = line.strip()
-
-        if not line:
-            continue
-        elif line.lower().startswith( '<html' ):
-            return 1
-        elif line[0] != '<':
-            return 0