[Zope3-checkins] SVN: Zope3/branches/adamg-mechanize-update/src/mechanize/_ untested

Adam Groszer agroszer at gmail.com
Fri Jul 13 09:21:23 EDT 2007


Log message for revision 77856:
  untested

Changed:
  U   Zope3/branches/adamg-mechanize-update/src/mechanize/__init__.py
  U   Zope3/branches/adamg-mechanize-update/src/mechanize/_auth.py
  A   Zope3/branches/adamg-mechanize-update/src/mechanize/_beautifulsoup.py
  U   Zope3/branches/adamg-mechanize-update/src/mechanize/_clientcookie.py
  A   Zope3/branches/adamg-mechanize-update/src/mechanize/_debug.py
  U   Zope3/branches/adamg-mechanize-update/src/mechanize/_gzip.py
  U   Zope3/branches/adamg-mechanize-update/src/mechanize/_headersutil.py
  U   Zope3/branches/adamg-mechanize-update/src/mechanize/_html.py
  A   Zope3/branches/adamg-mechanize-update/src/mechanize/_http.py
  U   Zope3/branches/adamg-mechanize-update/src/mechanize/_lwpcookiejar.py
  U   Zope3/branches/adamg-mechanize-update/src/mechanize/_mechanize.py
  U   Zope3/branches/adamg-mechanize-update/src/mechanize/_mozillacookiejar.py
  U   Zope3/branches/adamg-mechanize-update/src/mechanize/_msiecookiejar.py
  U   Zope3/branches/adamg-mechanize-update/src/mechanize/_opener.py
  U   Zope3/branches/adamg-mechanize-update/src/mechanize/_request.py
  A   Zope3/branches/adamg-mechanize-update/src/mechanize/_response.py
  A   Zope3/branches/adamg-mechanize-update/src/mechanize/_rfc3986.py
  A   Zope3/branches/adamg-mechanize-update/src/mechanize/_seek.py
  A   Zope3/branches/adamg-mechanize-update/src/mechanize/_upgrade.py
  U   Zope3/branches/adamg-mechanize-update/src/mechanize/_urllib2.py
  D   Zope3/branches/adamg-mechanize-update/src/mechanize/_urllib2_support.py
  U   Zope3/branches/adamg-mechanize-update/src/mechanize/_useragent.py
  U   Zope3/branches/adamg-mechanize-update/src/mechanize/_util.py

-=-
Modified: Zope3/branches/adamg-mechanize-update/src/mechanize/__init__.py
===================================================================
--- Zope3/branches/adamg-mechanize-update/src/mechanize/__init__.py	2007-07-13 11:38:53 UTC (rev 77855)
+++ Zope3/branches/adamg-mechanize-update/src/mechanize/__init__.py	2007-07-13 13:21:22 UTC (rev 77856)
@@ -1,3 +1,87 @@
+__all__ = [
+    'AbstractBasicAuthHandler',
+    'AbstractDigestAuthHandler',
+    'BaseHandler',
+    'Browser',
+    'BrowserStateError',
+    'CacheFTPHandler',
+    'ContentTooShortError',
+    'Cookie',
+    'CookieJar',
+    'CookiePolicy',
+    'DefaultCookiePolicy',
+    'DefaultFactory',
+    'FTPHandler',
+    'Factory',
+    'FileCookieJar',
+    'FileHandler',
+    'FormNotFoundError',
+    'FormsFactory',
+    'GopherError',
+    'GopherHandler',
+    'HTTPBasicAuthHandler',
+    'HTTPCookieProcessor',
+    'HTTPDefaultErrorHandler',
+    'HTTPDigestAuthHandler',
+    'HTTPEquivProcessor',
+    'HTTPError',
+    'HTTPErrorProcessor',
+    'HTTPHandler',
+    'HTTPPasswordMgr',
+    'HTTPPasswordMgrWithDefaultRealm',
+    'HTTPProxyPasswordMgr',
+    'HTTPRedirectDebugProcessor',
+    'HTTPRedirectHandler',
+    'HTTPRefererProcessor',
+    'HTTPRefreshProcessor',
+    'HTTPRequestUpgradeProcessor',
+    'HTTPResponseDebugProcessor',
+    'HTTPRobotRulesProcessor',
+    'HTTPSClientCertMgr',
+    'HTTPSHandler',
+    'HeadParser',
+    'History',
+    'LWPCookieJar',
+    'Link',
+    'LinkNotFoundError',
+    'LinksFactory',
+    'LoadError',
+    'MSIECookieJar',
+    'MozillaCookieJar',
+    'OpenerDirector',
+    'OpenerFactory',
+    'ParseError',
+    'ProxyBasicAuthHandler',
+    'ProxyDigestAuthHandler',
+    'ProxyHandler',
+    'Request',
+    'ResponseUpgradeProcessor',
+    'RobotExclusionError',
+    'RobustFactory',
+    'RobustFormsFactory',
+    'RobustLinksFactory',
+    'RobustTitleFactory',
+    'SeekableProcessor',
+    'SeekableResponseOpener',
+    'TitleFactory',
+    'URLError',
+    'USE_BARE_EXCEPT',
+    'UnknownHandler',
+    'UserAgent',
+    'UserAgentBase',
+    'XHTMLCompatibleHeadParser',
+    '__version__',
+    'build_opener',
+    'install_opener',
+    'lwp_cookie_str',
+    'make_response',
+    'request_host',
+    'response_seek_wrapper',  # XXX deprecate in public interface?
+    'seek_wrapped_response',  # XXX should probably use this internally in place of response_seek_wrapper()
+    'str2time',
+    'urlopen',
+    'urlretrieve']
+
 from _mechanize import __version__
 
 # high-level stateful browser-style interface
@@ -2,8 +86,9 @@
 from _mechanize import \
-     Browser, \
+     Browser, History, \
      BrowserStateError, LinkNotFoundError, FormNotFoundError
 
 # configurable URL-opener interface
-from _useragent import UserAgent
+from _useragent import UserAgentBase, UserAgent
 from _html import \
+     ParseError, \
      Link, \
@@ -14,19 +99,20 @@
      RobustFormsFactory, RobustLinksFactory, RobustTitleFactory
 
 # urllib2 work-alike interface (part from mechanize, part from urllib2)
+# This is a superset of the urllib2 interface.
 from _urllib2 import *
 
 # misc
+from _opener import ContentTooShortError, OpenerFactory, urlretrieve
 from _util import http2time as str2time
-from _util import response_seek_wrapper, make_response
-from _urllib2_support import HeadParser
+from _response import \
+     response_seek_wrapper, seek_wrapped_response, make_response
+from _http import HeadParser
 try:
-    from _urllib2_support import XHTMLCompatibleHeadParser
+    from _http import XHTMLCompatibleHeadParser
 except ImportError:
     pass
-#from _gzip import HTTPGzipProcessor  # crap ATM
 
-
 # cookies
 from _clientcookie import Cookie, CookiePolicy, DefaultCookiePolicy, \
      CookieJar, FileCookieJar, LoadError, request_host
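
The reorganised __init__.py pins down the public interface with an explicit
__all__ list; the seekable-response helpers now come from _response and the
head parsers from _http, replacing _urllib2_support. A minimal, hypothetical
usage sketch of that public surface (the URL below is a placeholder, not part
of this commit):

    import mechanize

    br = mechanize.Browser()                    # stateful browser-style interface
    response = br.open("http://example.com/")   # placeholder URL
    print response.geturl()
    response.seek(0)                            # responses are wrapped to be seekable
    print len(response.read())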

Modified: Zope3/branches/adamg-mechanize-update/src/mechanize/_auth.py
===================================================================
--- Zope3/branches/adamg-mechanize-update/src/mechanize/_auth.py	2007-07-13 11:38:53 UTC (rev 77855)
+++ Zope3/branches/adamg-mechanize-update/src/mechanize/_auth.py	2007-07-13 13:21:22 UTC (rev 77856)
@@ -11,10 +11,11 @@
 
 """
 
-import re, base64, urlparse, posixpath, md5, sha
+import re, base64, urlparse, posixpath, md5, sha, sys, copy
 
 from urllib2 import BaseHandler
-from urllib import getproxies, unquote, splittype, splituser, splitpasswd
+from urllib import getproxies, unquote, splittype, splituser, splitpasswd, \
+     splitport
 
 
 def _parse_proxy(proxy):
@@ -135,32 +136,45 @@
         # uri could be a single URI or a sequence
         if isinstance(uri, basestring):
             uri = [uri]
-        uri = tuple(map(self.reduce_uri, uri))
         if not realm in self.passwd:
             self.passwd[realm] = {}
-        self.passwd[realm][uri] = (user, passwd)
+        for default_port in True, False:
+            reduced_uri = tuple(
+                [self.reduce_uri(u, default_port) for u in uri])
+            self.passwd[realm][reduced_uri] = (user, passwd)
 
     def find_user_password(self, realm, authuri):
         domains = self.passwd.get(realm, {})
-        authuri = self.reduce_uri(authuri)
-        for uris, authinfo in domains.iteritems():
-            for uri in uris:
-                if self.is_suburi(uri, authuri):
-                    return authinfo
+        for default_port in True, False:
+            reduced_authuri = self.reduce_uri(authuri, default_port)
+            for uris, authinfo in domains.iteritems():
+                for uri in uris:
+                    if self.is_suburi(uri, reduced_authuri):
+                        return authinfo
         return None, None
 
-    def reduce_uri(self, uri):
-        """Accept netloc or URI and extract only the netloc and path"""
+    def reduce_uri(self, uri, default_port=True):
+        """Accept authority or URI and extract only the authority and path."""
+        # note HTTP URLs do not have a userinfo component
         parts = urlparse.urlsplit(uri)
         if parts[1]:
             # URI
-            return parts[1], parts[2] or '/'
-        elif parts[0]:
-            # host:port
-            return uri, '/'
+            scheme = parts[0]
+            authority = parts[1]
+            path = parts[2] or '/'
         else:
-            # host
-            return parts[2], '/'
+            # host or host:port
+            scheme = None
+            authority = uri
+            path = '/'
+        host, port = splitport(authority)
+        if default_port and port is None and scheme is not None:
+            dport = {"http": 80,
+                     "https": 443,
+                     }.get(scheme)
+            if dport is not None:
+                authority = "%s:%d" % (host, dport)
+        return authority, path
 
     def is_suburi(self, base, test):
         """Check if test is below base in a URI tree
@@ -220,8 +234,10 @@
             auth = 'Basic %s' % base64.encodestring(raw).strip()
             if req.headers.get(self.auth_header, None) == auth:
                 return None
-            req.add_header(self.auth_header, auth)
-            return self.parent.open(req)
+            newreq = copy.copy(req)
+            newreq.add_header(self.auth_header, auth)
+            newreq.visit = False
+            return self.parent.open(newreq)
         else:
             return None
 
@@ -311,9 +327,10 @@
             auth_val = 'Digest %s' % auth
             if req.headers.get(self.auth_header, None) == auth_val:
                 return None
-            req.add_unredirected_header(self.auth_header, auth_val)
-            resp = self.parent.open(req)
-            return resp
+            newreq = copy.copy(req)
+            newreq.add_unredirected_header(self.auth_header, auth_val)
+            newreq.visit = False
+            return self.parent.open(newreq)
 
     def get_cnonce(self, nonce):
         # The cnonce-value is an opaque
@@ -404,6 +421,7 @@
     """
 
     auth_header = 'Authorization'
+    handler_order = 490
 
     def http_error_401(self, req, fp, code, msg, headers):
         host = urlparse.urlparse(req.get_full_url())[1]
@@ -416,6 +434,7 @@
 class ProxyDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):
 
     auth_header = 'Proxy-Authorization'
+    handler_order = 490
 
     def http_error_407(self, req, fp, code, msg, headers):
         host = req.get_host()
@@ -425,7 +444,7 @@
         return retry
 
 
-
+# XXX ugly implementation, should probably not bother deriving
 class HTTPProxyPasswordMgr(HTTPPasswordMgr):
     # has default realm and host/port
     def add_password(self, realm, uri, user, passwd):
@@ -436,32 +455,34 @@
             uris = uri
         passwd_by_domain = self.passwd.setdefault(realm, {})
         for uri in uris:
-            uri = self.reduce_uri(uri)
-            passwd_by_domain[uri] = (user, passwd)
+            for default_port in True, False:
+                reduced_uri = self.reduce_uri(uri, default_port)
+                passwd_by_domain[reduced_uri] = (user, passwd)
 
     def find_user_password(self, realm, authuri):
-        perms = [(realm, authuri), (None, authuri)]
+        attempts = [(realm, authuri), (None, authuri)]
         # bleh, want default realm to take precedence over default
         # URI/authority, hence this outer loop
         for default_uri in False, True:
-            for realm, authuri in perms:
+            for realm, authuri in attempts:
                 authinfo_by_domain = self.passwd.get(realm, {})
-                reduced_authuri = self.reduce_uri(authuri)
-                for uri, authinfo in authinfo_by_domain.iteritems():
-                    if uri is None and not default_uri:
-                        continue
-                    if self.is_suburi(uri, reduced_authuri):
-                        return authinfo
-                user, password = None, None
+                for default_port in True, False:
+                    reduced_authuri = self.reduce_uri(authuri, default_port)
+                    for uri, authinfo in authinfo_by_domain.iteritems():
+                        if uri is None and not default_uri:
+                            continue
+                        if self.is_suburi(uri, reduced_authuri):
+                            return authinfo
+                    user, password = None, None
 
-                if user is not None:
-                    break
+                    if user is not None:
+                        break
         return user, password
 
-    def reduce_uri(self, uri):
+    def reduce_uri(self, uri, default_port=True):
         if uri is None:
             return None
-        return HTTPPasswordMgr.reduce_uri(self, uri)
+        return HTTPPasswordMgr.reduce_uri(self, uri, default_port)
 
     def is_suburi(self, base, test):
         if base is None:
@@ -469,3 +490,11 @@
             hostport, path = test
             base = (hostport, "/")
         return HTTPPasswordMgr.is_suburi(self, base, test)
+
+
+class HTTPSClientCertMgr(HTTPPasswordMgr):
+    # implementation inheritance: this is not a proper subclass
+    def add_key_cert(self, uri, key_file, cert_file):
+        self.add_password(None, uri, key_file, cert_file)
+    def find_key_cert(self, authuri):
+        return HTTPPasswordMgr.find_user_password(self, None, authuri)
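
The main functional change in _auth.py is that reduce_uri() now takes a
default_port flag and normalises the authority with the scheme's default port,
while add_password()/find_user_password() store and look up credentials under
both the port-qualified and bare forms, so "example.com" and "example.com:80"
resolve to the same entry. A standalone sketch of that normalisation, mirroring
the diff above rather than importing the module:

    import urlparse
    from urllib import splitport

    def reduce_uri(uri, default_port=True):
        # Accept an authority ("host" or "host:port") or a full URI and
        # return (authority, path), adding the scheme's default port on request.
        parts = urlparse.urlsplit(uri)
        if parts[1]:
            scheme, authority, path = parts[0], parts[1], parts[2] or '/'
        else:
            scheme, authority, path = None, uri, '/'
        host, port = splitport(authority)
        if default_port and port is None and scheme is not None:
            dport = {"http": 80, "https": 443}.get(scheme)
            if dport is not None:
                authority = "%s:%d" % (host, dport)
        return authority, path

    print reduce_uri("http://example.com/private/")      # ('example.com:80', '/private/')
    print reduce_uri("example.com", default_port=False)  # ('example.com', '/')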

Added: Zope3/branches/adamg-mechanize-update/src/mechanize/_beautifulsoup.py
===================================================================
--- Zope3/branches/adamg-mechanize-update/src/mechanize/_beautifulsoup.py	                        (rev 0)
+++ Zope3/branches/adamg-mechanize-update/src/mechanize/_beautifulsoup.py	2007-07-13 13:21:22 UTC (rev 77856)
@@ -0,0 +1,1080 @@
+"""Beautiful Soup
+Elixir and Tonic
+"The Screen-Scraper's Friend"
+v2.1.1
+http://www.crummy.com/software/BeautifulSoup/
+
+Beautiful Soup parses arbitrarily invalid XML- or HTML-like substance
+into a tree representation. It provides methods and Pythonic idioms
+that make it easy to search and modify the tree.
+
+A well-formed XML/HTML document will yield a well-formed data
+structure. An ill-formed XML/HTML document will yield a
+correspondingly ill-formed data structure. If your document is only
+locally well-formed, you can use this library to find and process the
+well-formed part of it. The BeautifulSoup class has heuristics for
+obtaining a sensible parse tree in the face of common HTML errors.
+
+Beautiful Soup has no external dependencies. It works with Python 2.2
+and up.
+
+Beautiful Soup defines classes for four different parsing strategies:
+
+ * BeautifulStoneSoup, for parsing XML, SGML, or your domain-specific
+   language that kind of looks like XML.
+
+ * BeautifulSoup, for parsing run-of-the-mill HTML code, be it valid
+   or invalid.
+
+ * ICantBelieveItsBeautifulSoup, for parsing valid but bizarre HTML
+   that trips up BeautifulSoup.
+
+ * BeautifulSOAP, for making it easier to parse XML documents that use
+   lots of subelements containing a single string, where you'd prefer
+   they put that string into an attribute (such as SOAP messages).
+
+You can subclass BeautifulStoneSoup or BeautifulSoup to create a
+parsing strategy specific to an XML schema or a particular bizarre
+HTML document. Typically your subclass would just override
+SELF_CLOSING_TAGS and/or NESTABLE_TAGS.
+"""
+from __future__ import generators
+
+__author__ = "Leonard Richardson (leonardr at segfault.org)"
+__version__ = "2.1.1"
+__date__ = "$Date$"
+__copyright__ = "Copyright (c) 2004-2005 Leonard Richardson"
+__license__ = "PSF"
+
+from sgmllib import SGMLParser, SGMLParseError
+import types
+import re
+import sgmllib
+
+#This code makes Beautiful Soup able to parse XML with namespaces
+sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*')
+
+class NullType(object):
+
+    """Similar to NoneType with a corresponding singleton instance
+    'Null' that, unlike None, accepts any message and returns itself.
+
+    Examples:
+    >>> Null("send", "a", "message")("and one more",
+    ...      "and what you get still") is Null
+    True
+    """
+
+    def __new__(cls):                    return Null
+    def __call__(self, *args, **kwargs): return Null
+##    def __getstate__(self, *args):       return Null
+    def __getattr__(self, attr):         return Null
+    def __getitem__(self, item):         return Null
+    def __setattr__(self, attr, value):  pass
+    def __setitem__(self, item, value):  pass
+    def __len__(self):                   return 0
+    # FIXME: is this a python bug? otherwise ``for x in Null: pass``
+    #        never terminates...
+    def __iter__(self):                  return iter([])
+    def __contains__(self, item):        return False
+    def __repr__(self):                  return "Null"
+Null = object.__new__(NullType)
+
+class PageElement:
+    """Contains the navigational information for some part of the page
+    (either a tag or a piece of text)"""
+
+    def setup(self, parent=Null, previous=Null):
+        """Sets up the initial relations between this element and
+        other elements."""
+        self.parent = parent
+        self.previous = previous
+        self.next = Null
+        self.previousSibling = Null
+        self.nextSibling = Null
+        if self.parent and self.parent.contents:
+            self.previousSibling = self.parent.contents[-1]
+            self.previousSibling.nextSibling = self
+
+    def findNext(self, name=None, attrs={}, text=None):
+        """Returns the first item that matches the given criteria and
+        appears after this Tag in the document."""
+        return self._first(self.fetchNext, name, attrs, text)
+    firstNext = findNext
+
+    def fetchNext(self, name=None, attrs={}, text=None, limit=None):
+        """Returns all items that match the given criteria and appear
+        after this Tag in the document."""
+        return self._fetch(name, attrs, text, limit, self.nextGenerator)
+
+    def findNextSibling(self, name=None, attrs={}, text=None):
+        """Returns the closest sibling to this Tag that matches the
+        given criteria and appears after this Tag in the document."""
+        return self._first(self.fetchNextSiblings, name, attrs, text)
+    firstNextSibling = findNextSibling
+
+    def fetchNextSiblings(self, name=None, attrs={}, text=None, limit=None):
+        """Returns the siblings of this Tag that match the given
+        criteria and appear after this Tag in the document."""
+        return self._fetch(name, attrs, text, limit, self.nextSiblingGenerator)
+
+    def findPrevious(self, name=None, attrs={}, text=None):
+        """Returns the first item that matches the given criteria and
+        appears before this Tag in the document."""
+        return self._first(self.fetchPrevious, name, attrs, text)
+
+    def fetchPrevious(self, name=None, attrs={}, text=None, limit=None):
+        """Returns all items that match the given criteria and appear
+        before this Tag in the document."""
+        return self._fetch(name, attrs, text, limit, self.previousGenerator)
+    firstPrevious = findPrevious
+
+    def findPreviousSibling(self, name=None, attrs={}, text=None):
+        """Returns the closest sibling to this Tag that matches the
+        given criteria and appears before this Tag in the document."""
+        return self._first(self.fetchPreviousSiblings, name, attrs, text)
+    firstPreviousSibling = findPreviousSibling
+
+    def fetchPreviousSiblings(self, name=None, attrs={}, text=None,
+                              limit=None):
+        """Returns the siblings of this Tag that match the given
+        criteria and appear before this Tag in the document."""
+        return self._fetch(name, attrs, text, limit,
+                           self.previousSiblingGenerator)
+
+    def findParent(self, name=None, attrs={}):
+        """Returns the closest parent of this Tag that matches the given
+        criteria."""
+        r = Null
+        l = self.fetchParents(name, attrs, 1)
+        if l:
+            r = l[0]
+        return r
+    firstParent = findParent
+
+    def fetchParents(self, name=None, attrs={}, limit=None):
+        """Returns the parents of this Tag that match the given
+        criteria."""
+        return self._fetch(name, attrs, None, limit, self.parentGenerator)
+
+    #These methods do the real heavy lifting.
+
+    def _first(self, method, name, attrs, text):
+        r = Null
+        l = method(name, attrs, text, 1)
+        if l:
+            r = l[0]
+        return r
+    
+    def _fetch(self, name, attrs, text, limit, generator):
+        "Iterates over a generator looking for things that match."
+        if not hasattr(attrs, 'items'):
+            attrs = {'class' : attrs}
+
+        results = []
+        g = generator()
+        while True:
+            try:
+                i = g.next()
+            except StopIteration:
+                break
+            found = None
+            if isinstance(i, Tag):
+                if not text:
+                    if not name or self._matches(i, name):
+                        match = True
+                        for attr, matchAgainst in attrs.items():
+                            check = i.get(attr)
+                            if not self._matches(check, matchAgainst):
+                                match = False
+                                break
+                        if match:
+                            found = i
+            elif text:
+                if self._matches(i, text):
+                    found = i                    
+            if found:
+                results.append(found)
+                if limit and len(results) >= limit:
+                    break
+        return results
+
+    #Generators that can be used to navigate starting from both
+    #NavigableTexts and Tags.                
+    def nextGenerator(self):
+        i = self
+        while i:
+            i = i.next
+            yield i
+
+    def nextSiblingGenerator(self):
+        i = self
+        while i:
+            i = i.nextSibling
+            yield i
+
+    def previousGenerator(self):
+        i = self
+        while i:
+            i = i.previous
+            yield i
+
+    def previousSiblingGenerator(self):
+        i = self
+        while i:
+            i = i.previousSibling
+            yield i
+
+    def parentGenerator(self):
+        i = self
+        while i:
+            i = i.parent
+            yield i
+
+    def _matches(self, chunk, howToMatch):
+        #print 'looking for %s in %s' % (howToMatch, chunk)
+        #
+        # If given a list of items, return true if the list contains a
+        # text element that matches.
+        if isList(chunk) and not isinstance(chunk, Tag):
+            for tag in chunk:
+                if isinstance(tag, NavigableText) and self._matches(tag, howToMatch):
+                    return True
+            return False
+        if callable(howToMatch):
+            return howToMatch(chunk)
+        if isinstance(chunk, Tag):
+            #Custom match methods take the tag as an argument, but all other
+            #ways of matching match the tag name as a string
+            chunk = chunk.name
+        #Now we know that chunk is a string
+        if not isinstance(chunk, basestring):
+            chunk = str(chunk)
+        if hasattr(howToMatch, 'match'):
+            # It's a regexp object.
+            return howToMatch.search(chunk)
+        if isList(howToMatch):
+            return chunk in howToMatch
+        if hasattr(howToMatch, 'items'):
+            return howToMatch.has_key(chunk)
+        #It's just a string
+        return str(howToMatch) == chunk
+
+class NavigableText(PageElement):
+
+    def __getattr__(self, attr):
+        "For backwards compatibility, text.string gives you text"
+        if attr == 'string':
+            return self
+        else:
+            raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__.__name__, attr)
+        
+class NavigableString(str, NavigableText):
+    pass
+
+class NavigableUnicodeString(unicode, NavigableText):
+    pass
+
+class Tag(PageElement):
+
+    """Represents a found HTML tag with its attributes and contents."""
+
+    def __init__(self, name, attrs=None, parent=Null, previous=Null):
+        "Basic constructor."
+        self.name = name
+        if attrs == None:
+            attrs = []
+        self.attrs = attrs
+        self.contents = []
+        self.setup(parent, previous)
+        self.hidden = False
+
+    def get(self, key, default=None):
+        """Returns the value of the 'key' attribute for the tag, or
+        the value given for 'default' if it doesn't have that
+        attribute."""
+        return self._getAttrMap().get(key, default)    
+
+    def __getitem__(self, key):
+        """tag[key] returns the value of the 'key' attribute for the tag,
+        and throws an exception if it's not there."""
+        return self._getAttrMap()[key]
+
+    def __iter__(self):
+        "Iterating over a tag iterates over its contents."
+        return iter(self.contents)
+
+    def __len__(self):
+        "The length of a tag is the length of its list of contents."
+        return len(self.contents)
+
+    def __contains__(self, x):
+        return x in self.contents
+
+    def __nonzero__(self):
+        "A tag is non-None even if it has no contents."
+        return True
+
+    def __setitem__(self, key, value):        
+        """Setting tag[key] sets the value of the 'key' attribute for the
+        tag."""
+        self._getAttrMap()
+        self.attrMap[key] = value
+        found = False
+        for i in range(0, len(self.attrs)):
+            if self.attrs[i][0] == key:
+                self.attrs[i] = (key, value)
+                found = True
+        if not found:
+            self.attrs.append((key, value))
+        self._getAttrMap()[key] = value
+
+    def __delitem__(self, key):
+        "Deleting tag[key] deletes all 'key' attributes for the tag."
+        for item in self.attrs:
+            if item[0] == key:
+                self.attrs.remove(item)
+                #We don't break because bad HTML can define the same
+                #attribute multiple times.
+            self._getAttrMap()
+            if self.attrMap.has_key(key):
+                del self.attrMap[key]
+
+    def __call__(self, *args, **kwargs):
+        """Calling a tag like a function is the same as calling its
+        fetch() method. Eg. tag('a') returns a list of all the A tags
+        found within this tag."""
+        return apply(self.fetch, args, kwargs)
+
+    def __getattr__(self, tag):
+        if len(tag) > 3 and tag.rfind('Tag') == len(tag)-3:
+            return self.first(tag[:-3])
+        elif tag.find('__') != 0:
+            return self.first(tag)
+
+    def __eq__(self, other):
+        """Returns true iff this tag has the same name, the same attributes,
+        and the same contents (recursively) as the given tag.
+
+        NOTE: right now this will return false if two tags have the
+        same attributes in a different order. Should this be fixed?"""
+        if not hasattr(other, 'name') or not hasattr(other, 'attrs') or not hasattr(other, 'contents') or self.name != other.name or self.attrs != other.attrs or len(self) != len(other):
+            return False
+        for i in range(0, len(self.contents)):
+            if self.contents[i] != other.contents[i]:
+                return False
+        return True
+
+    def __ne__(self, other):
+        """Returns true iff this tag is not identical to the other tag,
+        as defined in __eq__."""
+        return not self == other
+
+    def __repr__(self):
+        """Renders this tag as a string."""
+        return str(self)
+
+    def __unicode__(self):
+        return self.__str__(1)
+
+    def __str__(self, needUnicode=None, showStructureIndent=None):
+        """Returns a string or Unicode representation of this tag and
+        its contents.
+
+        NOTE: since Python's HTML parser consumes whitespace, this
+        method is not certain to reproduce the whitespace present in
+        the original string."""
+        
+        attrs = []
+        if self.attrs:
+            for key, val in self.attrs:
+                attrs.append('%s="%s"' % (key, val))
+        close = ''
+        closeTag = ''
+        if self.isSelfClosing():
+            close = ' /'
+        else:
+            closeTag = '</%s>' % self.name
+        indentIncrement = None        
+        if showStructureIndent != None:
+            indentIncrement = showStructureIndent
+            if not self.hidden:
+                indentIncrement += 1
+        contents = self.renderContents(indentIncrement, needUnicode=needUnicode)        
+        if showStructureIndent:
+            space = '\n%s' % (' ' * showStructureIndent)
+        if self.hidden:
+            s = contents
+        else:
+            s = []
+            attributeString = ''
+            if attrs:
+                attributeString = ' ' + ' '.join(attrs)            
+            if showStructureIndent:
+                s.append(space)
+            s.append('<%s%s%s>' % (self.name, attributeString, close))
+            s.append(contents)
+            if closeTag and showStructureIndent != None:
+                s.append(space)
+            s.append(closeTag)
+            s = ''.join(s)
+        isUnicode = type(s) == types.UnicodeType
+        if needUnicode and not isUnicode:
+            s = unicode(s)
+        elif isUnicode and needUnicode==False:
+            s = str(s)
+        return s
+
+    def prettify(self, needUnicode=None):
+        return self.__str__(needUnicode, showStructureIndent=True)
+
+    def renderContents(self, showStructureIndent=None, needUnicode=None):
+        """Renders the contents of this tag as a (possibly Unicode) 
+        string."""
+        s=[]
+        for c in self:
+            text = None
+            if isinstance(c, NavigableUnicodeString) or type(c) == types.UnicodeType:
+                text = unicode(c)
+            elif isinstance(c, Tag):
+                s.append(c.__str__(needUnicode, showStructureIndent))
+            elif needUnicode:
+                text = unicode(c)
+            else:
+                text = str(c)
+            if text:
+                if showStructureIndent != None:
+                    if text[-1] == '\n':
+                        text = text[:-1]
+                s.append(text)
+        return ''.join(s)    
+
+    #Soup methods
+
+    def firstText(self, text, recursive=True):
+        """Convenience method to retrieve the first piece of text matching the
+        given criteria. 'text' can be a string, a regular expression object,
+        a callable that takes a string and returns whether or not the
+        string 'matches', etc."""
+        return self.first(recursive=recursive, text=text)
+
+    def fetchText(self, text, recursive=True, limit=None):
+        """Convenience method to retrieve all pieces of text matching the
+        given criteria. 'text' can be a string, a regular expression object,
+        a callable that takes a string and returns whether or not the
+        string 'matches', etc."""
+        return self.fetch(recursive=recursive, text=text, limit=limit)
+
+    def first(self, name=None, attrs={}, recursive=True, text=None):
+        """Return only the first child of this
+        Tag matching the given criteria."""
+        r = Null
+        l = self.fetch(name, attrs, recursive, text, 1)
+        if l:
+            r = l[0]
+        return r
+    findChild = first
+
+    def fetch(self, name=None, attrs={}, recursive=True, text=None,
+              limit=None):
+        """Extracts a list of Tag objects that match the given
+        criteria.  You can specify the name of the Tag and any
+        attributes you want the Tag to have.
+
+        The value of a key-value pair in the 'attrs' map can be a
+        string, a list of strings, a regular expression object, or a
+        callable that takes a string and returns whether or not the
+        string matches for some custom definition of 'matches'. The
+        same is true of the tag name."""
+        generator = self.recursiveChildGenerator
+        if not recursive:
+            generator = self.childGenerator
+        return self._fetch(name, attrs, text, limit, generator)
+    fetchChildren = fetch
+    
+    #Utility methods
+
+    def isSelfClosing(self):
+        """Returns true iff this is a self-closing tag as defined in the HTML
+        standard.
+
+        TODO: This is specific to BeautifulSoup and its subclasses, but it's
+        used by __str__"""
+        return self.name in BeautifulSoup.SELF_CLOSING_TAGS
+
+    def append(self, tag):
+        """Appends the given tag to the contents of this tag."""
+        self.contents.append(tag)
+
+    #Private methods
+
+    def _getAttrMap(self):
+        """Initializes a map representation of this tag's attributes,
+        if not already initialized."""
+        if not getattr(self, 'attrMap'):
+            self.attrMap = {}
+            for (key, value) in self.attrs:
+                self.attrMap[key] = value 
+        return self.attrMap
+
+    #Generator methods
+    def childGenerator(self):
+        for i in range(0, len(self.contents)):
+            yield self.contents[i]
+        raise StopIteration
+    
+    def recursiveChildGenerator(self):
+        stack = [(self, 0)]
+        while stack:
+            tag, start = stack.pop()
+            if isinstance(tag, Tag):            
+                for i in range(start, len(tag.contents)):
+                    a = tag.contents[i]
+                    yield a
+                    if isinstance(a, Tag) and tag.contents:
+                        if i < len(tag.contents) - 1:
+                            stack.append((tag, i+1))
+                        stack.append((a, 0))
+                        break
+        raise StopIteration
+
+
+def isList(l):
+    """Convenience method that works with all 2.x versions of Python
+    to determine whether or not something is listlike."""
+    return hasattr(l, '__iter__') \
+           or (type(l) in (types.ListType, types.TupleType))
+
+def buildTagMap(default, *args):
+    """Turns a list of maps, lists, or scalars into a single map.
+    Used to build the SELF_CLOSING_TAGS and NESTABLE_TAGS maps out
+    of lists and partial maps."""
+    built = {}
+    for portion in args:
+        if hasattr(portion, 'items'):
+            #It's a map. Merge it.
+            for k,v in portion.items():
+                built[k] = v
+        elif isList(portion):
+            #It's a list. Map each item to the default.
+            for k in portion:
+                built[k] = default
+        else:
+            #It's a scalar. Map it to the default.
+            built[portion] = default
+    return built
+
+class BeautifulStoneSoup(Tag, SGMLParser):
+
+    """This class contains the basic parser and fetch code. It defines
+    a parser that knows nothing about tag behavior except for the
+    following:
+   
+      You can't close a tag without closing all the tags it encloses.
+      That is, "<foo><bar></foo>" actually means
+      "<foo><bar></bar></foo>".
+
+    [Another possible explanation is "<foo><bar /></foo>", but since
+    this class defines no SELF_CLOSING_TAGS, it will never use that
+    explanation.]
+
+    This class is useful for parsing XML or made-up markup languages,
+    or when BeautifulSoup makes an assumption counter to what you were
+    expecting."""
+
+    SELF_CLOSING_TAGS = {}
+    NESTABLE_TAGS = {}
+    RESET_NESTING_TAGS = {}
+    QUOTE_TAGS = {}
+
+    #As a public service we will by default silently replace MS smart quotes
+    #and similar characters with their HTML or ASCII equivalents.
+    MS_CHARS = { '\x80' : '&euro;',
+                 '\x81' : ' ',
+                 '\x82' : '&sbquo;',
+                 '\x83' : '&fnof;',
+                 '\x84' : '&bdquo;',
+                 '\x85' : '&hellip;',
+                 '\x86' : '&dagger;',
+                 '\x87' : '&Dagger;',
+                 '\x88' : '&caret;',
+                 '\x89' : '%',
+                 '\x8A' : '&Scaron;',
+                 '\x8B' : '&lt;',
+                 '\x8C' : '&OElig;',
+                 '\x8D' : '?',
+                 '\x8E' : 'Z',
+                 '\x8F' : '?',
+                 '\x90' : '?',
+                 '\x91' : '&lsquo;',
+                 '\x92' : '&rsquo;',
+                 '\x93' : '&ldquo;',
+                 '\x94' : '&rdquo;',
+                 '\x95' : '&bull;',
+                 '\x96' : '&ndash;',
+                 '\x97' : '&mdash;',
+                 '\x98' : '&tilde;',
+                 '\x99' : '&trade;',
+                 '\x9a' : '&scaron;',
+                 '\x9b' : '&gt;',
+                 '\x9c' : '&oelig;',
+                 '\x9d' : '?',
+                 '\x9e' : 'z',
+                 '\x9f' : '&Yuml;',}
+
+    PARSER_MASSAGE = [(re.compile('(<[^<>]*)/>'),
+                       lambda(x):x.group(1) + ' />'),
+                      (re.compile('<!\s+([^<>]*)>'),
+                       lambda(x):'<!' + x.group(1) + '>'),
+                      (re.compile("([\x80-\x9f])"),
+                       lambda(x): BeautifulStoneSoup.MS_CHARS.get(x.group(1)))
+                      ]
+
+    ROOT_TAG_NAME = '[document]'
+
+    def __init__(self, text=None, avoidParserProblems=True,
+                 initialTextIsEverything=True):
+        """Initialize this as the 'root tag' and feed in any text to
+        the parser.
+
+        NOTE about avoidParserProblems: sgmllib will process most bad
+        HTML, and BeautifulSoup has tricks for dealing with some HTML
+        that kills sgmllib, but Beautiful Soup can nonetheless choke
+        or lose data if your data uses self-closing tags or
+        declarations incorrectly. By default, Beautiful Soup sanitizes
+        its input to avoid the vast majority of these problems. The
+        problems are relatively rare, even in bad HTML, so feel free
+        to pass in False to avoidParserProblems if they don't apply to
+        you, and you'll get better performance. The only reason I have
+        this turned on by default is so I don't get so many tech
+        support questions.
+
+        The two most common instances of invalid HTML that will choke
+        sgmllib are fixed by the default parser massage techniques:
+
+         <br/> (No space between name of closing tag and tag close)
+         <! --Comment--> (Extraneous whitespace in declaration)
+
+        You can pass in a custom list of (RE object, replace method)
+        tuples to get Beautiful Soup to scrub your input the way you
+        want."""
+        Tag.__init__(self, self.ROOT_TAG_NAME)
+        if avoidParserProblems \
+           and not isList(avoidParserProblems):
+            avoidParserProblems = self.PARSER_MASSAGE            
+        self.avoidParserProblems = avoidParserProblems
+        SGMLParser.__init__(self)
+        self.quoteStack = []
+        self.hidden = 1
+        self.reset()
+        if hasattr(text, 'read'):
+            #It's a file-type object.
+            text = text.read()
+        if text:
+            self.feed(text)
+        if initialTextIsEverything:
+            self.done()
+
+    def __getattr__(self, methodName):
+        """This method routes method call requests to either the SGMLParser
+        superclass or the Tag superclass, depending on the method name."""
+        if methodName.find('start_') == 0 or methodName.find('end_') == 0 \
+               or methodName.find('do_') == 0:
+            return SGMLParser.__getattr__(self, methodName)
+        elif methodName.find('__') != 0:
+            return Tag.__getattr__(self, methodName)
+        else:
+            raise AttributeError
+
+    def feed(self, text):
+        if self.avoidParserProblems:
+            for fix, m in self.avoidParserProblems:
+                text = fix.sub(m, text)
+        SGMLParser.feed(self, text)
+
+    def done(self):
+        """Called when you're done parsing, so that the unclosed tags can be
+        correctly processed."""
+        self.endData() #NEW
+        while self.currentTag.name != self.ROOT_TAG_NAME:
+            self.popTag()
+            
+    def reset(self):
+        SGMLParser.reset(self)
+        self.currentData = []
+        self.currentTag = None
+        self.tagStack = []
+        self.pushTag(self)        
+    
+    def popTag(self):
+        tag = self.tagStack.pop()
+        # Tags with just one string-owning child get the child as a
+        # 'string' property, so that soup.tag.string is shorthand for
+        # soup.tag.contents[0]
+        if len(self.currentTag.contents) == 1 and \
+           isinstance(self.currentTag.contents[0], NavigableText):
+            self.currentTag.string = self.currentTag.contents[0]
+
+        #print "Pop", tag.name
+        if self.tagStack:
+            self.currentTag = self.tagStack[-1]
+        return self.currentTag
+
+    def pushTag(self, tag):
+        #print "Push", tag.name
+        if self.currentTag:
+            self.currentTag.append(tag)
+        self.tagStack.append(tag)
+        self.currentTag = self.tagStack[-1]
+
+    def endData(self):
+        currentData = ''.join(self.currentData)
+        if currentData:
+            if not currentData.strip():
+                if '\n' in currentData:
+                    currentData = '\n'
+                else:
+                    currentData = ' '
+            c = NavigableString
+            if type(currentData) == types.UnicodeType:
+                c = NavigableUnicodeString
+            o = c(currentData)
+            o.setup(self.currentTag, self.previous)
+            if self.previous:
+                self.previous.next = o
+            self.previous = o
+            self.currentTag.contents.append(o)
+        self.currentData = []
+
+    def _popToTag(self, name, inclusivePop=True):
+        """Pops the tag stack up to and including the most recent
+        instance of the given tag. If inclusivePop is false, pops the tag
+        stack up to but *not* including the most recent instance of
+        the given tag."""
+        if name == self.ROOT_TAG_NAME:
+            return            
+
+        numPops = 0
+        mostRecentTag = None
+        for i in range(len(self.tagStack)-1, 0, -1):
+            if name == self.tagStack[i].name:
+                numPops = len(self.tagStack)-i
+                break
+        if not inclusivePop:
+            numPops = numPops - 1
+
+        for i in range(0, numPops):
+            mostRecentTag = self.popTag()
+        return mostRecentTag    
+
+    def _smartPop(self, name):
+
+        """We need to pop up to the previous tag of this type, unless
+        one of this tag's nesting reset triggers comes between this
+        tag and the previous tag of this type, OR unless this tag is a
+        generic nesting trigger and another generic nesting trigger
+        comes between this tag and the previous tag of this type.
+
+        Examples:
+         <p>Foo<b>Bar<p> should pop to 'p', not 'b'.
+         <p>Foo<table>Bar<p> should pop to 'table', not 'p'.
+         <p>Foo<table><tr>Bar<p> should pop to 'tr', not 'p'.
+         <p>Foo<b>Bar<p> should pop to 'p', not 'b'.
+
+         <li><ul><li> *<li>* should pop to 'ul', not the first 'li'.
+         <tr><table><tr> *<tr>* should pop to 'table', not the first 'tr'
+         <td><tr><td> *<td>* should pop to 'tr', not the first 'td'
+        """
+
+        nestingResetTriggers = self.NESTABLE_TAGS.get(name)
+        isNestable = nestingResetTriggers != None
+        isResetNesting = self.RESET_NESTING_TAGS.has_key(name)
+        popTo = None
+        inclusive = True
+        for i in range(len(self.tagStack)-1, 0, -1):
+            p = self.tagStack[i]
+            if (not p or p.name == name) and not isNestable:
+                #Non-nestable tags get popped to the top or to their
+                #last occurrence.
+                popTo = name
+                break
+            if (nestingResetTriggers != None
+                and p.name in nestingResetTriggers) \
+                or (nestingResetTriggers == None and isResetNesting
+                    and self.RESET_NESTING_TAGS.has_key(p.name)):
+                
+                #If we encounter one of the nesting reset triggers
+                #peculiar to this tag, or we encounter another tag
+                #that causes nesting to reset, pop up to but not
+                #including that tag.
+
+                popTo = p.name
+                inclusive = False
+                break
+            p = p.parent
+        if popTo:
+            self._popToTag(popTo, inclusive)
+
+    def unknown_starttag(self, name, attrs, selfClosing=0):
+        #print "Start tag %s" % name
+        if self.quoteStack:
+            #This is not a real tag.
+            #print "<%s> is not real!" % name
+            attrs = ''.join(map(lambda(x, y): ' %s="%s"' % (x, y), attrs))
+            self.handle_data('<%s%s>' % (name, attrs))
+            return
+        self.endData()
+        if not name in self.SELF_CLOSING_TAGS and not selfClosing:
+            self._smartPop(name)
+        tag = Tag(name, attrs, self.currentTag, self.previous)        
+        if self.previous:
+            self.previous.next = tag
+        self.previous = tag
+        self.pushTag(tag)
+        if selfClosing or name in self.SELF_CLOSING_TAGS:
+            self.popTag()                
+        if name in self.QUOTE_TAGS:
+            #print "Beginning quote (%s)" % name
+            self.quoteStack.append(name)
+            self.literal = 1
+
+    def unknown_endtag(self, name):
+        if self.quoteStack and self.quoteStack[-1] != name:
+            #This is not a real end tag.
+            #print "</%s> is not real!" % name
+            self.handle_data('</%s>' % name)
+            return
+        self.endData()
+        self._popToTag(name)
+        if self.quoteStack and self.quoteStack[-1] == name:
+            self.quoteStack.pop()
+            self.literal = (len(self.quoteStack) > 0)
+
+    def handle_data(self, data):
+        self.currentData.append(data)
+
+    def handle_pi(self, text):
+        "Propagate processing instructions right through."
+        self.handle_data("<?%s>" % text)
+
+    def handle_comment(self, text):
+        "Propagate comments right through."
+        self.handle_data("<!--%s-->" % text)
+
+    def handle_charref(self, ref):
+        "Propagate char refs right through."
+        self.handle_data('&#%s;' % ref)
+
+    def handle_entityref(self, ref):
+        "Propagate entity refs right through."
+        self.handle_data('&%s;' % ref)
+        
+    def handle_decl(self, data):
+        "Propagate DOCTYPEs and the like right through."
+        self.handle_data('<!%s>' % data)
+
+    def parse_declaration(self, i):
+        """Treat a bogus SGML declaration as raw data. Treat a CDATA
+        declaration as regular data."""
+        j = None
+        if self.rawdata[i:i+9] == '<![CDATA[':
+             k = self.rawdata.find(']]>', i)
+             if k == -1:
+                 k = len(self.rawdata)
+             self.handle_data(self.rawdata[i+9:k])
+             j = k+3
+        else:
+            try:
+                j = SGMLParser.parse_declaration(self, i)
+            except SGMLParseError:
+                toHandle = self.rawdata[i:]
+                self.handle_data(toHandle)
+                j = i + len(toHandle)
+        return j
+
+class BeautifulSoup(BeautifulStoneSoup):
+
+    """This parser knows the following facts about HTML:
+
+    * Some tags have no closing tag and should be interpreted as being
+      closed as soon as they are encountered.
+
+    * The text inside some tags (ie. 'script') may contain tags which
+      are not really part of the document and which should be parsed
+      as text, not tags. If you want to parse the text as tags, you can
+      always fetch it and parse it explicitly.
+
+    * Tag nesting rules:
+
+      Most tags can't be nested at all. For instance, the occurrence of
+      a <p> tag should implicitly close the previous <p> tag.
+
+       <p>Para1<p>Para2
+        should be transformed into:
+       <p>Para1</p><p>Para2
+
+      Some tags can be nested arbitrarily. For instance, the occurrence
+      of a <blockquote> tag should _not_ implicitly close the previous
+      <blockquote> tag.
+
+       Alice said: <blockquote>Bob said: <blockquote>Blah
+        should NOT be transformed into:
+       Alice said: <blockquote>Bob said: </blockquote><blockquote>Blah
+
+      Some tags can be nested, but the nesting is reset by the
+      interposition of other tags. For instance, a <tr> tag should
+      implicitly close the previous <tr> tag within the same <table>,
+      but not close a <tr> tag in another table.
+
+       <table><tr>Blah<tr>Blah
+        should be transformed into:
+       <table><tr>Blah</tr><tr>Blah
+        but,
+       <tr>Blah<table><tr>Blah
+        should NOT be transformed into
+       <tr>Blah<table></tr><tr>Blah
+
+    Differing assumptions about tag nesting rules are a major source
+    of problems with the BeautifulSoup class. If BeautifulSoup is not
+    treating as nestable a tag your page author treats as nestable,
+    try ICantBelieveItsBeautifulSoup before writing your own
+    subclass."""
+
+    SELF_CLOSING_TAGS = buildTagMap(None, ['br' , 'hr', 'input', 'img', 'meta',
+                                           'spacer', 'link', 'frame', 'base'])
+
+    QUOTE_TAGS = {'script': None}
+    
+    #According to the HTML standard, each of these inline tags can
+    #contain another tag of the same type. Furthermore, it's common
+    #to actually use these tags this way.
+    NESTABLE_INLINE_TAGS = ['span', 'font', 'q', 'object', 'bdo', 'sub', 'sup',
+                            'center']
+
+    #According to the HTML standard, these block tags can contain
+    #another tag of the same type. Furthermore, it's common
+    #to actually use these tags this way.
+    NESTABLE_BLOCK_TAGS = ['blockquote', 'div', 'fieldset', 'ins', 'del']
+
+    #Lists can contain other lists, but there are restrictions.    
+    NESTABLE_LIST_TAGS = { 'ol' : [],
+                           'ul' : [],
+                           'li' : ['ul', 'ol'],
+                           'dl' : [],
+                           'dd' : ['dl'],
+                           'dt' : ['dl'] }
+
+    #Tables can contain other tables, but there are restrictions.    
+    NESTABLE_TABLE_TAGS = {'table' : [], 
+                           'tr' : ['table', 'tbody', 'tfoot', 'thead'],
+                           'td' : ['tr'],
+                           'th' : ['tr'],
+                           }
+
+    NON_NESTABLE_BLOCK_TAGS = ['address', 'form', 'p', 'pre']
+
+    #If one of these tags is encountered, all tags up to the next tag of
+    #this type are popped.
+    RESET_NESTING_TAGS = buildTagMap(None, NESTABLE_BLOCK_TAGS, 'noscript',
+                                     NON_NESTABLE_BLOCK_TAGS,
+                                     NESTABLE_LIST_TAGS,
+                                     NESTABLE_TABLE_TAGS)
+
+    NESTABLE_TAGS = buildTagMap([], NESTABLE_INLINE_TAGS, NESTABLE_BLOCK_TAGS,
+                                NESTABLE_LIST_TAGS, NESTABLE_TABLE_TAGS)
+    
+class ICantBelieveItsBeautifulSoup(BeautifulSoup):
+
+    """The BeautifulSoup class is oriented towards skipping over
+    common HTML errors like unclosed tags. However, sometimes it makes
+    errors of its own. For instance, consider this fragment:
+
+     <b>Foo<b>Bar</b></b>
+
+    This is perfectly valid (if bizarre) HTML. However, the
+    BeautifulSoup class will implicitly close the first b tag when it
+    encounters the second 'b'. It will think the author wrote
+    "<b>Foo<b>Bar", and didn't close the first 'b' tag, because
+    there's no real-world reason to bold something that's already
+    bold. When it encounters '</b></b>' it will close two more 'b'
+    tags, for a grand total of three tags closed instead of two. This
+    can throw off the rest of your document structure. The same is
+    true of a number of other tags, listed below.
+
+    It's much more common for someone to forget to close (eg.) a 'b'
+    tag than to actually use nested 'b' tags, and the BeautifulSoup
+    class handles the common case. This class handles the
+    not-so-common case: where you can't believe someone wrote what
+    they did, but it's valid HTML and BeautifulSoup screwed up by
+    assuming it wouldn't be.
+
+    If this doesn't do what you need, try subclassing this class or
+    BeautifulSoup, and providing your own list of NESTABLE_TAGS."""
+
+    I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS = \
+     ['em', 'big', 'i', 'small', 'tt', 'abbr', 'acronym', 'strong',
+      'cite', 'code', 'dfn', 'kbd', 'samp', 'strong', 'var', 'b',
+      'big']
+
+    I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS = ['noscript']
+
+    NESTABLE_TAGS = buildTagMap([], BeautifulSoup.NESTABLE_TAGS,
+                                I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS,
+                                I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS)
+
+class BeautifulSOAP(BeautifulStoneSoup):
+    """This class will push a tag with only a single string child into
+    the tag's parent as an attribute. The attribute's name is the tag
+    name, and the value is the string child. An example should give
+    the flavor of the change:
+
+    <foo><bar>baz</bar></foo>
+     =>
+    <foo bar="baz"><bar>baz</bar></foo>
+
+    You can then access fooTag['bar'] instead of fooTag.barTag.string.
+
+    This is, of course, useful for scraping structures that tend to
+    use subelements instead of attributes, such as SOAP messages. Note
+    that it modifies its input, so don't print the modified version
+    out.
+
+    I'm not sure how many people really want to use this class; let me
+    know if you do. Mainly I like the name."""
+
+    def popTag(self):
+        if len(self.tagStack) > 1:
+            tag = self.tagStack[-1]
+            parent = self.tagStack[-2]
+            parent._getAttrMap()
+            if (isinstance(tag, Tag) and len(tag.contents) == 1 and
+                isinstance(tag.contents[0], NavigableText) and 
+                not parent.attrMap.has_key(tag.name)):
+                parent[tag.name] = tag.contents[0]
+        BeautifulStoneSoup.popTag(self)
+
+#Enterprise class names! It has come to our attention that some people
+#think the names of the Beautiful Soup parser classes are too silly
+#and "unprofessional" for use in enterprise screen-scraping. We feel
+#your pain! For such-minded folk, the Beautiful Soup Consortium And
+#All-Night Kosher Bakery recommends renaming this file to
+#"RobustParser.py" (or, in cases of extreme enterprisitude,
+#"RobustParserBeanInterface.class") and using the following
+#enterprise-friendly class aliases:
+class RobustXMLParser(BeautifulStoneSoup):
+    pass
+class RobustHTMLParser(BeautifulSoup):
+    pass
+class RobustWackAssHTMLParser(ICantBelieveItsBeautifulSoup):
+    pass
+class SimplifyingSOAPParser(BeautifulSOAP):
+    pass
+
+###
+
+
+#By default, act as an HTML pretty-printer.
+if __name__ == '__main__':
+    import sys
+    soup = BeautifulStoneSoup(sys.stdin.read())
+    print soup.prettify()


Property changes on: Zope3/branches/adamg-mechanize-update/src/mechanize/_beautifulsoup.py
___________________________________________________________________
Name: svn:keywords
   + Date Author Id Revision
Name: svn:eol-style
   + native
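
_beautifulsoup.py vendors Beautiful Soup 2.1.1, whose 2.x API (first/fetch
rather than the later find/findAll names) is what the Robust* factories in
_html.py rely on for error-tolerant parsing. An illustrative parse with that
vendored API (the HTML string is made up):

    from mechanize._beautifulsoup import BeautifulSoup

    soup = BeautifulSoup('<p>Para1<p>Para2<br/><b>bold</b>')
    print len(soup.fetch('p'))                # 2: the unclosed <p> is repaired
    print soup.first('b').renderContents()    # bold
    print soup.prettify()                     # indented rendering of the tree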

Modified: Zope3/branches/adamg-mechanize-update/src/mechanize/_clientcookie.py
===================================================================
--- Zope3/branches/adamg-mechanize-update/src/mechanize/_clientcookie.py	2007-07-13 11:38:53 UTC (rev 77855)
+++ Zope3/branches/adamg-mechanize-update/src/mechanize/_clientcookie.py	2007-07-13 13:21:22 UTC (rev 77856)
@@ -1,4 +1,4 @@
-"""HTTP cookie handling for web clients, plus some other stuff.
+"""HTTP cookie handling for web clients.
 
 This module originally developed from my port of Gisle Aas' Perl module
 HTTP::Cookies, from the libwww-perl library.
@@ -32,7 +32,7 @@
 
 """
 
-import sys, re, urlparse, string, copy, time, struct, urllib, types, logging
+import sys, re, copy, time, struct, urllib, types, logging
 try:
     import threading
     _threading = threading; del threading
@@ -46,7 +46,8 @@
 DEFAULT_HTTP_PORT = str(httplib.HTTP_PORT)
 
 from _headersutil import split_header_words, parse_ns_headers
-from _util import startswith, endswith, isstringlike, getheaders
+from _util import isstringlike
+import _rfc3986
 
 debug = logging.getLogger("mechanize.cookies").debug
 
@@ -105,17 +106,17 @@
     """
     # Note that, if A or B are IP addresses, the only relevant part of the
     # definition of the domain-match algorithm is the direct string-compare.
-    A = string.lower(A)
-    B = string.lower(B)
+    A = A.lower()
+    B = B.lower()
     if A == B:
         return True
     if not is_HDN(A):
         return False
-    i = string.rfind(A, B)
+    i = A.rfind(B)
     has_form_nb = not (i == -1 or i == 0)
     return (
         has_form_nb and
-        startswith(B, ".") and
+        B.startswith(".") and
         is_HDN(B[1:])
         )
 
@@ -133,15 +134,15 @@
     A and B may be host domain names or IP addresses.
 
     """
-    A = string.lower(A)
-    B = string.lower(B)
+    A = A.lower()
+    B = B.lower()
     if not (liberal_is_HDN(A) and liberal_is_HDN(B)):
         if A == B:
             # equal IP addresses
             return True
         return False
-    initial_dot = startswith(B, ".")
-    if initial_dot and endswith(A, B):
+    initial_dot = B.startswith(".")
+    if initial_dot and A.endswith(B):
         return True
     if not initial_dot and A == B:
         return True
@@ -156,13 +157,13 @@
 
     """
     url = request.get_full_url()
-    host = urlparse.urlparse(url)[1]
-    if host == "":
+    host = _rfc3986.urlsplit(url)[1]
+    if host is None:
         host = request.get_header("Host", "")
 
     # remove port, if present
     host = cut_port_re.sub("", host, 1)
-    return string.lower(host)
+    return host.lower()
 
 def eff_request_host(request):
     """Return a tuple (request-host, effective request-host name).
@@ -171,28 +172,23 @@
 
     """
     erhn = req_host = request_host(request)
-    if string.find(req_host, ".") == -1 and not IPV4_RE.search(req_host):
+    if req_host.find(".") == -1 and not IPV4_RE.search(req_host):
         erhn = req_host + ".local"
     return req_host, erhn
 
 def request_path(request):
     """request-URI, as defined by RFC 2965."""
     url = request.get_full_url()
-    #scheme, netloc, path, parameters, query, frag = urlparse.urlparse(url)
-    #req_path = escape_path(string.join(urlparse.urlparse(url)[2:], ""))
-    path, parameters, query, frag = urlparse.urlparse(url)[2:]
-    if parameters:
-        path = "%s;%s" % (path, parameters)
+    path, query, frag = _rfc3986.urlsplit(url)[2:]
     path = escape_path(path)
-    req_path = urlparse.urlunparse(("", "", path, "", query, frag))
-    if not startswith(req_path, "/"):
-        # fix bad RFC 2396 absoluteURI
+    req_path = _rfc3986.urlunsplit((None, None, path, query, frag))
+    if not req_path.startswith("/"):
         req_path = "/"+req_path
     return req_path
 
 def request_port(request):
     host = request.get_host()
-    i = string.find(host, ':')
+    i = host.find(':')
     if i >= 0:
         port = host[i+1:]
         try:
@@ -209,7 +205,7 @@
 HTTP_PATH_SAFE = "%/;:@&=+$,!~*'()"
 ESCAPED_CHAR_RE = re.compile(r"%([0-9a-fA-F][0-9a-fA-F])")
 def uppercase_escaped_char(match):
-    return "%%%s" % string.upper(match.group(1))
+    return "%%%s" % match.group(1).upper()
 def escape_path(path):
     """Escape any invalid characters in HTTP URL, and uppercase all escapes."""
     # There's no knowing what character encoding was used to create URLs
@@ -252,11 +248,11 @@
     '.local'
 
     """
-    i = string.find(h, ".")
+    i = h.find(".")
     if i >= 0:
         #a = h[:i]  # this line is only here to show what a is
         b = h[i+1:]
-        i = string.find(b, ".")
+        i = b.find(".")
         if is_HDN(h) and (i >= 0 or b == "local"):
             return "."+b
     return h
@@ -344,7 +340,7 @@
         self.port = port
         self.port_specified = port_specified
         # normalise case, as per RFC 2965 section 3.3.3
-        self.domain = string.lower(domain)
+        self.domain = domain.lower()
         self.domain_specified = domain_specified
         # Sigh.  We need to know whether the domain given in the
         # cookie-attribute had an initial dot, in order to follow RFC 2965
@@ -397,7 +393,7 @@
             args.append("%s=%s" % (name, repr(attr)))
         args.append("rest=%s" % repr(self._rest))
         args.append("rfc2109=%s" % repr(self.rfc2109))
-        return "Cookie(%s)" % string.join(args, ", ")
+        return "Cookie(%s)" % ", ".join(args)
 
 
 class CookiePolicy:
@@ -701,7 +697,7 @@
         # Try and stop servers setting V0 cookies designed to hack other
         # servers that know both V0 and V1 protocols.
         if (cookie.version == 0 and self.strict_ns_set_initial_dollar and
-            startswith(cookie.name, "$")):
+            cookie.name.startswith("$")):
             debug("   illegal name (starts with '$'): '%s'", cookie.name)
             return False
         return True
@@ -711,7 +707,7 @@
             req_path = request_path(request)
             if ((cookie.version > 0 or
                  (cookie.version == 0 and self.strict_ns_set_path)) and
-                not startswith(req_path, cookie.path)):
+                not req_path.startswith(cookie.path)):
                 debug("   path attribute %s is not a prefix of request "
                       "path %s", cookie.path, req_path)
                 return False
@@ -728,12 +724,12 @@
             domain = cookie.domain
             # since domain was specified, we know that:
             assert domain.startswith(".")
-            if string.count(domain, ".") == 2:
+            if domain.count(".") == 2:
                 # domain like .foo.bar
-                i = string.rfind(domain, ".")
+                i = domain.rfind(".")
                 tld = domain[i+1:]
                 sld = domain[1:i]
-                if (string.lower(sld) in [
+                if (sld.lower() in [
                     "co", "ac",
                     "com", "edu", "org", "net", "gov", "mil", "int",
                     "aero", "biz", "cat", "coop", "info", "jobs", "mobi",
@@ -757,19 +753,19 @@
         if cookie.domain_specified:
             req_host, erhn = eff_request_host(request)
             domain = cookie.domain
-            if startswith(domain, "."):
+            if domain.startswith("."):
                 undotted_domain = domain[1:]
             else:
                 undotted_domain = domain
-            embedded_dots = (string.find(undotted_domain, ".") >= 0)
+            embedded_dots = (undotted_domain.find(".") >= 0)
             if not embedded_dots and domain != ".local":
                 debug("   non-local domain %s contains no embedded dot",
                       domain)
                 return False
             if cookie.version == 0:
-                if (not endswith(erhn, domain) and
-                    (not startswith(erhn, ".") and
-                     not endswith("."+erhn, domain))):
+                if (not erhn.endswith(domain) and
+                    (not erhn.startswith(".") and
+                     not ("."+erhn).endswith(domain))):
                     debug("   effective request-host %s (even with added "
                           "initial dot) does not end end with %s",
                           erhn, domain)
@@ -783,7 +779,7 @@
             if (cookie.version > 0 or
                 (self.strict_ns_domain & self.DomainStrictNoDots)):
                 host_prefix = req_host[:-len(domain)]
-                if (string.find(host_prefix, ".") >= 0 and
+                if (host_prefix.find(".") >= 0 and
                     not IPV4_RE.search(req_host)):
                     debug("   host prefix %s for domain %s contains a dot",
                           host_prefix, domain)
@@ -797,7 +793,7 @@
                 req_port = "80"
             else:
                 req_port = str(req_port)
-            for p in string.split(cookie.port, ","):
+            for p in cookie.port.split(","):
                 try:
                     int(p)
                 except ValueError:
@@ -867,7 +863,7 @@
             req_port = request_port(request)
             if req_port is None:
                 req_port = "80"
-            for p in string.split(cookie.port, ","):
+            for p in cookie.port.split(","):
                 if p == req_port:
                     break
             else:
@@ -892,7 +888,7 @@
             debug("   effective request-host name %s does not domain-match "
                   "RFC 2965 cookie domain %s", erhn, domain)
             return False
-        if cookie.version == 0 and not endswith("."+erhn, domain):
+        if cookie.version == 0 and not ("."+erhn).endswith(domain):
             debug("   request-host %s does not match Netscape cookie domain "
                   "%s", req_host, domain)
             return False
@@ -905,12 +901,12 @@
         # Munge req_host and erhn to always start with a dot, so as to err on
         # the side of letting cookies through.
         dotted_req_host, dotted_erhn = eff_request_host(request)
-        if not startswith(dotted_req_host, "."):
+        if not dotted_req_host.startswith("."):
             dotted_req_host = "."+dotted_req_host
-        if not startswith(dotted_erhn, "."):
+        if not dotted_erhn.startswith("."):
             dotted_erhn = "."+dotted_erhn
-        if not (endswith(dotted_req_host, domain) or
-                endswith(dotted_erhn, domain)):
+        if not (dotted_req_host.endswith(domain) or
+                dotted_erhn.endswith(domain)):
             #debug("   request domain %s does not match cookie domain %s",
             #      req_host, domain)
             return False
@@ -927,7 +923,7 @@
     def path_return_ok(self, path, request):
         debug("- checking cookie path=%s", path)
         req_path = request_path(request)
-        if not startswith(req_path, path):
+        if not req_path.startswith(path):
             debug("  %s does not path-match %s", req_path, path)
             return False
         return True
@@ -1096,10 +1092,10 @@
             if version > 0:
                 if cookie.path_specified:
                     attrs.append('$Path="%s"' % cookie.path)
-                if startswith(cookie.domain, "."):
+                if cookie.domain.startswith("."):
                     domain = cookie.domain
                     if (not cookie.domain_initial_dot and
-                        startswith(domain, ".")):
+                        domain.startswith(".")):
                         domain = domain[1:]
                     attrs.append('$Domain="%s"' % domain)
                 if cookie.port is not None:
@@ -1137,8 +1133,7 @@
         attrs = self._cookie_attrs(cookies)
         if attrs:
             if not request.has_header("Cookie"):
-                request.add_unredirected_header(
-                    "Cookie", string.join(attrs, "; "))
+                request.add_unredirected_header("Cookie", "; ".join(attrs))
 
         # if necessary, advertise that we know RFC 2965
         if self._policy.rfc2965 and not self._policy.hide_cookie2:
@@ -1188,7 +1183,7 @@
             standard = {}
             rest = {}
             for k, v in cookie_attrs[1:]:
-                lc = string.lower(k)
+                lc = k.lower()
                 # don't lose case distinction for unknown fields
                 if lc in value_attrs or lc in boolean_attrs:
                     k = lc
@@ -1205,7 +1200,7 @@
                         bad_cookie = True
                         break
                     # RFC 2965 section 3.3.3
-                    v = string.lower(v)
+                    v = v.lower()
                 if k == "expires":
                     if max_age_set:
                         # Prefer max-age to expires (like Mozilla)
@@ -1272,7 +1267,7 @@
         else:
             path_specified = False
             path = request_path(request)
-            i = string.rfind(path, "/")
+            i = path.rfind("/")
             if i != -1:
                 if version == 0:
                     # Netscape spec parts company from reality here
@@ -1286,11 +1281,11 @@
         # but first we have to remember whether it starts with a dot
         domain_initial_dot = False
         if domain_specified:
-            domain_initial_dot = bool(startswith(domain, "."))
+            domain_initial_dot = bool(domain.startswith("."))
         if domain is Absent:
             req_host, erhn = eff_request_host(request)
             domain = erhn
-        elif not startswith(domain, "."):
+        elif not domain.startswith("."):
             domain = "."+domain
 
         # set default port
@@ -1365,8 +1360,8 @@
         """
         # get cookie-attributes for RFC 2965 and Netscape protocols
         headers = response.info()
-        rfc2965_hdrs = getheaders(headers, "Set-Cookie2")
-        ns_hdrs = getheaders(headers, "Set-Cookie")
+        rfc2965_hdrs = headers.getheaders("Set-Cookie2")
+        ns_hdrs = headers.getheaders("Set-Cookie")
 
         rfc2965 = self._policy.rfc2965
         netscape = self._policy.netscape
@@ -1550,12 +1545,12 @@
     def __repr__(self):
         r = []
         for cookie in self: r.append(repr(cookie))
-        return "<%s[%s]>" % (self.__class__, string.join(r, ", "))
+        return "<%s[%s]>" % (self.__class__, ", ".join(r))
 
     def __str__(self):
         r = []
         for cookie in self: r.append(str(cookie))
-        return "<%s[%s]>" % (self.__class__, string.join(r, ", "))
+        return "<%s[%s]>" % (self.__class__, ", ".join(r))
 
 
 class LoadError(Exception): pass
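
Most of the churn in _clientcookie.py above is mechanical: calls into the
deprecated string module (and the startswith/endswith/getheaders helpers that
used to live in _util) are replaced by the equivalent string and message
methods, and urlparse is replaced by the new _rfc3986 helpers.  A quick
illustration of the pattern (not part of the diff):

    import string

    host = "WWW.Example.COM"

    # the old spellings removed by this change ...
    assert string.lower(host) == host.lower()
    assert string.find(host, ".") == host.find(".")
    assert string.join(["a", "b"], "; ") == "; ".join(["a", "b"])
    # ... now read as the string-method calls on the right-hand sides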

Added: Zope3/branches/adamg-mechanize-update/src/mechanize/_debug.py
===================================================================
--- Zope3/branches/adamg-mechanize-update/src/mechanize/_debug.py	                        (rev 0)
+++ Zope3/branches/adamg-mechanize-update/src/mechanize/_debug.py	2007-07-13 13:21:22 UTC (rev 77856)
@@ -0,0 +1,28 @@
+import logging
+
+from urllib2 import BaseHandler
+from _response import response_seek_wrapper
+
+
+class HTTPResponseDebugProcessor(BaseHandler):
+    handler_order = 900  # before redirections, after everything else
+
+    def http_response(self, request, response):
+        if not hasattr(response, "seek"):
+            response = response_seek_wrapper(response)
+        info = logging.getLogger("mechanize.http_responses").info
+        try:
+            info(response.read())
+        finally:
+            response.seek(0)
+        info("*****************************************************")
+        return response
+
+    https_response = http_response
+
+class HTTPRedirectDebugProcessor(BaseHandler):
+    def http_request(self, request):
+        if hasattr(request, "redirect_dict"):
+            info = logging.getLogger("mechanize.http_redirects").info
+            info("redirecting to %s", request.get_full_url())
+        return request


Property changes on: Zope3/branches/adamg-mechanize-update/src/mechanize/_debug.py
___________________________________________________________________
Name: svn:keywords
   + Date Author Id Revision
Name: svn:eol-style
   + native
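
The two classes in the new _debug.py are ordinary urllib2-style handlers, so
they can be installed next to the rest of the stack.  A rough sketch of how
they might be wired up (the opener setup here is illustrative, not something
this checkin adds):

    import logging, urllib2
    from mechanize import HTTPResponseDebugProcessor, HTTPRedirectDebugProcessor

    # both processors log at INFO level on mechanize.* loggers
    logging.basicConfig(level=logging.INFO)

    opener = urllib2.build_opener(HTTPResponseDebugProcessor(),
                                  HTTPRedirectDebugProcessor())
    response = opener.open("http://example.com/")  # bodies and redirects get logged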

Modified: Zope3/branches/adamg-mechanize-update/src/mechanize/_gzip.py
===================================================================
--- Zope3/branches/adamg-mechanize-update/src/mechanize/_gzip.py	2007-07-13 11:38:53 UTC (rev 77855)
+++ Zope3/branches/adamg-mechanize-update/src/mechanize/_gzip.py	2007-07-13 13:21:22 UTC (rev 77856)
@@ -1,6 +1,6 @@
 import urllib2
 from cStringIO import StringIO
-import _util
+import _response
 
 # GzipConsumer was taken from Fredrik Lundh's effbot.org-0.1-20041009 library
 class GzipConsumer:
@@ -65,7 +65,7 @@
     def __init__(self): self.data = []
     def feed(self, data): self.data.append(data)
 
-class stupid_gzip_wrapper(_util.closeable_response):
+class stupid_gzip_wrapper(_response.closeable_response):
     def __init__(self, response):
         self._response = response
 

Modified: Zope3/branches/adamg-mechanize-update/src/mechanize/_headersutil.py
===================================================================
--- Zope3/branches/adamg-mechanize-update/src/mechanize/_headersutil.py	2007-07-13 11:38:53 UTC (rev 77855)
+++ Zope3/branches/adamg-mechanize-update/src/mechanize/_headersutil.py	2007-07-13 13:21:22 UTC (rev 77856)
@@ -9,12 +9,13 @@
 
 """
 
-import os, re, string, urlparse
+import os, re
 from types import StringType
 from types import UnicodeType
 STRING_TYPES = StringType, UnicodeType
 
-from _util import startswith, endswith, http2time
+from _util import http2time
+import _rfc3986
 
 def is_html(ct_headers, url, allow_xhtml=False):
     """
@@ -24,7 +25,7 @@
     """
     if not ct_headers:
         # guess
-        ext = os.path.splitext(urlparse.urlparse(url)[2])[1]
+        ext = os.path.splitext(_rfc3986.urlsplit(url)[2])[1]
         html_exts = [".htm", ".html"]
         if allow_xhtml:
             html_exts += [".xhtml"]
@@ -113,14 +114,14 @@
                     if m:  # unquoted value
                         text = unmatched(m)
                         value = m.group(1)
-                        value = string.rstrip(value)
+                        value = value.rstrip()
                     else:
                         # no value, a lone token
                         value = None
                 pairs.append((name, value))
-            elif startswith(string.lstrip(text), ","):
+            elif text.lstrip().startswith(","):
                 # concatenated headers, as per RFC 2616 section 4.2
-                text = string.lstrip(text)[1:]
+                text = text.lstrip()[1:]
                 if pairs: result.append(pairs)
                 pairs = []
             else:
@@ -159,8 +160,8 @@
                 else:
                     k = "%s=%s" % (k, v)
             attr.append(k)
-        if attr: headers.append(string.join(attr, "; "))
-    return string.join(headers, ", ")
+        if attr: headers.append("; ".join(attr))
+    return ", ".join(headers)
 
 def parse_ns_headers(ns_headers):
     """Ad-hoc parser for Netscape protocol cookie-attributes.
@@ -188,15 +189,15 @@
         params = re.split(r";\s*", ns_header)
         for ii in range(len(params)):
             param = params[ii]
-            param = string.rstrip(param)
+            param = param.rstrip()
             if param == "": continue
             if "=" not in param:
                 k, v = param, None
             else:
                 k, v = re.split(r"\s*=\s*", param, 1)
-                k = string.lstrip(k)
+                k = k.lstrip()
             if ii != 0:
-                lc = string.lower(k)
+                lc = k.lower()
                 if lc in known_attrs:
                     k = lc
                 if k == "version":
@@ -204,8 +205,8 @@
                     version_set = True
                 if k == "expires":
                     # convert expires date to seconds since epoch
-                    if startswith(v, '"'): v = v[1:]
-                    if endswith(v, '"'): v = v[:-1]
+                    if v.startswith('"'): v = v[1:]
+                    if v.endswith('"'): v = v[:-1]
                     v = http2time(v)  # None if invalid
             pairs.append((k, v))
 

Modified: Zope3/branches/adamg-mechanize-update/src/mechanize/_html.py
===================================================================
--- Zope3/branches/adamg-mechanize-update/src/mechanize/_html.py	2007-07-13 11:38:53 UTC (rev 77855)
+++ Zope3/branches/adamg-mechanize-update/src/mechanize/_html.py	2007-07-13 13:21:22 UTC (rev 77856)
@@ -8,97 +8,42 @@
 
 """
 
-import re, copy, urllib, htmlentitydefs
-from urlparse import urljoin
+import re, copy, htmlentitydefs
+import sgmllib, HTMLParser, ClientForm
 
 import _request
 from _headersutil import split_header_words, is_html as _is_html
+import _rfc3986
 
-## # XXXX miserable hack
-## def urljoin(base, url):
-##     if url.startswith("?"):
-##         return base+url
-##     else:
-##         return urlparse.urljoin(base, url)
+DEFAULT_ENCODING = "latin-1"
 
-## def chr_range(a, b):
-##     return "".join(map(chr, range(ord(a), ord(b)+1)))
 
-## RESERVED_URL_CHARS = ("ABCDEFGHIJKLMNOPQRSTUVWXYZ"
-##                       "abcdefghijklmnopqrstuvwxyz"
-##                       "-_.~")
-## UNRESERVED_URL_CHARS = "!*'();:@&=+$,/?%#[]"
-# we want (RESERVED_URL_CHARS+UNRESERVED_URL_CHARS), minus those
-# 'safe'-by-default characters that urllib.urlquote never quotes
-URLQUOTE_SAFE_URL_CHARS = "!*'();:@&=+$,/?%#[]~"
+# the base class is purely for backwards compatibility
+class ParseError(ClientForm.ParseError): pass
 
-DEFAULT_ENCODING = "latin-1"
 
 class CachingGeneratorFunction(object):
-    """Caching wrapper around a no-arguments iterable.
+    """Caching wrapper around a no-arguments iterable."""
 
-    >>> i = [1]
-    >>> func = CachingGeneratorFunction(i)
-    >>> list(func())
-    [1]
-    >>> list(func())
-    [1]
-
-    >>> i = [1, 2, 3]
-    >>> func = CachingGeneratorFunction(i)
-    >>> list(func())
-    [1, 2, 3]
-
-    >>> i = func()
-    >>> i.next()
-    1
-    >>> i.next()
-    2
-    >>> i.next()
-    3
-
-    >>> i = func()
-    >>> j = func()
-    >>> i.next()
-    1
-    >>> j.next()
-    1
-    >>> i.next()
-    2
-    >>> j.next()
-    2
-    >>> j.next()
-    3
-    >>> i.next()
-    3
-    >>> i.next()
-    Traceback (most recent call last):
-    ...
-    StopIteration
-    >>> j.next()
-    Traceback (most recent call last):
-    ...
-    StopIteration
-    """
     def __init__(self, iterable):
-        def make_gen():
-            for item in iterable:
-                yield item
-
         self._cache = []
-        self._generator = make_gen()
+        # wrap iterable to make it non-restartable (otherwise, repeated
+        # __call__ would give incorrect results)
+        self._iterator = iter(iterable)
 
     def __call__(self):
         cache = self._cache
-
         for item in cache:
             yield item
-        for item in self._generator:
+        for item in self._iterator:
             cache.append(item)
             yield item
 
-def encoding_finder(default_encoding):
-    def encoding(response):
+
+class EncodingFinder:
+    def __init__(self, default_encoding):
+        self._default_encoding = default_encoding
+    def encoding(self, response):
         # HTTPEquivProcessor may be in use, so both HTTP and HTTP-EQUIV
         # headers may be in the response.  HTTP-EQUIV headers come last,
         # so try in order from first to last.
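
The doctests removed from CachingGeneratorFunction still describe its
contract: wrap an iterable once, and every call returns a generator that
replays already-cached items before pulling new ones from the single shared
iterator.  A condensed version of the removed example:

    func = CachingGeneratorFunction([1, 2, 3])

    i, j = func(), func()
    assert i.next() == 1
    assert j.next() == 1               # j replays the cached item
    assert list(i) == [2, 3]
    assert list(j) == [2, 3]
    assert list(func()) == [1, 2, 3]   # later calls see the whole cached sequence
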
@@ -106,17 +51,18 @@
             for k, v in split_header_words([ct])[0]:
                 if k == "charset":
                     return v
-        return default_encoding
-    return encoding
+        return self._default_encoding
 
-def make_is_html(allow_xhtml):
-    def is_html(response, encoding):
+class ResponseTypeFinder:
+    def __init__(self, allow_xhtml):
+        self._allow_xhtml = allow_xhtml
+    def is_html(self, response, encoding):
         ct_hdrs = response.info().getheaders("content-type")
         url = response.geturl()
         # XXX encoding
-        return _is_html(ct_hdrs, url, allow_xhtml)
-    return is_html
+        return _is_html(ct_hdrs, url, self._allow_xhtml)
 
+
 # idea for this argument-processing trick is from Peter Otten
 class Args:
     def __init__(self, args_map):
@@ -140,7 +86,7 @@
     def __init__(self, base_url, url, text, tag, attrs):
         assert None not in [url, tag, attrs]
         self.base_url = base_url
-        self.absolute_url = urljoin(base_url, url)
+        self.absolute_url = _rfc3986.urljoin(base_url, url)
         self.url, self.text, self.tag, self.attrs = url, text, tag, attrs
     def __cmp__(self, other):
         try:
@@ -155,19 +101,6 @@
             self.base_url, self.url, self.text, self.tag, self.attrs)
 
 
-def clean_url(url, encoding):
-    # percent-encode illegal URL characters
-    # Trying to come up with test cases for this gave me a headache, revisit
-    # when do switch to unicode.
-    # Somebody else's comments (lost the attribution):
-##     - IE will return you the url in the encoding you send it
-##     - Mozilla/Firefox will send you latin-1 if there's no non latin-1
-##     characters in your link. It will send you utf-8 however if there are...
-    if type(url) == type(""):
-        url = url.decode(encoding, "replace")
-    url = url.strip()
-    return urllib.quote(url.encode(encoding), URLQUOTE_SAFE_URL_CHARS)
-
 class LinksFactory:
 
     def __init__(self,
@@ -203,40 +136,49 @@
         base_url = self._base_url
         p = self.link_parser_class(response, encoding=encoding)
 
-        for token in p.tags(*(self.urltags.keys()+["base"])):
-            if token.data == "base":
-                base_url = dict(token.attrs).get("href")
-                continue
-            if token.type == "endtag":
-                continue
-            attrs = dict(token.attrs)
-            tag = token.data
-            name = attrs.get("name")
-            text = None
-            # XXX use attr_encoding for ref'd doc if that doc does not provide
-            #  one by other means
-            #attr_encoding = attrs.get("charset")
-            url = attrs.get(self.urltags[tag])  # XXX is "" a valid URL?
-            if not url:
-                # Probably an <A NAME="blah"> link or <AREA NOHREF...>.
-                # For our purposes a link is something with a URL, so ignore
-                # this.
-                continue
+        try:
+            for token in p.tags(*(self.urltags.keys()+["base"])):
+                if token.type == "endtag":
+                    continue
+                if token.data == "base":
+                    base_href = dict(token.attrs).get("href")
+                    if base_href is not None:
+                        base_url = base_href
+                    continue
+                attrs = dict(token.attrs)
+                tag = token.data
+                name = attrs.get("name")
+                text = None
+                # XXX use attr_encoding for ref'd doc if that doc does not
+                #  provide one by other means
+                #attr_encoding = attrs.get("charset")
+                url = attrs.get(self.urltags[tag])  # XXX is "" a valid URL?
+                if not url:
+                    # Probably an <A NAME="blah"> link or <AREA NOHREF...>.
+                    # For our purposes a link is something with a URL, so
+                    # ignore this.
+                    continue
 
-            url = clean_url(url, encoding)
-            if tag == "a":
-                if token.type != "startendtag":
-                    # hmm, this'd break if end tag is missing
-                    text = p.get_compressed_text(("endtag", tag))
-                # but this doesn't work for eg. <a href="blah"><b>Andy</b></a>
-                #text = p.get_compressed_text()
+                url = _rfc3986.clean_url(url, encoding)
+                if tag == "a":
+                    if token.type != "startendtag":
+                        # hmm, this'd break if end tag is missing
+                        text = p.get_compressed_text(("endtag", tag))
+                    # but this doesn't work for eg.
+                    # <a href="blah"><b>Andy</b></a>
+                    #text = p.get_compressed_text()
 
-            yield Link(base_url, url, text, tag, token.attrs)
+                yield Link(base_url, url, text, tag, token.attrs)
+        except sgmllib.SGMLParseError, exc:
+            raise ParseError(exc)
 
 class FormsFactory:
 
     """Makes a sequence of objects satisfying ClientForm.HTMLForm interface.
 
+    After calling .forms(), the .global_form attribute is a form object
+    containing all controls not a descendant of any FORM element.
+
     For constructor argument docs, see ClientForm.ParseResponse
     argument docs.
 
@@ -259,22 +201,31 @@
         self.backwards_compat = backwards_compat
         self._response = None
         self.encoding = None
+        self.global_form = None
 
     def set_response(self, response, encoding):
         self._response = response
         self.encoding = encoding
+        self.global_form = None
 
     def forms(self):
         import ClientForm
         encoding = self.encoding
-        return ClientForm.ParseResponse(
-            self._response,
-            select_default=self.select_default,
-            form_parser_class=self.form_parser_class,
-            request_class=self.request_class,
-            backwards_compat=self.backwards_compat,
-            encoding=encoding,
-            )
+        try:
+            forms = ClientForm.ParseResponseEx(
+                self._response,
+                select_default=self.select_default,
+                form_parser_class=self.form_parser_class,
+                request_class=self.request_class,
+                encoding=encoding,
+                _urljoin=_rfc3986.urljoin,
+                _urlparse=_rfc3986.urlsplit,
+                _urlunparse=_rfc3986.urlunsplit,
+                )
+        except ClientForm.ParseError, exc:
+            raise ParseError(exc)
+        self.global_form = forms[0]
+        return forms[1:]
 
 class TitleFactory:
     def __init__(self):
@@ -289,11 +240,14 @@
         p = _pullparser.TolerantPullParser(
             self._response, encoding=self._encoding)
         try:
-            p.get_tag("title")
-        except _pullparser.NoMoreTokensError:
-            return None
-        else:
-            return p.get_text()
+            try:
+                p.get_tag("title")
+            except _pullparser.NoMoreTokensError:
+                return None
+            else:
+                return p.get_text()
+        except sgmllib.SGMLParseError, exc:
+            raise ParseError(exc)
 
 
 def unescape(data, entities, encoding):
@@ -334,42 +288,44 @@
         return repl
 
 
-try:
-    import BeautifulSoup
-except ImportError:
-    pass
-else:
-    import sgmllib
-    # monkeypatch to fix http://www.python.org/sf/803422 :-(
-    sgmllib.charref = re.compile("&#(x?[0-9a-fA-F]+)[^0-9a-fA-F]")
-    class MechanizeBs(BeautifulSoup.BeautifulSoup):
-        _entitydefs = htmlentitydefs.name2codepoint
-        # don't want the magic Microsoft-char workaround
-        PARSER_MASSAGE = [(re.compile('(<[^<>]*)/>'),
-                           lambda(x):x.group(1) + ' />'),
-                          (re.compile('<!\s+([^<>]*)>'),
-                           lambda(x):'<!' + x.group(1) + '>')
-                          ]
+# bizarre import gymnastics for bundled BeautifulSoup
+import _beautifulsoup
+import ClientForm
+RobustFormParser, NestingRobustFormParser = ClientForm._create_bs_classes(
+    _beautifulsoup.BeautifulSoup, _beautifulsoup.ICantBelieveItsBeautifulSoup
+    )
+# monkeypatch sgmllib to fix http://www.python.org/sf/803422 :-(
+import sgmllib
+sgmllib.charref = re.compile("&#(x?[0-9a-fA-F]+)[^0-9a-fA-F]")
 
-        def __init__(self, encoding, text=None, avoidParserProblems=True,
-                     initialTextIsEverything=True):
-            self._encoding = encoding
-            BeautifulSoup.BeautifulSoup.__init__(
-                self, text, avoidParserProblems, initialTextIsEverything)
+class MechanizeBs(_beautifulsoup.BeautifulSoup):
+    _entitydefs = htmlentitydefs.name2codepoint
+    # don't want the magic Microsoft-char workaround
+    PARSER_MASSAGE = [(re.compile('(<[^<>]*)/>'),
+                       lambda(x):x.group(1) + ' />'),
+                      (re.compile('<!\s+([^<>]*)>'),
+                       lambda(x):'<!' + x.group(1) + '>')
+                      ]
 
-        def handle_charref(self, ref):
-            t = unescape("&#%s;"%ref, self._entitydefs, self._encoding)
-            self.handle_data(t)
-        def handle_entityref(self, ref):
-            t = unescape("&%s;"%ref, self._entitydefs, self._encoding)
-            self.handle_data(t)
-        def unescape_attrs(self, attrs):
-            escaped_attrs = []
-            for key, val in attrs:
-                val = unescape(val, self._entitydefs, self._encoding)
-                escaped_attrs.append((key, val))
-            return escaped_attrs
+    def __init__(self, encoding, text=None, avoidParserProblems=True,
+                 initialTextIsEverything=True):
+        self._encoding = encoding
+        _beautifulsoup.BeautifulSoup.__init__(
+            self, text, avoidParserProblems, initialTextIsEverything)
 
+    def handle_charref(self, ref):
+        t = unescape("&#%s;"%ref, self._entitydefs, self._encoding)
+        self.handle_data(t)
+    def handle_entityref(self, ref):
+        t = unescape("&%s;"%ref, self._entitydefs, self._encoding)
+        self.handle_data(t)
+    def unescape_attrs(self, attrs):
+        escaped_attrs = []
+        for key, val in attrs:
+            val = unescape(val, self._entitydefs, self._encoding)
+            escaped_attrs.append((key, val))
+        return escaped_attrs
+
 class RobustLinksFactory:
 
     compress_re = re.compile(r"\s+")
@@ -379,7 +335,7 @@
                  link_class=Link,
                  urltags=None,
                  ):
-        import BeautifulSoup
+        import _beautifulsoup
         if link_parser_class is None:
             link_parser_class = MechanizeBs
         self.link_parser_class = link_parser_class
@@ -402,27 +358,29 @@
         self._encoding = encoding
 
     def links(self):
-        import BeautifulSoup
+        import _beautifulsoup
         bs = self._bs
         base_url = self._base_url
         encoding = self._encoding
         gen = bs.recursiveChildGenerator()
         for ch in bs.recursiveChildGenerator():
-            if (isinstance(ch, BeautifulSoup.Tag) and
+            if (isinstance(ch, _beautifulsoup.Tag) and
                 ch.name in self.urltags.keys()+["base"]):
                 link = ch
                 attrs = bs.unescape_attrs(link.attrs)
                 attrs_dict = dict(attrs)
                 if link.name == "base":
-                    base_url = attrs_dict.get("href")
+                    base_href = attrs_dict.get("href")
+                    if base_href is not None:
+                        base_url = base_href
                     continue
                 url_attr = self.urltags[link.name]
                 url = attrs_dict.get(url_attr)
                 if not url:
                     continue
-                url = clean_url(url, encoding)
+                url = _rfc3986.clean_url(url, encoding)
                 text = link.firstText(lambda t: True)
-                if text is BeautifulSoup.Null:
+                if text is _beautifulsoup.Null:
                     # follow _pullparser's weird behaviour rigidly
                     if link.name == "a":
                         text = ""
@@ -438,7 +396,7 @@
         import ClientForm
         args = form_parser_args(*args, **kwds)
         if args.form_parser_class is None:
-            args.form_parser_class = ClientForm.RobustFormParser
+            args.form_parser_class = RobustFormParser
         FormsFactory.__init__(self, **args.dictionary)
 
     def set_response(self, response, encoding):
@@ -454,10 +412,10 @@
         self._bs = soup
         self._encoding = encoding
 
-    def title(soup):
-        import BeautifulSoup
+    def title(self):
+        import _beautifulsoup
         title = self._bs.first("title")
-        if title == BeautifulSoup.Null:
+        if title == _beautifulsoup.Null:
             return None
         else:
             return title.firstText(lambda t: True)
@@ -477,18 +435,25 @@
 
     Public attributes:
 
+    Note that accessing these attributes may raise ParseError.
+
     encoding: string specifying the encoding of response if it contains a text
      document (this value is left unspecified for documents that do not have
      an encoding, e.g. an image file)
     is_html: true if response contains an HTML document (XHTML may be
      regarded as HTML too)
     title: page title, or None if no title or not HTML
+    global_form: form object containing all controls that are not descendants
+     of any FORM element, or None if the forms_factory does not support
+     supplying a global form
 
     """
 
+    LAZY_ATTRS = ["encoding", "is_html", "title", "global_form"]
+
     def __init__(self, forms_factory, links_factory, title_factory,
-                 get_encoding=encoding_finder(DEFAULT_ENCODING),
-                 is_html_p=make_is_html(allow_xhtml=False),
+                 encoding_finder=EncodingFinder(DEFAULT_ENCODING),
+                 response_type_finder=ResponseTypeFinder(allow_xhtml=False),
                  ):
         """
 
@@ -504,8 +469,8 @@
         self._forms_factory = forms_factory
         self._links_factory = links_factory
         self._title_factory = title_factory
-        self._get_encoding = get_encoding
-        self._is_html_p = is_html_p
+        self._encoding_finder = encoding_finder
+        self._response_type_finder = response_type_finder
 
         self.set_response(None)
 
@@ -521,51 +486,71 @@
     def set_response(self, response):
         """Set response.
 
-        The response must implement the same interface as objects returned by
-        urllib2.urlopen().
+        The response must either be None or implement the same interface as
+        objects returned by urllib2.urlopen().
 
         """
         self._response = response
         self._forms_genf = self._links_genf = None
         self._get_title = None
-        for name in ["encoding", "is_html", "title"]:
+        for name in self.LAZY_ATTRS:
             try:
                 delattr(self, name)
             except AttributeError:
                 pass
 
     def __getattr__(self, name):
-        if name not in ["encoding", "is_html", "title"]:
+        if name not in self.LAZY_ATTRS:
             return getattr(self.__class__, name)
 
-        try:
-            if name == "encoding":
-                self.encoding = self._get_encoding(self._response)
-                return self.encoding
-            elif name == "is_html":
-                self.is_html = self._is_html_p(self._response, self.encoding)
-                return self.is_html
-            elif name == "title":
-                if self.is_html:
-                    self.title = self._title_factory.title()
-                else:
-                    self.title = None
-                return self.title
-        finally:
-            self._response.seek(0)
+        if name == "encoding":
+            self.encoding = self._encoding_finder.encoding(
+                copy.copy(self._response))
+            return self.encoding
+        elif name == "is_html":
+            self.is_html = self._response_type_finder.is_html(
+                copy.copy(self._response), self.encoding)
+            return self.is_html
+        elif name == "title":
+            if self.is_html:
+                self.title = self._title_factory.title()
+            else:
+                self.title = None
+            return self.title
+        elif name == "global_form":
+            self.forms()
+            return self.global_form
 
     def forms(self):
-        """Return iterable over ClientForm.HTMLForm-like objects."""
+        """Return iterable over ClientForm.HTMLForm-like objects.
+
+        Raises mechanize.ParseError on failure.
+        """
+        # this implementation sets .global_form as a side-effect, for benefit
+        # of __getattr__ impl
         if self._forms_genf is None:
-            self._forms_genf = CachingGeneratorFunction(
-                self._forms_factory.forms())
+            try:
+                self._forms_genf = CachingGeneratorFunction(
+                    self._forms_factory.forms())
+            except:  # XXXX define exception!
+                self.set_response(self._response)
+                raise
+            self.global_form = getattr(
+                self._forms_factory, "global_form", None)
         return self._forms_genf()
 
     def links(self):
-        """Return iterable over mechanize.Link-like objects."""
+        """Return iterable over mechanize.Link-like objects.
+
+        Raises mechanize.ParseError on failure.
+        """
         if self._links_genf is None:
-            self._links_genf = CachingGeneratorFunction(
-                self._links_factory.links())
+            try:
+                self._links_genf = CachingGeneratorFunction(
+                    self._links_factory.links())
+            except:  # XXXX define exception!
+                self.set_response(self._response)
+                raise
         return self._links_genf()
 
 class DefaultFactory(Factory):
@@ -576,7 +561,8 @@
             forms_factory=FormsFactory(),
             links_factory=LinksFactory(),
             title_factory=TitleFactory(),
-            is_html_p=make_is_html(allow_xhtml=i_want_broken_xhtml_support),
+            response_type_finder=ResponseTypeFinder(
+                allow_xhtml=i_want_broken_xhtml_support),
             )
 
     def set_response(self, response):
@@ -585,7 +571,7 @@
             self._forms_factory.set_response(
                 copy.copy(response), self.encoding)
             self._links_factory.set_response(
-                copy.copy(response), self._response.geturl(), self.encoding)
+                copy.copy(response), response.geturl(), self.encoding)
             self._title_factory.set_response(
                 copy.copy(response), self.encoding)
 
@@ -601,19 +587,21 @@
             forms_factory=RobustFormsFactory(),
             links_factory=RobustLinksFactory(),
             title_factory=RobustTitleFactory(),
-            is_html_p=make_is_html(allow_xhtml=i_want_broken_xhtml_support),
+            response_type_finder=ResponseTypeFinder(
+                allow_xhtml=i_want_broken_xhtml_support),
             )
         if soup_class is None:
             soup_class = MechanizeBs
         self._soup_class = soup_class
 
     def set_response(self, response):
-        import BeautifulSoup
+        import _beautifulsoup
         Factory.set_response(self, response)
         if response is not None:
             data = response.read()
             soup = self._soup_class(self.encoding, data)
-            self._forms_factory.set_response(response, self.encoding)
+            self._forms_factory.set_response(
+                copy.copy(response), self.encoding)
             self._links_factory.set_soup(
                 soup, response.geturl(), self.encoding)
             self._title_factory.set_soup(soup, self.encoding)
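
With the Factory rework above, encoding, is_html, title and global_form are
now computed lazily in __getattr__ after a set_response() call.  A rough,
self-contained sketch of how that surfaces to callers (the FakeResponse stub
is an assumption made for illustration only, standing in for a real
urlopen-style response):

    from StringIO import StringIO
    import mimetools
    from mechanize import DefaultFactory

    class FakeResponse:
        # minimal urlopen-style response object, just enough for the factory
        def __init__(self, data, url):
            self._sio = StringIO(data)
            self._url = url
            self._headers = mimetools.Message(
                StringIO("Content-Type: text/html; charset=utf-8\r\n\r\n"))
        def read(self, *args): return self._sio.read(*args)
        def readline(self, *args): return self._sio.readline(*args)
        def seek(self, pos): self._sio.seek(pos)
        def geturl(self): return self._url
        def info(self): return self._headers

    html = "<html><head><title>Demo</title></head><body></body></html>"
    factory = DefaultFactory()
    factory.set_response(FakeResponse(html, "http://example.com/"))
    print factory.encoding   # should be "utf-8", from the Content-Type header
    print factory.is_html    # should be True
    print factory.title      # should be "Demo"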

Added: Zope3/branches/adamg-mechanize-update/src/mechanize/_http.py
===================================================================
--- Zope3/branches/adamg-mechanize-update/src/mechanize/_http.py	                        (rev 0)
+++ Zope3/branches/adamg-mechanize-update/src/mechanize/_http.py	2007-07-13 13:21:22 UTC (rev 77856)
@@ -0,0 +1,729 @@
+"""HTTP related handlers.
+
+Note that some other HTTP handlers live in more specific modules: _auth.py,
+_gzip.py, etc.
+
+
+Copyright 2002-2006 John J Lee <jjl at pobox.com>
+
+This code is free software; you can redistribute it and/or modify it
+under the terms of the BSD or ZPL 2.1 licenses (see the file
+COPYING.txt included with the distribution).
+
+"""
+
+import copy, time, tempfile, htmlentitydefs, re, logging, socket, \
+       urllib2, urllib, httplib, sgmllib
+from urllib2 import URLError, HTTPError, BaseHandler
+from cStringIO import StringIO
+
+from _request import Request
+from _util import isstringlike
+from _response import closeable_response, response_seek_wrapper
+from _html import unescape, unescape_charref
+from _headersutil import is_html
+from _clientcookie import CookieJar, request_host
+import _rfc3986
+
+debug = logging.getLogger("mechanize").debug
+
+# monkeypatch urllib2.HTTPError to show URL
+## def urllib2_str(self):
+##     return 'HTTP Error %s: %s (%s)' % (
+##         self.code, self.msg, self.geturl())
+## urllib2.HTTPError.__str__ = urllib2_str
+
+
+CHUNK = 1024  # size of chunks fed to HTML HEAD parser, in bytes
+DEFAULT_ENCODING = 'latin-1'
+
+
+# This adds "refresh" to the list of redirectables and provides a redirection
+# algorithm that doesn't go into a loop in the presence of cookies
+# (Python 2.4 has this new algorithm, 2.3 doesn't).
+class HTTPRedirectHandler(BaseHandler):
+    # maximum number of redirections to any single URL
+    # this is needed because of the state that cookies introduce
+    max_repeats = 4
+    # maximum total number of redirections (regardless of URL) before
+    # assuming we're in a loop
+    max_redirections = 10
+
+    # Implementation notes:
+
+    # To avoid the server sending us into an infinite loop, the request
+    # object needs to track what URLs we have already seen.  Do this by
+    # adding a handler-specific attribute to the Request object.  The value
+    # of the dict is used to count the number of times the same URL has
+    # been visited.  This is needed because visiting the same URL twice
+    # does not necessarily imply a loop, thanks to state introduced by
+    # cookies.
+
+    # Always unhandled redirection codes:
+    # 300 Multiple Choices: should not handle this here.
+    # 304 Not Modified: no need to handle here: only of interest to caches
+    #     that do conditional GETs
+    # 305 Use Proxy: probably not worth dealing with here
+    # 306 Unused: what was this for in the previous versions of protocol??
+
+    def redirect_request(self, newurl, req, fp, code, msg, headers):
+        """Return a Request or None in response to a redirect.
+
+        This is called by the http_error_30x methods when a redirection
+        response is received.  If a redirection should take place, return a
+        new Request to allow http_error_30x to perform the redirect;
+        otherwise, return None to indicate that an HTTPError should be
+        raised.
+
+        """
+        if code in (301, 302, 303, "refresh") or \
+               (code == 307 and not req.has_data()):
+            # Strictly (according to RFC 2616), 301 or 302 in response to
+            # a POST MUST NOT cause a redirection without confirmation
+            # from the user (of urllib2, in this case).  In practice,
+            # essentially all clients do redirect in this case, so we do
+            # the same.
+            # XXX really refresh redirections should be visiting; tricky to
+            #  fix, so this will wait until post-stable release
+            new = Request(newurl,
+                          headers=req.headers,
+                          origin_req_host=req.get_origin_req_host(),
+                          unverifiable=True,
+                          visit=False,
+                          )
+            new._origin_req = getattr(req, "_origin_req", req)
+            return new
+        else:
+            raise HTTPError(req.get_full_url(), code, msg, headers, fp)
+
+    def http_error_302(self, req, fp, code, msg, headers):
+        # Some servers (incorrectly) return multiple Location headers
+        # (so probably same goes for URI).  Use first header.
+        if headers.has_key('location'):
+            newurl = headers.getheaders('location')[0]
+        elif headers.has_key('uri'):
+            newurl = headers.getheaders('uri')[0]
+        else:
+            return
+        newurl = _rfc3986.clean_url(newurl, "latin-1")
+        newurl = _rfc3986.urljoin(req.get_full_url(), newurl)
+
+        # XXX Probably want to forget about the state of the current
+        # request, although that might interact poorly with other
+        # handlers that also use handler-specific request attributes
+        new = self.redirect_request(newurl, req, fp, code, msg, headers)
+        if new is None:
+            return
+
+        # loop detection
+        # .redirect_dict has a key url if url was previously visited.
+        if hasattr(req, 'redirect_dict'):
+            visited = new.redirect_dict = req.redirect_dict
+            if (visited.get(newurl, 0) >= self.max_repeats or
+                len(visited) >= self.max_redirections):
+                raise HTTPError(req.get_full_url(), code,
+                                self.inf_msg + msg, headers, fp)
+        else:
+            visited = new.redirect_dict = req.redirect_dict = {}
+        visited[newurl] = visited.get(newurl, 0) + 1
+
+        # Don't close the fp until we are sure that we won't use it
+        # with HTTPError.  
+        fp.read()
+        fp.close()
+
+        return self.parent.open(new)
+
+    http_error_301 = http_error_303 = http_error_307 = http_error_302
+    http_error_refresh = http_error_302
+
+    inf_msg = "The HTTP server returned a redirect error that would " \
+              "lead to an infinite loop.\n" \
+              "The last 30x error message was:\n"
+
+
+# XXX would self.reset() work, instead of raising this exception?
+class EndOfHeadError(Exception): pass
+class AbstractHeadParser:
+    # only these elements are allowed in or before HEAD of document
+    head_elems = ("html", "head",
+                  "title", "base",
+                  "script", "style", "meta", "link", "object")
+    _entitydefs = htmlentitydefs.name2codepoint
+    _encoding = DEFAULT_ENCODING
+
+    def __init__(self):
+        self.http_equiv = []
+
+    def start_meta(self, attrs):
+        http_equiv = content = None
+        for key, value in attrs:
+            if key == "http-equiv":
+                http_equiv = self.unescape_attr_if_required(value)
+            elif key == "content":
+                content = self.unescape_attr_if_required(value)
+        if http_equiv is not None and content is not None:
+            self.http_equiv.append((http_equiv, content))
+
+    def end_head(self):
+        raise EndOfHeadError()
+
+    def handle_entityref(self, name):
+        #debug("%s", name)
+        self.handle_data(unescape(
+            '&%s;' % name, self._entitydefs, self._encoding))
+
+    def handle_charref(self, name):
+        #debug("%s", name)
+        self.handle_data(unescape_charref(name, self._encoding))
+
+    def unescape_attr(self, name):
+        #debug("%s", name)
+        return unescape(name, self._entitydefs, self._encoding)
+
+    def unescape_attrs(self, attrs):
+        #debug("%s", attrs)
+        escaped_attrs = {}
+        for key, val in attrs.items():
+            escaped_attrs[key] = self.unescape_attr(val)
+        return escaped_attrs
+
+    def unknown_entityref(self, ref):
+        self.handle_data("&%s;" % ref)
+
+    def unknown_charref(self, ref):
+        self.handle_data("&#%s;" % ref)
+
+
+try:
+    import HTMLParser
+except ImportError:
+    pass
+else:
+    class XHTMLCompatibleHeadParser(AbstractHeadParser,
+                                    HTMLParser.HTMLParser):
+        def __init__(self):
+            HTMLParser.HTMLParser.__init__(self)
+            AbstractHeadParser.__init__(self)
+
+        def handle_starttag(self, tag, attrs):
+            if tag not in self.head_elems:
+                raise EndOfHeadError()
+            try:
+                method = getattr(self, 'start_' + tag)
+            except AttributeError:
+                try:
+                    method = getattr(self, 'do_' + tag)
+                except AttributeError:
+                    pass # unknown tag
+                else:
+                    method(attrs)
+            else:
+                method(attrs)
+
+        def handle_endtag(self, tag):
+            if tag not in self.head_elems:
+                raise EndOfHeadError()
+            try:
+                method = getattr(self, 'end_' + tag)
+            except AttributeError:
+                pass # unknown tag
+            else:
+                method()
+
+        def unescape(self, name):
+            # Use the entitydefs passed into constructor, not
+            # HTMLParser.HTMLParser's entitydefs.
+            return self.unescape_attr(name)
+
+        def unescape_attr_if_required(self, name):
+            return name  # HTMLParser.HTMLParser already did it
+
+class HeadParser(AbstractHeadParser, sgmllib.SGMLParser):
+
+    def _not_called(self):
+        assert False
+
+    def __init__(self):
+        sgmllib.SGMLParser.__init__(self)
+        AbstractHeadParser.__init__(self)
+
+    def handle_starttag(self, tag, method, attrs):
+        if tag not in self.head_elems:
+            raise EndOfHeadError()
+        if tag == "meta":
+            method(attrs)
+
+    def unknown_starttag(self, tag, attrs):
+        self.handle_starttag(tag, self._not_called, attrs)
+
+    def handle_endtag(self, tag, method):
+        if tag in self.head_elems:
+            method()
+        else:
+            raise EndOfHeadError()
+
+    def unescape_attr_if_required(self, name):
+        return self.unescape_attr(name)
+
+def parse_head(fileobj, parser):
+    """Return a list of key, value pairs."""
+    while 1:
+        data = fileobj.read(CHUNK)
+        try:
+            parser.feed(data)
+        except EndOfHeadError:
+            break
+        if len(data) != CHUNK:
+            # this should only happen if there is no HTML body, or if
+            # CHUNK is big
+            break
+    return parser.http_equiv
+
+class HTTPEquivProcessor(BaseHandler):
+    """Append META HTTP-EQUIV headers to regular HTTP headers."""
+
+    handler_order = 300  # before handlers that look at HTTP headers
+
+    def __init__(self, head_parser_class=HeadParser,
+                 i_want_broken_xhtml_support=False,
+                 ):
+        self.head_parser_class = head_parser_class
+        self._allow_xhtml = i_want_broken_xhtml_support
+
+    def http_response(self, request, response):
+        if not hasattr(response, "seek"):
+            response = response_seek_wrapper(response)
+        http_message = response.info()
+        url = response.geturl()
+        ct_hdrs = http_message.getheaders("content-type")
+        if is_html(ct_hdrs, url, self._allow_xhtml):
+            try:
+                try:
+                    html_headers = parse_head(response, self.head_parser_class())
+                finally:
+                    response.seek(0)
+            except (HTMLParser.HTMLParseError,
+                    sgmllib.SGMLParseError):
+                pass
+            else:
+                for hdr, val in html_headers:
+                    # add a header
+                    http_message.dict[hdr.lower()] = val
+                    text = hdr + ": " + val
+                    for line in text.split("\n"):
+                        http_message.headers.append(line + "\n")
+        return response
+
+    https_response = http_response
+
+class HTTPCookieProcessor(BaseHandler):
+    """Handle HTTP cookies.
+
+    Public attributes:
+
+    cookiejar: CookieJar instance
+
+    """
+    def __init__(self, cookiejar=None):
+        if cookiejar is None:
+            cookiejar = CookieJar()
+        self.cookiejar = cookiejar
+
+    def http_request(self, request):
+        self.cookiejar.add_cookie_header(request)
+        return request
+
+    def http_response(self, request, response):
+        self.cookiejar.extract_cookies(response, request)
+        return response
+
+    https_request = http_request
+    https_response = http_response
+
+try:
+    import robotparser
+except ImportError:
+    pass
+else:
+    class MechanizeRobotFileParser(robotparser.RobotFileParser):
+
+        def __init__(self, url='', opener=None):
+            import _opener
+            robotparser.RobotFileParser.__init__(self, url)
+            self._opener = opener
+
+        def set_opener(self, opener=None):
+            if opener is None:
+                opener = _opener.OpenerDirector()
+            self._opener = opener
+
+        def read(self):
+            """Reads the robots.txt URL and feeds it to the parser."""
+            if self._opener is None:
+                self.set_opener()
+            req = Request(self.url, unverifiable=True, visit=False)
+            try:
+                f = self._opener.open(req)
+            except HTTPError, f:
+                pass
+            except (IOError, socket.error, OSError), exc:
+                robotparser._debug("ignoring error opening %r: %s" %
+                                   (self.url, exc))
+                return
+            lines = []
+            line = f.readline()
+            while line:
+                lines.append(line.strip())
+                line = f.readline()
+            status = f.code
+            if status == 401 or status == 403:
+                self.disallow_all = True
+                robotparser._debug("disallow all")
+            elif status >= 400:
+                self.allow_all = True
+                robotparser._debug("allow all")
+            elif status == 200 and lines:
+                robotparser._debug("parse lines")
+                self.parse(lines)
+
+    class RobotExclusionError(urllib2.HTTPError):
+        def __init__(self, request, *args):
+            apply(urllib2.HTTPError.__init__, (self,)+args)
+            self.request = request
+
+    class HTTPRobotRulesProcessor(BaseHandler):
+        # before redirections, after everything else
+        handler_order = 800
+
+        try:
+            from httplib import HTTPMessage
+        except:
+            from mimetools import Message
+            http_response_class = Message
+        else:
+            http_response_class = HTTPMessage
+
+        def __init__(self, rfp_class=MechanizeRobotFileParser):
+            self.rfp_class = rfp_class
+            self.rfp = None
+            self._host = None
+
+        def http_request(self, request):
+            scheme = request.get_type()
+            if scheme not in ["http", "https"]:
+                # robots exclusion only applies to HTTP
+                return request
+
+            if request.get_selector() == "/robots.txt":
+                # /robots.txt is always OK to fetch
+                return request
+
+            host = request.get_host()
+
+            # robots.txt requests don't need to be allowed by robots.txt :-)
+            origin_req = getattr(request, "_origin_req", None)
+            if (origin_req is not None and
+                origin_req.get_selector() == "/robots.txt" and
+                origin_req.get_host() == host
+                ):
+                return request
+
+            if host != self._host:
+                self.rfp = self.rfp_class()
+                try:
+                    self.rfp.set_opener(self.parent)
+                except AttributeError:
+                    debug("%r instance does not support set_opener" %
+                          self.rfp.__class__)
+                self.rfp.set_url(scheme+"://"+host+"/robots.txt")
+                self.rfp.read()
+                self._host = host
+
+            ua = request.get_header("User-agent", "")
+            if self.rfp.can_fetch(ua, request.get_full_url()):
+                return request
+            else:
+                # XXX This should really have raised URLError.  Too late now...
+                msg = "request disallowed by robots.txt"
+                raise RobotExclusionError(
+                    request,
+                    request.get_full_url(),
+                    403, msg,
+                    self.http_response_class(StringIO()), StringIO(msg))
+
+        https_request = http_request
+
+class HTTPRefererProcessor(BaseHandler):
+    """Add Referer header to requests.
+
+    This only makes sense if you use each RefererProcessor for a single
+    chain of requests (so, for example, if you use a single
+    HTTPRefererProcessor to fetch a series of URLs extracted from a single
+    page, this will break).
+
+    There's a proper implementation of this in mechanize.Browser.
+
+    """
+    def __init__(self):
+        self.referer = None
+
+    def http_request(self, request):
+        if ((self.referer is not None) and
+            not request.has_header("Referer")):
+            request.add_unredirected_header("Referer", self.referer)
+        return request
+
+    def http_response(self, request, response):
+        self.referer = response.geturl()
+        return response
+
+    https_request = http_request
+    https_response = http_response
+
+
+def clean_refresh_url(url):
+    # e.g. Firefox 1.5 does (something like) this
+    if ((url.startswith('"') and url.endswith('"')) or
+        (url.startswith("'") and url.endswith("'"))):
+        url = url[1:-1]
+    return _rfc3986.clean_url(url, "latin-1")  # XXX encoding
+
+def parse_refresh_header(refresh):
+    """
+    >>> parse_refresh_header("1; url=http://example.com/")
+    (1.0, 'http://example.com/')
+    >>> parse_refresh_header("1; url='http://example.com/'")
+    (1.0, 'http://example.com/')
+    >>> parse_refresh_header("1")
+    (1.0, None)
+    >>> parse_refresh_header("blah")
+    Traceback (most recent call last):
+    ValueError: invalid literal for float(): blah
+
+    """
+
+    ii = refresh.find(";")
+    if ii != -1:
+        pause, newurl_spec = float(refresh[:ii]), refresh[ii+1:]
+        jj = newurl_spec.find("=")
+        key = None
+        if jj != -1:
+            key, newurl = newurl_spec[:jj], newurl_spec[jj+1:]
+            newurl = clean_refresh_url(newurl)
+        if key is None or key.strip().lower() != "url":
+            raise ValueError()
+    else:
+        pause, newurl = float(refresh), None
+    return pause, newurl
+
+class HTTPRefreshProcessor(BaseHandler):
+    """Perform HTTP Refresh redirections.
+
+    Note that if a non-200 HTTP code has occurred (for example, a 30x
+    redirect), this processor will do nothing.
+
+    By default, only zero-time Refresh headers are redirected.  Use the
+    max_time attribute / constructor argument to allow Refresh with longer
+    pauses.  Use the honor_time attribute / constructor argument to control
+    whether the requested pause is honoured (with a time.sleep()) or
+    skipped in favour of immediate redirection.
+
+    Public attributes:
+
+    max_time: see above
+    honor_time: see above
+
+    """
+    handler_order = 1000
+
+    def __init__(self, max_time=0, honor_time=True):
+        self.max_time = max_time
+        self.honor_time = honor_time
+
+    def http_response(self, request, response):
+        code, msg, hdrs = response.code, response.msg, response.info()
+
+        if code == 200 and hdrs.has_key("refresh"):
+            refresh = hdrs.getheaders("refresh")[0]
+            try:
+                pause, newurl = parse_refresh_header(refresh)
+            except ValueError:
+                debug("bad Refresh header: %r" % refresh)
+                return response
+            if newurl is None:
+                newurl = response.geturl()
+            if (self.max_time is None) or (pause <= self.max_time):
+                if pause > 1E-3 and self.honor_time:
+                    time.sleep(pause)
+                hdrs["location"] = newurl
+                # hardcoded http is NOT a bug
+                response = self.parent.error(
+                    "http", request, response,
+                    "refresh", msg, hdrs)
+
+        return response
+
+    https_response = http_response
+
+class HTTPErrorProcessor(BaseHandler):
+    """Process HTTP error responses.
+
+    The purpose of this handler is to allow other response processors a
+    look-in by removing the call to parent.error() from
+    AbstractHTTPHandler.
+
+    For non-200 error codes, this just passes the job on to the
+    Handler.<proto>_error_<code> methods, via the OpenerDirector.error
+    method.  Eventually, urllib2.HTTPDefaultErrorHandler will raise an
+    HTTPError if no other handler handles the error.
+
+    """
+    handler_order = 1000  # after all other processors
+
+    def http_response(self, request, response):
+        code, msg, hdrs = response.code, response.msg, response.info()
+
+        if code != 200:
+            # hardcoded http is NOT a bug
+            response = self.parent.error(
+                "http", request, response, code, msg, hdrs)
+
+        return response
+
+    https_response = http_response
+
+
+class HTTPDefaultErrorHandler(BaseHandler):
+    def http_error_default(self, req, fp, code, msg, hdrs):
+        # why these error methods took the code, msg, headers args in the first
+        # place rather than a response object, I don't know, but to avoid
+        # multiple wrapping, we're discarding them
+
+        if isinstance(fp, urllib2.HTTPError):
+            response = fp
+        else:
+            response = urllib2.HTTPError(
+                req.get_full_url(), code, msg, hdrs, fp)
+        assert code == response.code
+        assert msg == response.msg
+        assert hdrs == response.hdrs
+        raise response
+
+
+class AbstractHTTPHandler(BaseHandler):
+
+    def __init__(self, debuglevel=0):
+        self._debuglevel = debuglevel
+
+    def set_http_debuglevel(self, level):
+        self._debuglevel = level
+
+    def do_request_(self, request):
+        host = request.get_host()
+        if not host:
+            raise URLError('no host given')
+
+        if request.has_data():  # POST
+            data = request.get_data()
+            if not request.has_header('Content-type'):
+                request.add_unredirected_header(
+                    'Content-type',
+                    'application/x-www-form-urlencoded')
+
+        scheme, sel = urllib.splittype(request.get_selector())
+        sel_host, sel_path = urllib.splithost(sel)
+        if not request.has_header('Host'):
+            request.add_unredirected_header('Host', sel_host or host)
+        for name, value in self.parent.addheaders:
+            name = name.capitalize()
+            if not request.has_header(name):
+                request.add_unredirected_header(name, value)
+
+        return request
+
+    def do_open(self, http_class, req):
+        """Return an addinfourl object for the request, using http_class.
+
+        http_class must implement the HTTPConnection API from httplib.
+        The addinfourl return value is a file-like object.  It also
+        has methods and attributes including:
+            - info(): return a mimetools.Message object for the headers
+            - geturl(): return the original request URL
+            - code: HTTP status code
+        """
+        host = req.get_host()
+        if not host:
+            raise URLError('no host given')
+
+        h = http_class(host) # will parse host:port
+        h.set_debuglevel(self._debuglevel)
+
+        headers = dict(req.headers)
+        headers.update(req.unredirected_hdrs)
+        # We want to make an HTTP/1.1 request, but the addinfourl
+        # class isn't prepared to deal with a persistent connection.
+        # It will try to read all remaining data from the socket,
+        # which will block while the server waits for the next request.
+        # So make sure the connection gets closed after the (only)
+        # request.
+        headers["Connection"] = "close"
+        headers = dict(
+            [(name.title(), val) for name, val in headers.items()])
+        try:
+            h.request(req.get_method(), req.get_selector(), req.data, headers)
+            r = h.getresponse()
+        except socket.error, err: # XXX what error?
+            raise URLError(err)
+
+        # Pick apart the HTTPResponse object to get the addinfourl
+        # object initialized properly.
+
+        # Wrap the HTTPResponse object in socket's file object adapter
+        # for Windows.  That adapter calls recv(), so delegate recv()
+        # to read().  This weird wrapping allows the returned object to
+        # have readline() and readlines() methods.
+
+        # XXX It might be better to extract the read buffering code
+        # out of socket._fileobject() and into a base class.
+
+        r.recv = r.read
+        fp = socket._fileobject(r)
+
+        resp = closeable_response(fp, r.msg, req.get_full_url(),
+                                  r.status, r.reason)
+        return resp
+
+
+class HTTPHandler(AbstractHTTPHandler):
+    def http_open(self, req):
+        return self.do_open(httplib.HTTPConnection, req)
+
+    http_request = AbstractHTTPHandler.do_request_
+
+if hasattr(httplib, 'HTTPS'):
+
+    class HTTPSConnectionFactory:
+        def __init__(self, key_file, cert_file):
+            self._key_file = key_file
+            self._cert_file = cert_file
+        def __call__(self, hostport):
+            return httplib.HTTPSConnection(
+                hostport,
+                key_file=self._key_file, cert_file=self._cert_file)
+
+    class HTTPSHandler(AbstractHTTPHandler):
+        def __init__(self, client_cert_manager=None):
+            AbstractHTTPHandler.__init__(self)
+            self.client_cert_manager = client_cert_manager
+
+        def https_open(self, req):
+            if self.client_cert_manager is not None:
+                key_file, cert_file = self.client_cert_manager.find_key_cert(
+                    req.get_full_url())
+                conn_factory = HTTPSConnectionFactory(key_file, cert_file)
+            else:
+                conn_factory = httplib.HTTPSConnection
+            return self.do_open(conn_factory, req)
+
+        https_request = AbstractHTTPHandler.do_request_


Property changes on: Zope3/branches/adamg-mechanize-update/src/mechanize/_http.py
___________________________________________________________________
Name: svn:keywords
   + Date Author Id Revision
Name: svn:eol-style
   + native

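For reference, a minimal usage sketch (not part of the checkin itself) of how the handlers defined in the new _http.py are meant to be composed into an opener.  It assumes the names re-exported from mechanize/__init__.py, the build_opener helper from _opener.py, and a placeholder URL:

    import mechanize

    cj = mechanize.CookieJar()
    opener = mechanize.build_opener(
        mechanize.HTTPCookieProcessor(cj),           # adds/extracts cookies
        mechanize.HTTPEquivProcessor(),              # copies <meta http-equiv> into headers
        mechanize.HTTPRefreshProcessor(max_time=1),  # follow Refresh pauses up to 1s
        mechanize.HTTPRobotRulesProcessor(),         # raises RobotExclusionError if disallowed
        )
    response = opener.open("http://www.example.com/")
    print response.info().getheaders("content-type")

HTTPEquivProcessor runs at handler_order 300, so the headers it extracts from the HTML HEAD are already visible to the later processors (cookies, refresh) in the same chain.
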
Modified: Zope3/branches/adamg-mechanize-update/src/mechanize/_lwpcookiejar.py
===================================================================
--- Zope3/branches/adamg-mechanize-update/src/mechanize/_lwpcookiejar.py	2007-07-13 11:38:53 UTC (rev 77855)
+++ Zope3/branches/adamg-mechanize-update/src/mechanize/_lwpcookiejar.py	2007-07-13 13:21:22 UTC (rev 77856)
@@ -18,12 +18,12 @@
 
 """
 
-import time, re, string, logging
+import time, re, logging
 
 from _clientcookie import reraise_unmasked_exceptions, FileCookieJar, Cookie, \
      MISSING_FILENAME_TEXT, LoadError
 from _headersutil import join_header_words, split_header_words
-from _util import startswith, iso2time, time2isoz
+from _util import iso2time, time2isoz
 
 debug = logging.getLogger("mechanize").debug
 
@@ -89,7 +89,7 @@
                 debug("   Not saving %s: expired", cookie.name)
                 continue
             r.append("Set-Cookie3: %s" % lwp_cookie_str(cookie))
-        return string.join(r+[""], "\n")
+        return "\n".join(r+[""])
 
     def save(self, filename=None, ignore_discard=False, ignore_expires=False):
         if filename is None:
@@ -127,9 +127,9 @@
             while 1:
                 line = f.readline()
                 if line == "": break
-                if not startswith(line, header):
+                if not line.startswith(header):
                     continue
-                line = string.strip(line[len(header):])
+                line = line[len(header):].strip()
 
                 for data in split_header_words([line]):
                     name, value = data[0]
@@ -139,7 +139,7 @@
                         standard[k] = False
                     for k, v in data[1:]:
                         if k is not None:
-                            lc = string.lower(k)
+                            lc = k.lower()
                         else:
                             lc = None
                         # don't lose case distinction for unknown fields
@@ -161,7 +161,7 @@
                     if expires is None:
                         discard = True
                     domain = h("domain")
-                    domain_specified = startswith(domain, ".")
+                    domain_specified = domain.startswith(".")
                     c = Cookie(h("version"), name, value,
                                h("port"), h("port_spec"),
                                domain, domain_specified, h("domain_dot"),
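The _lwpcookiejar.py hunks above only swap deprecated string-module helpers for the equivalent str methods; behaviour is unchanged.  A tiny illustrative check (not taken from the source):

    import string

    lines = ["Set-Cookie3: sid=abc", "Set-Cookie3: lang=en"]
    old = string.join(lines + [""], "\n")   # style removed by this checkin
    new = "\n".join(lines + [""])           # style introduced by this checkin
    assert old == new
    # str.startswith/.strip/.lower replace _util.startswith and string.strip/lower
    assert "Set-Cookie3: sid=abc".startswith("Set-Cookie3")
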
Modified: Zope3/branches/adamg-mechanize-update/src/mechanize/_mechanize.py
===================================================================
--- Zope3/branches/adamg-mechanize-update/src/mechanize/_mechanize.py	2007-07-13 11:38:53 UTC (rev 77855)
+++ Zope3/branches/adamg-mechanize-update/src/mechanize/_mechanize.py	2007-07-13 13:21:22 UTC (rev 77856)
@@ -9,14 +9,15 @@
 
 """
 
-import urllib2, urlparse, sys, copy, re
+import urllib2, sys, copy, re
 
-from _useragent import UserAgent
+from _useragent import UserAgentBase
 from _html import DefaultFactory
-from _util import response_seek_wrapper, closeable_response
+import _response
 import _request
+import _rfc3986
 
-__version__ = (0, 1, 2, "b", None)  # 0.1.2b
+__version__ = (0, 1, 7, "b", None)  # 0.1.7b
 
 class BrowserStateError(Exception): pass
 class LinkNotFoundError(Exception): pass
@@ -45,60 +46,28 @@
     def clear(self):
         del self._history[:]
     def close(self):
-        """
-            If nothing has been added, .close should work.
-
-                >>> history = History()
-                >>> history.close()
-
-            Under some circumstances response can be None, in that case
-            this method should not raise an exception.
-
-                >>> history.add(None, None)
-                >>> history.close()
-        """
         for request, response in self._history:
             if response is not None:
                 response.close()
         del self._history[:]
 
-# Horrible, but needed, at least until fork urllib2.  Even then, may want
-# to preseve urllib2 compatibility.
-def upgrade_response(response):
-    # a urllib2 handler constructed the response, i.e. the response is an
-    # urllib.addinfourl, instead of a _Util.closeable_response as returned
-    # by e.g. mechanize.HTTPHandler
-    try:
-        code = response.code
-    except AttributeError:
-        code = None
-    try:
-        msg = response.msg
-    except AttributeError:
-        msg = None
 
-    # may have already-.read() data from .seek() cache
-    data = None
-    get_data = getattr(response, "get_data", None)
-    if get_data:
-        data = get_data()
+class HTTPRefererProcessor(urllib2.BaseHandler):
+    def http_request(self, request):
+        # See RFC 2616 14.36.  The only times we know the source of the
+        # request URI has a URI associated with it are redirect, and
+        # Browser.click() / Browser.submit() / Browser.follow_link().
+        # Otherwise, it's the user's job to add any Referer header before
+        # .open()ing.
+        if hasattr(request, "redirect_dict"):
+            request = self.parent._add_referer_header(
+                request, origin_request=False)
+        return request
 
-    response = closeable_response(
-        response.fp, response.info(), response.geturl(), code, msg)
-    response = response_seek_wrapper(response)
-    if data:
-        response.set_data(data)
-    return response
-class ResponseUpgradeProcessor(urllib2.BaseHandler):
-    # upgrade responses to be .close()able without becoming unusable
-    handler_order = 0  # before anything else
-    def any_response(self, request, response):
-        if not hasattr(response, 'closeable_response'):
-            response = upgrade_response(response)
-        return response
+    https_request = http_request
 
 
-class Browser(UserAgent):
+class Browser(UserAgentBase):
     """Browser-like class with support for history, forms and links.
 
     BrowserStateError is raised whenever the browser is in the wrong state to
@@ -113,10 +82,10 @@
 
     """
 
-    handler_classes = UserAgent.handler_classes.copy()
-    handler_classes["_response_upgrade"] = ResponseUpgradeProcessor
-    default_others = copy.copy(UserAgent.default_others)
-    default_others.append("_response_upgrade")
+    handler_classes = copy.copy(UserAgentBase.handler_classes)
+    handler_classes["_referer"] = HTTPRefererProcessor
+    default_features = copy.copy(UserAgentBase.default_features)
+    default_features.append("_referer")
 
     def __init__(self,
                  factory=None,
@@ -128,8 +97,8 @@
         Only named arguments should be passed to this constructor.
 
         factory: object implementing the mechanize.Factory interface.
-        history: object implementing the mechanize.History interface.  Note this
-         interface is still experimental and may change in future.
+        history: object implementing the mechanize.History interface.  Note
+         this interface is still experimental and may change in future.
         request_class: Request class to use.  Defaults to mechanize.Request
          by default for Pythons older than 2.4, urllib2.Request otherwise.
 
@@ -142,11 +111,11 @@
         constructor, to ensure only one Request class is used.
 
         """
+        self._handle_referer = True
+
         if history is None:
             history = History()
         self._history = history
-        self.request = self._response = None
-        self.form = None
 
         if request_class is None:
             if not hasattr(urllib2.Request, "add_unredirected_header"):
@@ -160,48 +129,108 @@
         self._factory = factory
         self.request_class = request_class
 
-        UserAgent.__init__(self)  # do this last to avoid __getattr__ problems
+        self.request = None
+        self._set_response(None, False)
 
+        # do this last to avoid __getattr__ problems
+        UserAgentBase.__init__(self)
+
     def close(self):
+        UserAgentBase.close(self)
         if self._response is not None:
             self._response.close()    
-        UserAgent.close(self)
         if self._history is not None:
             self._history.close()
             self._history = None
+
+        # make use after .close easy to spot
+        self.form = None
         self.request = self._response = None
+        self.request = self.response = self.set_response = None
+        self.geturl =  self.reload = self.back = None
+        self.clear_history = self.set_cookie = self.links = self.forms = None
+        self.viewing_html = self.encoding = self.title = None
+        self.select_form = self.click = self.submit = self.click_link = None
+        self.follow_link = self.find_link = None
 
+    def set_handle_referer(self, handle):
+        """Set whether to add Referer header to each request.
+
+        This base class does not implement this feature (so don't turn this on
+        if you're using this base class directly), but the subclass
+        mechanize.Browser does.
+
+        """
+        self._set_handler("_referer", handle)
+        self._handle_referer = bool(handle)
+
+    def _add_referer_header(self, request, origin_request=True):
+        if self.request is None:
+            return request
+        scheme = request.get_type()
+        original_scheme = self.request.get_type()
+        if scheme not in ["http", "https"]:
+            return request
+        if not origin_request and not self.request.has_header("Referer"):
+            return request
+
+        if (self._handle_referer and
+            original_scheme in ["http", "https"] and
+            not (original_scheme == "https" and scheme != "https")):
+            # strip URL fragment (RFC 2616 14.36)
+            parts = _rfc3986.urlsplit(self.request.get_full_url())
+            parts = parts[:-1]+(None,)
+            referer = _rfc3986.urlunsplit(parts)
+            request.add_unredirected_header("Referer", referer)
+        return request
+
+    def open_novisit(self, url, data=None):
+        """Open a URL without visiting it.
+
+        The browser state (including .request, .response(), history, forms
+        and links) is left unchanged by calling this function.
+
+        The interface is the same as for .open().
+
+        This is useful for things like fetching images.
+
+        See also .retrieve().
+
+        """
+        return self._mech_open(url, data, visit=False)
+
     def open(self, url, data=None):
-        if self._response is not None:
-            self._response.close()
         return self._mech_open(url, data)
 
-    def _mech_open(self, url, data=None, update_history=True):
+    def _mech_open(self, url, data=None, update_history=True, visit=None):
         try:
             url.get_full_url
         except AttributeError:
             # string URL -- convert to absolute URL if required
-            scheme, netloc = urlparse.urlparse(url)[:2]
-            if not scheme:
+            scheme, authority = _rfc3986.urlsplit(url)[:2]
+            if scheme is None:
                 # relative URL
-                assert not netloc, "malformed URL"
                 if self._response is None:
                     raise BrowserStateError(
-                        "can't fetch relative URL: not viewing any document")
-                url = urlparse.urljoin(self._response.geturl(), url)
+                        "can't fetch relative reference: "
+                        "not viewing any document")
+                url = _rfc3986.urljoin(self._response.geturl(), url)
 
-        if self.request is not None and update_history:
-            self._history.add(self.request, self._response)
-        self._response = None
-        # we want self.request to be assigned even if UserAgent.open fails
-        self.request = self._request(url, data)
-        self._previous_scheme = self.request.get_type()
+        request = self._request(url, data, visit)
+        visit = request.visit
+        if visit is None:
+            visit = True
 
+        if visit:
+            self._visit_request(request, update_history)
+
         success = True
         try:
-            response = UserAgent.open(self, self.request, data)
+            response = UserAgentBase.open(self, request, data)
         except urllib2.HTTPError, error:
             success = False
+            if error.fp is None:  # not a response
+                raise
             response = error
 ##         except (IOError, socket.error, OSError), error:
 ##             # Yes, urllib2 really does raise all these :-((
@@ -214,10 +243,16 @@
 ##             # Python core, a fix would need some backwards-compat. hack to be
 ##             # acceptable.
 ##             raise
-        self.set_response(response)
+
+        if visit:
+            self._set_response(response, False)
+            response = copy.copy(self._response)
+        elif response is not None:
+            response = _response.upgrade_response(response)
+
         if not success:
-            raise error
-        return copy.copy(self._response)
+            raise response
+        return response
 
     def __str__(self):
         text = []
@@ -241,24 +276,52 @@
         return copy.copy(self._response)
 
     def set_response(self, response):
-        """Replace current response with (a copy of) response."""
+        """Replace current response with (a copy of) response.
+
+        response may be None.
+
+        This is intended mostly for HTML-preprocessing.
+        """
+        self._set_response(response, True)
+
+    def _set_response(self, response, close_current):
         # sanity check, necessary but far from sufficient
-        if not (hasattr(response, "info") and hasattr(response, "geturl") and
-                hasattr(response, "read")):
+        if not (response is None or
+                (hasattr(response, "info") and hasattr(response, "geturl") and
+                 hasattr(response, "read")
+                 )
+                ):
             raise ValueError("not a response object")
 
         self.form = None
+        if response is not None:
+            response = _response.upgrade_response(response)
+        if close_current and self._response is not None:
+            self._response.close()
+        self._response = response
+        self._factory.set_response(response)
 
-        if not hasattr(response, "seek"):
-            response = response_seek_wrapper(response)
-        if not hasattr(response, "closeable_response"):
-            response = upgrade_response(response)
-        else:
-            response = copy.copy(response)
+    def visit_response(self, response, request=None):
+        """Visit the response, as if it had been .open()ed.
 
-        self._response = response
-        self._factory.set_response(self._response)
+        Unlike .set_response(), this updates history rather than replacing the
+        current response.
+        """
+        if request is None:
+            request = _request.Request(response.geturl())
+        self._visit_request(request, True)
+        self._set_response(response, False)
 
+    def _visit_request(self, request, update_history):
+        if self._response is not None:
+            self._response.close()
+        if self.request is not None and update_history:
+            self._history.add(self.request, self._response)
+        self._response = None
+        # we want self.request to be assigned even if UserAgentBase.open
+        # fails
+        self.request = request
+
     def geturl(self):
         """Get URL of current document."""
         if self._response is None:
@@ -283,11 +346,53 @@
             self._response.close()
         self.request, response = self._history.back(n, self._response)
         self.set_response(response)
-        return response
+        if not response.read_complete:
+            return self.reload()
+        return copy.copy(response)
 
     def clear_history(self):
         self._history.clear()
 
+    def set_cookie(self, cookie_string):
+        """Request to set a cookie.
+
+        Note that it is NOT necessary to call this method under ordinary
+        circumstances: cookie handling is normally entirely automatic.  The
+        intended use case is rather to simulate the setting of a cookie by
+        client script in a web page (e.g. JavaScript).  In that case, use of
+        this method is necessary because mechanize currently does not support
+        JavaScript, VBScript, etc.
+
+        The cookie is added in the same way as if it had arrived with the
+        current response, as a result of the current request.  This means
+        that, for example, if it is not appropriate to set the cookie based
+        on the current request, no cookie will be set.
+
+        The cookie will be returned automatically with subsequent responses
+        made by the Browser instance whenever that's appropriate.
+
+        cookie_string should be a valid value of the Set-Cookie header.
+
+        For example:
+
+        browser.set_cookie(
+            "sid=abcdef; expires=Wednesday, 09-Nov-06 23:12:40 GMT")
+
+        Currently, this method does not allow for adding RFC 2965 cookies.
+        This limitation will be lifted if anybody requests it.
+
+        """
+        if self._response is None:
+            raise BrowserStateError("not viewing any document")
+        if self.request.get_type() not in ["http", "https"]:
+            raise BrowserStateError("can't set cookie for non-HTTP/HTTPS "
+                                    "transactions")
+        cookiejar = self._ua_handlers["_cookies"].cookiejar
+        response = self.response()  # copy
+        headers = response.info()
+        headers["Set-cookie"] = cookie_string
+        cookiejar.extract_cookies(response, self.request)
+
     def links(self, **kwds):
         """Return iterable over links (mechanize.Link objects)."""
         if not self.viewing_html():
@@ -308,6 +413,24 @@
             raise BrowserStateError("not viewing HTML")
         return self._factory.forms()
 
+    def global_form(self):
+        """Return the global form object, or None if the factory implementation
+        did not supply one.
+
+        The "global" form object contains all controls that are not descendants of
+        any FORM element.
+
+        The returned form object implements the ClientForm.HTMLForm interface.
+
+        This is a separate method since the global form is not regarded as part
+        of the sequence of forms in the document -- mostly for
+        backwards-compatibility.
+
+        """
+        if not self.viewing_html():
+            raise BrowserStateError("not viewing HTML")
+        return self._factory.global_form
+
     def viewing_html(self):
         """Return whether the current response contains HTML data."""
         if self._response is None:
@@ -340,6 +463,10 @@
         interface, so you can call methods like .set_value(), .set(), and
         .click().
 
+        Another way to select a form is to assign to the .form attribute.  The
+        form assigned should be one of the objects returned by the .forms()
+        method.
+
         At least one of the name, predicate and nr arguments must be supplied.
         If no matching form is found, mechanize.FormNotFoundError is raised.
 
@@ -382,26 +509,6 @@
             description = ", ".join(description)
             raise FormNotFoundError("no form matching "+description)
 
-    def _add_referer_header(self, request, origin_request=True):
-        if self.request is None:
-            return request
-        scheme = request.get_type()
-        original_scheme = self.request.get_type()
-        if scheme not in ["http", "https"]:
-            return request
-        if not origin_request and not self.request.has_header("Referer"):
-            return request
-
-        if (self._handle_referer and
-            original_scheme in ["http", "https"] and
-            not (original_scheme == "https" and scheme != "https")):
-            # strip URL fragment (RFC 2616 14.36)
-            parts = urlparse.urlparse(self.request.get_full_url())
-            parts = parts[:-1]+("",)
-            referer = urlparse.urlunparse(parts)
-            request.add_unredirected_header("Referer", referer)
-        return request
-
     def click(self, *args, **kwds):
         """See ClientForm.HTMLForm.click for documentation."""
         if not self.viewing_html():
@@ -507,9 +614,6 @@
                 ".select_form()?)" % (self.__class__, name))
         return getattr(form, name)
 
-#---------------------------------------------------
-# Private methods.
-
     def _filter_links(self, links,
                     text=None, text_regex=None,
                     name=None, name_regex=None,

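A hedged sketch of the Browser-level API after these _mechanize.py changes (open_novisit, set_cookie, global_form, the reload-on-incomplete back()).  The URLs and cookie string are placeholders, and it assumes the branch's mechanize package is importable:

    import mechanize

    br = mechanize.Browser()
    br.open("http://www.example.com/")          # visits: updates history and forms
    print br.geturl()

    # fetch an auxiliary resource (e.g. an image) without disturbing browser state
    img = br.open_novisit("http://www.example.com/logo.png")

    # simulate a cookie that client-side script in the page would have set
    br.set_cookie("sid=abcdef; expires=Wednesday, 09-Nov-06 23:12:40 GMT")

    # controls that sit outside any FORM element (may be None)
    print br.global_form()

    br.open("http://www.example.com/other")    # hypothetical second page
    br.back()    # back to the first page; re-fetches if the cached body was incomplete
    br.close()   # afterwards most Browser methods are deliberately unusable
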
Modified: Zope3/branches/adamg-mechanize-update/src/mechanize/_mozillacookiejar.py
===================================================================
--- Zope3/branches/adamg-mechanize-update/src/mechanize/_mozillacookiejar.py	2007-07-13 11:38:53 UTC (rev 77855)
+++ Zope3/branches/adamg-mechanize-update/src/mechanize/_mozillacookiejar.py	2007-07-13 13:21:22 UTC (rev 77856)
@@ -9,11 +9,10 @@
 
 """
 
-import re, string, time, logging
+import re, time, logging
 
 from _clientcookie import reraise_unmasked_exceptions, FileCookieJar, Cookie, \
      MISSING_FILENAME_TEXT, LoadError
-from _util import startswith, endswith
 debug = logging.getLogger("ClientCookie").debug
 
 
@@ -72,23 +71,23 @@
                 if line == "": break
 
                 # last field may be absent, so keep any trailing tab
-                if endswith(line, "\n"): line = line[:-1]
+                if line.endswith("\n"): line = line[:-1]
 
                 # skip comments and blank lines XXX what is $ for?
-                if (startswith(string.strip(line), "#") or
-                    startswith(string.strip(line), "$") or
-                    string.strip(line) == ""):
+                if (line.strip().startswith("#") or
+                    line.strip().startswith("$") or
+                    line.strip() == ""):
                     continue
 
                 domain, domain_specified, path, secure, expires, name, value = \
-                        string.split(line, "\t")
+                        line.split("\t")
                 secure = (secure == "TRUE")
                 domain_specified = (domain_specified == "TRUE")
                 if name == "":
                     name = value
                     value = None
 
-                initial_dot = startswith(domain, ".")
+                initial_dot = domain.startswith(".")
                 assert domain_specified == initial_dot
 
                 discard = False
@@ -137,7 +136,7 @@
                     continue
                 if cookie.secure: secure = "TRUE"
                 else: secure = "FALSE"
-                if startswith(cookie.domain, "."): initial_dot = "TRUE"
+                if cookie.domain.startswith("."): initial_dot = "TRUE"
                 else: initial_dot = "FALSE"
                 if cookie.expires is not None:
                     expires = str(cookie.expires)
@@ -153,8 +152,8 @@
                     name = cookie.name
                     value = cookie.value
                 f.write(
-                    string.join([cookie.domain, initial_dot, cookie.path,
-                                 secure, expires, name, value], "\t")+
+                    "\t".join([cookie.domain, initial_dot, cookie.path,
+                               secure, expires, name, value])+
                     "\n")
         finally:
             f.close()

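For completeness, a short sketch of the load/save path these _mozillacookiejar.py hunks touch; "cookies.txt" is a hypothetical Netscape/Mozilla-format cookie file:

    from mechanize import MozillaCookieJar

    cj = MozillaCookieJar()
    cj.load("cookies.txt", ignore_discard=True, ignore_expires=True)
    for cookie in cj:
        print cookie.domain, cookie.name, cookie.value
    cj.save("cookies-copy.txt", ignore_discard=True, ignore_expires=True)
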
Modified: Zope3/branches/adamg-mechanize-update/src/mechanize/_msiecookiejar.py
===================================================================
--- Zope3/branches/adamg-mechanize-update/src/mechanize/_msiecookiejar.py	2007-07-13 11:38:53 UTC (rev 77855)
+++ Zope3/branches/adamg-mechanize-update/src/mechanize/_msiecookiejar.py	2007-07-13 13:21:22 UTC (rev 77856)
@@ -11,13 +11,12 @@
 
 # XXX names and comments are not great here
 
-import os, re, string, time, struct, logging
+import os, re, time, struct, logging
 if os.name == "nt":
     import _winreg
 
 from _clientcookie import FileCookieJar, CookieJar, Cookie, \
      MISSING_FILENAME_TEXT, LoadError
-from _util import startswith
 
 debug = logging.getLogger("mechanize").debug
 
@@ -50,7 +49,7 @@
     return divmod((filetime - WIN32_EPOCH), 10000000L)[0]
 
 def binary_to_char(c): return "%02X" % ord(c)
-def binary_to_str(d): return string.join(map(binary_to_char, list(d)), "")
+def binary_to_str(d): return "".join(map(binary_to_char, list(d)))
 
 class MSIEBase:
     magic_re = re.compile(r"Client UrlCache MMF Ver \d\.\d.*")
@@ -153,7 +152,7 @@
             else:
                 discard = False
             domain = cookie["DOMAIN"]
-            initial_dot = startswith(domain, ".")
+            initial_dot = domain.startswith(".")
             if initial_dot:
                 domain_specified = True
             else:
@@ -201,7 +200,7 @@
         now = int(time.time())
 
         if username is None:
-            username = string.lower(os.environ['USERNAME'])
+            username = os.environ['USERNAME'].lower()
 
         cookie_dir = os.path.dirname(filename)
 

Modified: Zope3/branches/adamg-mechanize-update/src/mechanize/_opener.py
===================================================================
--- Zope3/branches/adamg-mechanize-update/src/mechanize/_opener.py	2007-07-13 11:38:53 UTC (rev 77855)
+++ Zope3/branches/adamg-mechanize-update/src/mechanize/_opener.py	2007-07-13 13:21:22 UTC (rev 77856)
@@ -9,92 +9,31 @@
 
 """
 
-import urllib2, string, bisect, urlparse
-
-from _util import startswith, isstringlike
-from _request import Request
-
+import os, urllib2, bisect, urllib, httplib, types, tempfile
 try:
+    import threading as _threading
+except ImportError:
+    import dummy_threading as _threading
+try:
     set
 except NameError:
     import sets
     set = sets.Set
 
-def methnames(obj):
-    """Return method names of class instance.
+import _http
+import _upgrade
+import _rfc3986
+import _response
+from _util import isstringlike
+from _request import Request
 
-    dir(obj) doesn't work across Python versions, this does.
 
-    """
-    return methnames_of_instance_as_dict(obj).keys()
+class ContentTooShortError(urllib2.URLError):
+    def __init__(self, reason, result):
+        urllib2.URLError.__init__(self, reason)
+        self.result = result
 
-def methnames_of_instance_as_dict(inst):
-    """
-    It is possible for an attribute to be present in the results of dir(inst),
-    but for getattr(inst, attr_name) to raise an Attribute error, that should
-    be handled gracefully.
 
-        >>> class BadAttr(object):
-        ...     def error(self):
-        ...         raise AttributeError
-        ...     error = property(error)
-
-        >>> inst = BadAttr()
-        >>> 'error' in dir(inst)
-        True
-        >>> inst.error
-        Traceback (most recent call last):
-        ...
-        AttributeError
-
-        >>> result = methnames_of_instance_as_dict(inst) # no exception
-    """
-    names = {}
-    names.update(methnames_of_class_as_dict(inst.__class__))
-    for methname in dir(inst):
-        try:
-            candidate = getattr(inst, methname)
-        except AttributeError:
-            continue
-        if callable(candidate):
-            names[methname] = None
-    return names
-
-def methnames_of_class_as_dict(klass):
-    """
-    It is possible for an attribute to be present in the results of dir(inst),
-    but for getattr(inst, attr_name) to raise an Attribute error, that should
-    be handled gracefully.
-
-        >>> class BadClass(object):
-        ...     def error(self):
-        ...         raise AttributeError
-        ...     error = property(error)
-        ...     __bases__ = []
-
-        >>> klass = BadClass()
-        >>> 'error' in dir(klass)
-        True
-        >>> klass.error
-        Traceback (most recent call last):
-        ...
-        AttributeError
-
-        >>> result = methnames_of_class_as_dict(klass) # no exception
-    """
-    names = {}
-    for methname in dir(klass):
-        try:
-            candidate = getattr(klass, methname)
-        except AttributeError:
-            continue
-        if callable(candidate):
-            names[methname] = None
-    for baseclass in klass.__bases__:
-        names.update(methnames_of_class_as_dict(baseclass))
-    return names
-
-
 class OpenerDirector(urllib2.OpenerDirector):
     def __init__(self):
         urllib2.OpenerDirector.__init__(self)
@@ -105,6 +44,7 @@
         self._any_request = {}
         self._any_response = {}
         self._handler_index_valid = True
+        self._tempfiles = []
 
     def add_handler(self, handler):
         if handler in self.handlers:
@@ -128,7 +68,7 @@
 
         for handler in self.handlers:
             added = False
-            for meth in methnames(handler):
+            for meth in dir(handler):
                 if meth in ["redirect_request", "do_open", "proxy_open"]:
                     # oops, coincidental match
                     continue
@@ -146,8 +86,8 @@
                 scheme = meth[:ii]
                 condition = meth[ii+1:]
 
-                if startswith(condition, "error"):
-                    jj = string.find(meth[ii+1:], "_") + ii + 1
+                if condition.startswith("error"):
+                    jj = meth[ii+1:].find("_") + ii + 1
                     kind = meth[jj+1:]
                     try:
                         kind = int(kind)
@@ -198,18 +138,25 @@
         self._any_request = any_request
         self._any_response = any_response
 
-    def _request(self, url_or_req, data):
+    def _request(self, url_or_req, data, visit):
         if isstringlike(url_or_req):
-            req = Request(url_or_req, data)
+            req = Request(url_or_req, data, visit=visit)
         else:
             # already a urllib2.Request or mechanize.Request instance
             req = url_or_req
             if data is not None:
                 req.add_data(data)
+            # XXX yuck, give request a .visit attribute if it doesn't have one
+            try:
+                req.visit
+            except AttributeError:
+                req.visit = None
+            if visit is not None:
+                req.visit = visit
         return req
 
     def open(self, fullurl, data=None):
-        req = self._request(fullurl, data)
+        req = self._request(fullurl, data, None)
         req_scheme = req.get_type()
 
         self._maybe_reindex_handlers()
@@ -267,48 +214,208 @@
             args = (dict, 'default', 'http_error_default') + orig_args
             return apply(self._call_chain, args)
 
+    BLOCK_SIZE = 1024*8
     def retrieve(self, fullurl, filename=None, reporthook=None, data=None):
         """Returns (filename, headers).
 
         For remote objects, the default filename will refer to a temporary
-        file.
+        file.  Temporary files are removed when the OpenerDirector.close()
+        method is called.
 
+        For file: URLs, at present the returned filename is None.  This may
+        change in future.
+
+        If the actual number of bytes read is less than indicated by the
+        Content-Length header, raises ContentTooShortError (a URLError
+        subclass).  The exception's .result attribute contains the (filename,
+        headers) that would have been returned.
+
         """
-        req = self._request(fullurl, data)
-        type_ = req.get_type()
+        req = self._request(fullurl, data, False)
+        scheme = req.get_type()
         fp = self.open(req)
         headers = fp.info()
-        if filename is None and type == 'file':
-            return url2pathname(req.get_selector()), headers
+        if filename is None and scheme == 'file':
+            # XXX req.get_selector() seems broken here, return None,
+            #   pending sanity :-/
+            return None, headers
+            #return urllib.url2pathname(req.get_selector()), headers
         if filename:
             tfp = open(filename, 'wb')
         else:
-            path = urlparse(fullurl)[2]
+            path = _rfc3986.urlsplit(fullurl)[2]
             suffix = os.path.splitext(path)[1]
-            tfp = tempfile.TemporaryFile("wb", suffix=suffix)
+            fd, filename = tempfile.mkstemp(suffix)
+            self._tempfiles.append(filename)
+            tfp = os.fdopen(fd, 'wb')
+
         result = filename, headers
-        bs = 1024*8
+        bs = self.BLOCK_SIZE
         size = -1
         read = 0
-        blocknum = 1
+        blocknum = 0
         if reporthook:
-            if headers.has_key("content-length"):
+            if "content-length" in headers:
                 size = int(headers["Content-Length"])
-            reporthook(0, bs, size)
+            reporthook(blocknum, bs, size)
         while 1:
             block = fp.read(bs)
+            if block == "":
+                break
             read += len(block)
+            tfp.write(block)
+            blocknum += 1
             if reporthook:
                 reporthook(blocknum, bs, size)
-            blocknum = blocknum + 1
-            if not block:
-                break
-            tfp.write(block)
         fp.close()
         tfp.close()
         del fp
         del tfp
-        if size>=0 and read<size:
-            raise IOError("incomplete retrieval error",
-                          "got only %d bytes out of %d" % (read,size))
+
+        # raise exception if actual size does not match content-length header
+        if size >= 0 and read < size:
+            raise ContentTooShortError(
+                "retrieval incomplete: "
+                "got only %i out of %i bytes" % (read, size),
+                result
+                )
+
         return result
+
+    def close(self):
+        urllib2.OpenerDirector.close(self)
+
+        # make it very obvious this object is no longer supposed to be used
+        self.open = self.error = self.retrieve = self.add_handler = None
+
+        if self._tempfiles:
+            for filename in self._tempfiles:
+                try:
+                    os.unlink(filename)
+                except OSError:
+                    pass
+            del self._tempfiles[:]
+
+
+def wrapped_open(urlopen, process_response_object, fullurl, data=None):
+    success = True
+    try:
+        response = urlopen(fullurl, data)
+    except urllib2.HTTPError, error:
+        success = False
+        if error.fp is None:  # not a response
+            raise
+        response = error
+
+    if response is not None:
+        response = process_response_object(response)
+
+    if not success:
+        raise response
+    return response
+
+class ResponseProcessingOpener(OpenerDirector):
+
+    def open(self, fullurl, data=None):
+        def bound_open(fullurl, data=None):
+            return OpenerDirector.open(self, fullurl, data)
+        return wrapped_open(
+            bound_open, self.process_response_object, fullurl, data)
+
+    def process_response_object(self, response):
+        return response
+
+
+class SeekableResponseOpener(ResponseProcessingOpener):
+    def process_response_object(self, response):
+        return _response.seek_wrapped_response(response)
+
+
+class OpenerFactory:
+    """This class's interface is quite likely to change."""
+
+    default_classes = [
+        # handlers
+        urllib2.ProxyHandler,
+        urllib2.UnknownHandler,
+        _http.HTTPHandler,  # derived from new AbstractHTTPHandler
+        _http.HTTPDefaultErrorHandler,
+        _http.HTTPRedirectHandler,  # bugfixed
+        urllib2.FTPHandler,
+        urllib2.FileHandler,
+        # processors
+        _upgrade.HTTPRequestUpgradeProcessor,
+        _http.HTTPCookieProcessor,
+        _http.HTTPErrorProcessor,
+        ]
+    if hasattr(httplib, 'HTTPS'):
+        default_classes.append(_http.HTTPSHandler)
+    handlers = []
+    replacement_handlers = []
+
+    def __init__(self, klass=OpenerDirector):
+        self.klass = klass
+
+    def build_opener(self, *handlers):
+        """Create an opener object from a list of handlers and processors.
+
+        The opener will use several default handlers and processors, including
+        support for HTTP and FTP.
+
+        If any of the handlers passed as arguments are subclasses of the
+        default handlers, the default handlers will not be used.
+
+        """
+        opener = self.klass()
+        default_classes = list(self.default_classes)
+        skip = []
+        for klass in default_classes:
+            for check in handlers:
+                if type(check) == types.ClassType:
+                    if issubclass(check, klass):
+                        skip.append(klass)
+                elif type(check) == types.InstanceType:
+                    if isinstance(check, klass):
+                        skip.append(klass)
+        for klass in skip:
+            default_classes.remove(klass)
+
+        for klass in default_classes:
+            opener.add_handler(klass())
+        for h in handlers:
+            if type(h) == types.ClassType:
+                h = h()
+            opener.add_handler(h)
+
+        return opener
+
+
+build_opener = OpenerFactory().build_opener
+
+_opener = None
+urlopen_lock = _threading.Lock()
+def urlopen(url, data=None):
+    global _opener
+    if _opener is None:
+        urlopen_lock.acquire()
+        try:
+            if _opener is None:
+                _opener = build_opener()
+        finally:
+            urlopen_lock.release()
+    return _opener.open(url, data)
+
+def urlretrieve(url, filename=None, reporthook=None, data=None):
+    global _opener
+    if _opener is None:
+        urlopen_lock.acquire()
+        try:
+            if _opener is None:
+                _opener = build_opener()
+        finally:
+            urlopen_lock.release()
+    return _opener.retrieve(url, filename, reporthook, data)
+
+def install_opener(opener):
+    global _opener
+    _opener = opener

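The reworked retrieve() now writes to mkstemp-backed temporary files (removed again by OpenerDirector.close()) and raises ContentTooShortError rather than IOError when fewer bytes arrive than the Content-Length header promised.  A hedged usage sketch via the module-level helpers, assuming they are re-exported from the package as in upstream mechanize; the URL is a placeholder:

    import mechanize

    def report(block_num, block_size, total_size):
        # total_size is -1 when the server sent no Content-Length
        print "block %d (%d-byte blocks, total %d)" % (block_num, block_size, total_size)

    try:
        filename, headers = mechanize.urlretrieve(
            "http://www.example.com/big.tar.gz", reporthook=report)
    except mechanize.ContentTooShortError, exc:
        filename, headers = exc.result   # the truncated download is still on disk
        print "short read, partial file at", filename
    else:
        print "saved to", filename, headers.get("content-length")
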
Modified: Zope3/branches/adamg-mechanize-update/src/mechanize/_request.py
===================================================================
--- Zope3/branches/adamg-mechanize-update/src/mechanize/_request.py	2007-07-13 11:38:53 UTC (rev 77855)
+++ Zope3/branches/adamg-mechanize-update/src/mechanize/_request.py	2007-07-13 13:21:22 UTC (rev 77856)
@@ -8,16 +8,33 @@
 
 """
 
-import urllib2, string
+import urllib2, urllib, logging
 
 from _clientcookie import request_host
+import _rfc3986
 
+warn = logging.getLogger("mechanize").warning
+# don't complain about missing logging handler
+logging.getLogger("mechanize").setLevel(logging.ERROR)
 
+
 class Request(urllib2.Request):
     def __init__(self, url, data=None, headers={},
-             origin_req_host=None, unverifiable=False):
+                 origin_req_host=None, unverifiable=False, visit=None):
+        # In mechanize 0.2, the interpretation of a unicode url argument will
+        # change: A unicode url argument will be interpreted as an IRI, and a
+        # bytestring as a URI. For now, we accept unicode or bytestring.  We
+        # don't insist that the value is always a URI (specifically, must only
+        # contain characters which are legal), because that might break working
+        # code (who knows what bytes some servers want to see, especially with
+        # browser plugins for internationalised URIs).
+        if not _rfc3986.is_clean_uri(url):
+            warn("url argument is not a URI "
+                 "(contains illegal characters) %r" % url)
         urllib2.Request.__init__(self, url, data, headers)
+        self.selector = None
         self.unredirected_hdrs = {}
+        self.visit = visit
 
         # All the terminology below comes from RFC 2965.
         self.unverifiable = unverifiable
@@ -31,6 +48,9 @@
             origin_req_host = request_host(self)
         self.origin_req_host = origin_req_host
 
+    def get_selector(self):
+        return urllib.splittag(self.__r_host)[0]
+
     def get_origin_req_host(self):
         return self.origin_req_host
 
@@ -39,14 +59,12 @@
 
     def add_unredirected_header(self, key, val):
         """Add a header that will not be added to a redirected request."""
-        self.unredirected_hdrs[string.capitalize(key)] = val
+        self.unredirected_hdrs[key.capitalize()] = val
 
     def has_header(self, header_name):
         """True iff request has named header (regular or unredirected)."""
-        if (self.headers.has_key(header_name) or
-            self.unredirected_hdrs.has_key(header_name)):
-            return True
-        return False
+        return (header_name in self.headers or
+                header_name in self.unredirected_hdrs)
 
     def get_header(self, header_name, default=None):
         return self.headers.get(

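A small sketch of constructing the updated Request directly, showing the new visit flag alongside the existing RFC 2965 arguments; the URL is a placeholder and it assumes Request is exported at package level:

    import mechanize

    req = mechanize.Request("http://www.example.com/search",
                            data="q=zope",        # presence of data makes this a POST
                            unverifiable=True,    # RFC 2965 sense: user could not approve it
                            visit=False)          # Browser will not record it in history
    req.add_unredirected_header("Referer", "http://www.example.com/")
    print req.get_method(), req.has_header("Referer")
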
Added: Zope3/branches/adamg-mechanize-update/src/mechanize/_response.py
===================================================================
--- Zope3/branches/adamg-mechanize-update/src/mechanize/_response.py	                        (rev 0)
+++ Zope3/branches/adamg-mechanize-update/src/mechanize/_response.py	2007-07-13 13:21:22 UTC (rev 77856)
@@ -0,0 +1,514 @@
+"""Response classes.
+
+The seek_wrapper code is not used if you're using UserAgent with
+.set_seekable_responses(False), or if you're using the urllib2-level interface
+without SeekableProcessor or HTTPEquivProcessor.  Class closeable_response is
+instantiated by some handlers (AbstractHTTPHandler), but the closeable_response
+interface is only depended upon by Browser-level code.  Function
+upgrade_response is only used if you're using Browser or
+ResponseUpgradeProcessor.
+
+
+Copyright 2006 John J. Lee <jjl at pobox.com>
+
+This code is free software; you can redistribute it and/or modify it
+under the terms of the BSD or ZPL 2.1 licenses (see the file COPYING.txt
+included with the distribution).
+
+"""
+
+import copy, mimetools
+from cStringIO import StringIO
+import urllib2
+
+# XXX Andrew Dalke kindly sent me a similar class in response to my request on
+# comp.lang.python, which I then proceeded to lose.  I wrote this class
+# instead, but I think he's released his code publicly since, could pinch the
+# tests from it, at least...
+
+# For testing seek_wrapper invariant (note that
+# test_urllib2.HandlerTest.test_seekable is expected to fail when this
+# invariant checking is turned on).  The invariant checking is done by module
+# ipdc, which is available here:
+# http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/436834
+## from ipdbc import ContractBase
+## class seek_wrapper(ContractBase):
+class seek_wrapper:
+    """Adds a seek method to a file object.
+
+    This is only designed for seeking on readonly file-like objects.
+
+    Wrapped file-like object must have a read method.  The readline method is
+    only supported if that method is present on the wrapped object.  The
+    readlines method is always supported.  xreadlines and iteration are
+    supported only for Python 2.2 and above.
+
+    Public attributes:
+
+    wrapped: the wrapped file object
+    is_closed: true iff .close() has been called
+
+    WARNING: All other attributes of the wrapped object (i.e. those that are
+    not
+    one of wrapped, read, readline, readlines, xreadlines, __iter__ and next)
+    are passed through unaltered, which may or may not make sense for your
+    particular file object.
+
+    """
+    # General strategy is to check that cache is full enough, then delegate to
+    # the cache (self.__cache, which is a cStringIO.StringIO instance).  A seek
+    # position (self.__pos) is maintained independently of the cache, in order
+    # that a single cache may be shared between multiple seek_wrapper objects.
+    # Copying using module copy shares the cache in this way.
+
+    def __init__(self, wrapped):
+        self.wrapped = wrapped
+        self.__read_complete_state = [False]
+        self.__is_closed_state = [False]
+        self.__have_readline = hasattr(self.wrapped, "readline")
+        self.__cache = StringIO()
+        self.__pos = 0  # seek position
+
+    def invariant(self):
+        # The end of the cache is always at the same place as the end of the
+        # wrapped file.
+        return self.wrapped.tell() == len(self.__cache.getvalue())
+
+    def close(self):
+        self.wrapped.close()
+        self.is_closed = True
+
+    def __getattr__(self, name):
+        if name == "is_closed":
+            return self.__is_closed_state[0]
+        elif name == "read_complete":
+            return self.__read_complete_state[0]
+
+        wrapped = self.__dict__.get("wrapped")
+        if wrapped:
+            return getattr(wrapped, name)
+
+        return getattr(self.__class__, name)
+
+    def __setattr__(self, name, value):
+        if name == "is_closed":
+            self.__is_closed_state[0] = bool(value)
+        elif name == "read_complete":
+            if not self.is_closed:
+                self.__read_complete_state[0] = bool(value)
+        else:
+            self.__dict__[name] = value
+
+    def seek(self, offset, whence=0):
+        assert whence in [0,1,2]
+
+        # how much data, if any, do we need to read?
+        if whence == 2:  # 2: relative to end of *wrapped* file
+            if offset < 0: raise ValueError("negative seek offset")
+            # since we don't know yet where the end of that file is, we must
+            # read everything
+            to_read = None
+        else:
+            if whence == 0:  # 0: absolute
+                if offset < 0: raise ValueError("negative seek offset")
+                dest = offset
+            else:  # 1: relative to current position
+                pos = self.__pos
+                if pos + offset < 0:
+                    raise ValueError("seek to before start of file")
+                dest = pos + offset
+            end = len(self.__cache.getvalue())
+            to_read = dest - end
+            if to_read < 0:
+                to_read = 0
+
+        if to_read != 0:
+            self.__cache.seek(0, 2)
+            if to_read is None:
+                assert whence == 2
+                self.__cache.write(self.wrapped.read())
+                self.read_complete = True
+                self.__pos = self.__cache.tell() - offset
+            else:
+                data = self.wrapped.read(to_read)
+                if not data:
+                    self.read_complete = True
+                else:
+                    self.__cache.write(data)
+                # Don't raise an exception even if we've seek()ed past the end
+                # of .wrapped, since fseek() doesn't complain in that case.
+                # Also like fseek(), pretend we have seek()ed past the end,
+                # i.e. not:
+                #self.__pos = self.__cache.tell()
+                # but rather:
+                self.__pos = dest
+        else:
+            self.__pos = dest
+
+    def tell(self):
+        return self.__pos
+
+    def __copy__(self):
+        cpy = self.__class__(self.wrapped)
+        cpy.__cache = self.__cache
+        cpy.__read_complete_state = self.__read_complete_state
+        cpy.__is_closed_state = self.__is_closed_state
+        return cpy
+
+    def get_data(self):
+        pos = self.__pos
+        try:
+            self.seek(0)
+            return self.read(-1)
+        finally:
+            self.__pos = pos
+
+    def read(self, size=-1):
+        pos = self.__pos
+        end = len(self.__cache.getvalue())
+        available = end - pos
+
+        # enough data already cached?
+        if size <= available and size != -1:
+            self.__cache.seek(pos)
+            self.__pos = pos+size
+            return self.__cache.read(size)
+
+        # no, so read sufficient data from wrapped file and cache it
+        self.__cache.seek(0, 2)
+        if size == -1:
+            self.__cache.write(self.wrapped.read())
+            self.read_complete = True
+        else:
+            to_read = size - available
+            assert to_read > 0
+            data = self.wrapped.read(to_read)
+            if not data:
+                self.read_complete = True
+            else:
+                self.__cache.write(data)
+        self.__cache.seek(pos)
+
+        data = self.__cache.read(size)
+        self.__pos = self.__cache.tell()
+        assert self.__pos == pos + len(data)
+        return data
+
+    def readline(self, size=-1):
+        if not self.__have_readline:
+            raise NotImplementedError("no readline method on wrapped object")
+
+        # line we're about to read might not be complete in the cache, so
+        # read another line first
+        pos = self.__pos
+        self.__cache.seek(0, 2)
+        data = self.wrapped.readline()
+        if not data:
+            self.read_complete = True
+        else:
+            self.__cache.write(data)
+        self.__cache.seek(pos)
+
+        data = self.__cache.readline()
+        if size != -1:
+            r = data[:size]
+            self.__pos = pos+size
+        else:
+            r = data
+            self.__pos = pos+len(data)
+        return r
+
+    def readlines(self, sizehint=-1):
+        pos = self.__pos
+        self.__cache.seek(0, 2)
+        self.__cache.write(self.wrapped.read())
+        self.read_complete = True
+        self.__cache.seek(pos)
+        data = self.__cache.readlines(sizehint)
+        self.__pos = self.__cache.tell()
+        return data
+
+    def __iter__(self): return self
+    def next(self):
+        line = self.readline()
+        if line == "": raise StopIteration
+        return line
+
+    xreadlines = __iter__
+
+    def __repr__(self):
+        return ("<%s at %s whose wrapped object = %r>" %
+                (self.__class__.__name__, hex(abs(id(self))), self.wrapped))
+
+
+class response_seek_wrapper(seek_wrapper):
+
+    """
+    Supports copying response objects and setting response body data.
+
+    """
+
+    def __init__(self, wrapped):
+        seek_wrapper.__init__(self, wrapped)
+        self._headers = self.wrapped.info()
+
+    def __copy__(self):
+        cpy = seek_wrapper.__copy__(self)
+        # copy headers from delegate
+        cpy._headers = copy.copy(self.info())
+        return cpy
+
+    # Note that .info() and .geturl() (the only two urllib2 response methods
+    # that are not implemented by seek_wrapper) must be here explicitly rather
+    # than by seek_wrapper's __getattr__ delegation, so that the nasty
+    # dynamically-created HTTPError classes in get_seek_wrapper_class() get the
+    # wrapped object's implementation, and not HTTPError's.
+
+    def info(self):
+        return self._headers
+
+    def geturl(self):
+        return self.wrapped.geturl()
+
+    def set_data(self, data):
+        self.seek(0)
+        self.read()
+        self.close()
+        cache = self._seek_wrapper__cache = StringIO()
+        cache.write(data)
+        self.seek(0)
+
+
+class eoffile:
+    # file-like object that always claims to be at end-of-file...
+    def read(self, size=-1): return ""
+    def readline(self, size=-1): return ""
+    def __iter__(self): return self
+    def next(self): return ""
+    def close(self): pass
+
+class eofresponse(eoffile):
+    def __init__(self, url, headers, code, msg):
+        self._url = url
+        self._headers = headers
+        self.code = code
+        self.msg = msg
+    def geturl(self): return self._url
+    def info(self): return self._headers
+
+
+class closeable_response:
+    """Avoids unnecessarily clobbering urllib.addinfourl methods on .close().
+
+    Only supports responses returned by mechanize.HTTPHandler.
+
+    After .close(), the following methods are supported:
+
+    .read()
+    .readline()
+    .info()
+    .geturl()
+    .__iter__()
+    .next()
+    .close()
+
+    and the following attributes are supported:
+
+    .code
+    .msg
+
+    Also supports pickling (but the stdlib currently does something to prevent
+    it: http://python.org/sf/1144636).
+
+    """
+    # presence of this attr indicates the response is usable after .close()
+    closeable_response = None
+
+    def __init__(self, fp, headers, url, code, msg):
+        self._set_fp(fp)
+        self._headers = headers
+        self._url = url
+        self.code = code
+        self.msg = msg
+
+    def _set_fp(self, fp):
+        self.fp = fp
+        self.read = self.fp.read
+        self.readline = self.fp.readline
+        if hasattr(self.fp, "readlines"): self.readlines = self.fp.readlines
+        if hasattr(self.fp, "fileno"):
+            self.fileno = self.fp.fileno
+        else:
+            self.fileno = lambda: None
+        self.__iter__ = self.fp.__iter__
+        self.next = self.fp.next
+
+    def __repr__(self):
+        return '<%s at %s whose fp = %r>' % (
+            self.__class__.__name__, hex(abs(id(self))), self.fp)
+
+    def info(self):
+        return self._headers
+
+    def geturl(self):
+        return self._url
+
+    def close(self):
+        wrapped = self.fp
+        wrapped.close()
+        new_wrapped = eofresponse(
+            self._url, self._headers, self.code, self.msg)
+        self._set_fp(new_wrapped)
+
+    def __getstate__(self):
+        # There are three obvious options here:
+        # 1. truncate
+        # 2. read to end
+        # 3. close socket, pickle state including read position, then open
+        #    again on unpickle and use Range header
+        # XXXX um, 4. refuse to pickle unless .close()d.  This is better,
+        #  actually ("errors should never pass silently").  Pickling doesn't
+        #  work anyway ATM, because of http://python.org/sf/1144636 so fix
+        #  this later
+
+        # 2 breaks pickle protocol, because one expects the original object
+        # to be left unscathed by pickling.  3 is too complicated and
+        # surprising (and too much work ;-) to happen in a sane __getstate__.
+        # So we do 1.
+
+        state = self.__dict__.copy()
+        new_wrapped = eofresponse(
+            self._url, self._headers, self.code, self.msg)
+        state["wrapped"] = new_wrapped
+        return state
+
+def test_response(data='test data', headers=[],
+                  url="http://example.com/", code=200, msg="OK"):
+    return make_response(data, headers, url, code, msg)
+
+def test_html_response(data='test data', headers=[],
+                       url="http://example.com/", code=200, msg="OK"):
+    headers = list(headers) + [("Content-type", "text/html")]
+    return make_response(data, headers, url, code, msg)
+
+def make_response(data, headers, url, code, msg):
+    """Convenient factory for objects implementing response interface.
+
+    data: string containing response body data
+    headers: sequence of (name, value) pairs
+    url: URL of response
+    code: integer response code (e.g. 200)
+    msg: string response code message (e.g. "OK")
+
+    """
+    mime_headers = make_headers(headers)
+    r = closeable_response(StringIO(data), mime_headers, url, code, msg)
+    return response_seek_wrapper(r)
+
+
+def make_headers(headers):
+    """
+    headers: sequence of (name, value) pairs
+    """
+    hdr_text = []
+    for name_value in headers:
+        hdr_text.append("%s: %s" % name_value)
+    return mimetools.Message(StringIO("\n".join(hdr_text)))
+
+
+# Rest of this module is especially horrible, but needed, at least until we
+# fork urllib2.  Even then, we may want to preserve urllib2 compatibility.
+
+def get_seek_wrapper_class(response):
+    # in order to wrap response objects that are also exceptions, we must
+    # dynamically subclass the exception :-(((
+    if (isinstance(response, urllib2.HTTPError) and
+        not hasattr(response, "seek")):
+        if response.__class__.__module__ == "__builtin__":
+            exc_class_name = response.__class__.__name__
+        else:
+            exc_class_name = "%s.%s" % (
+                response.__class__.__module__, response.__class__.__name__)
+
+        class httperror_seek_wrapper(response_seek_wrapper, response.__class__):
+            # this only derives from HTTPError in order to be a subclass --
+            # the HTTPError behaviour comes from delegation
+
+            _exc_class_name = exc_class_name
+
+            def __init__(self, wrapped):
+                response_seek_wrapper.__init__(self, wrapped)
+                # be compatible with undocumented HTTPError attributes :-(
+                self.hdrs = wrapped.info()
+                self.filename = wrapped.geturl()
+
+            def __repr__(self):
+                return (
+                    "<%s (%s instance) at %s "
+                    "whose wrapped object = %r>" % (
+                    self.__class__.__name__, self._exc_class_name,
+                    hex(abs(id(self))), self.wrapped)
+                    )
+        wrapper_class = httperror_seek_wrapper
+    else:
+        wrapper_class = response_seek_wrapper
+    return wrapper_class
+
+def seek_wrapped_response(response):
+    """Return a copy of response that supports seekable response interface.
+
+    Accepts responses from both mechanize and urllib2 handlers.
+
+    Copes with both ordinary response instances and HTTPError instances (which
+    can't be simply wrapped due to the requirement of preserving the exception
+    base class).
+    """
+    if not hasattr(response, "seek"):
+        wrapper_class = get_seek_wrapper_class(response)
+        response = wrapper_class(response)
+    assert hasattr(response, "get_data")
+    return response
+
+def upgrade_response(response):
+    """Return a copy of response that supports Browser response interface.
+
+    Browser response interface is that of "seekable responses"
+    (response_seek_wrapper), plus the requirement that responses must be
+    useable after .close() (closeable_response).
+
+    Accepts responses from both mechanize and urllib2 handlers.
+
+    Copes with both ordinary response instances and HTTPError instances (which
+    can't be simply wrapped due to the requirement of preserving the exception
+    base class).
+    """
+    wrapper_class = get_seek_wrapper_class(response)
+    if hasattr(response, "closeable_response"):
+        if not hasattr(response, "seek"):
+            response = wrapper_class(response)
+        assert hasattr(response, "get_data")
+        return copy.copy(response)
+
+    # a urllib2 handler constructed the response, i.e. the response is an
+    # urllib.addinfourl or a urllib2.HTTPError, instead of a
+    # _response.closeable_response as returned by e.g. mechanize.HTTPHandler
+    try:
+        code = response.code
+    except AttributeError:
+        code = None
+    try:
+        msg = response.msg
+    except AttributeError:
+        msg = None
+
+    # may have already-.read() data from .seek() cache
+    data = None
+    get_data = getattr(response, "get_data", None)
+    if get_data:
+        data = get_data()
+
+    response = closeable_response(
+        response.fp, response.info(), response.geturl(), code, msg)
+    response = wrapper_class(response)
+    if data:
+        response.set_data(data)
+    return response


Property changes on: Zope3/branches/adamg-mechanize-update/src/mechanize/_response.py
___________________________________________________________________
Name: svn:keywords
   + Date Author Id Revision
Name: svn:eol-style
   + native

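A minimal sketch of how the new _response helpers fit together, assuming the module is importable as mechanize._response on this branch; the URL, header and body values below are invented for illustration:

    from mechanize._response import make_response

    # make_response() wraps string data in a closeable_response and then in a
    # response_seek_wrapper, so the result is both seekable and usable after
    # .close()
    r = make_response("hello world", [("Content-type", "text/plain")],
                      "http://example.com/", 200, "OK")
    assert r.read(5) == "hello"       # reads go through the seek_wrapper cache
    r.seek(0)                         # rewind via the cache, not the socket
    assert r.read() == "hello world"
    assert r.info()["Content-type"] == "text/plain"
    assert r.geturl() == "http://example.com/"
    # get_data() returns the whole body without moving the seek position
    r.seek(6)
    assert r.get_data() == "hello world"
    assert r.tell() == 6
    # set_data() replaces the cached body
    r.set_data("replaced")
    assert r.read() == "replaced"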
Added: Zope3/branches/adamg-mechanize-update/src/mechanize/_rfc3986.py
===================================================================
--- Zope3/branches/adamg-mechanize-update/src/mechanize/_rfc3986.py	                        (rev 0)
+++ Zope3/branches/adamg-mechanize-update/src/mechanize/_rfc3986.py	2007-07-13 13:21:22 UTC (rev 77856)
@@ -0,0 +1,240 @@
+"""RFC 3986 URI parsing and relative reference resolution / absolutization.
+
+(aka splitting and joining)
+
+Copyright 2006 John J. Lee <jjl at pobox.com>
+
+This code is free software; you can redistribute it and/or modify it under
+the terms of the BSD or ZPL 2.1 licenses (see the file COPYING.txt
+included with the distribution).
+
+"""
+
+# XXX Wow, this is ugly.  Overly-direct translation of the RFC ATM.
+
+import sys, re, posixpath, urllib
+
+## def chr_range(a, b):
+##     return "".join(map(chr, range(ord(a), ord(b)+1)))
+
+## UNRESERVED_URI_CHARS = ("ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+##                         "abcdefghijklmnopqrstuvwxyz"
+##                         "0123456789"
+##                         "-_.~")
+## RESERVED_URI_CHARS = "!*'();:@&=+$,/?#[]"
+## URI_CHARS = RESERVED_URI_CHARS+UNRESERVED_URI_CHARS+'%'
+# this re matches any character that's not in URI_CHARS
+BAD_URI_CHARS_RE = re.compile("[^A-Za-z0-9\-_.~!*'();:@&=+$,/?%#[\]]")
+
+
+def clean_url(url, encoding):
+    # percent-encode illegal URI characters
+    # Trying to come up with test cases for this gave me a headache, revisit
+    # when we switch to unicode.
+    # Somebody else's comments (lost the attribution):
+##     - IE will return you the url in the encoding you send it
+##     - Mozilla/Firefox will send you latin-1 if there's no non latin-1
+##     characters in your link. It will send you utf-8 however if there are...
+    if type(url) == type(""):
+        url = url.decode(encoding, "replace")
+    url = url.strip()
+    # for second param to urllib.quote(), we want URI_CHARS, minus the
+    # 'always_safe' characters that urllib.quote() never percent-encodes
+    return urllib.quote(url.encode(encoding), "!*'();:@&=+$,/?%#[]~")
+
+def is_clean_uri(uri):
+    """
+    >>> is_clean_uri("ABC!")
+    True
+    >>> is_clean_uri(u"ABC!")
+    True
+    >>> is_clean_uri("ABC|")
+    False
+    >>> is_clean_uri(u"ABC|")
+    False
+    >>> is_clean_uri("http://example.com/0")
+    True
+    >>> is_clean_uri(u"http://example.com/0")
+    True
+    """
+    # note module re treats bytestrings as though they were decoded as latin-1
+    # so this function accepts both unicode and bytestrings
+    return not bool(BAD_URI_CHARS_RE.search(uri))
+
+
+SPLIT_MATCH = re.compile(
+    r"^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?").match
+def urlsplit(absolute_uri):
+    """Return scheme, authority, path, query, fragment."""
+    match = SPLIT_MATCH(absolute_uri)
+    if match:
+        g = match.groups()
+        return g[1], g[3], g[4], g[6], g[8]
+
+def urlunsplit(parts):
+    scheme, authority, path, query, fragment = parts
+    r = []
+    append = r.append
+    if scheme is not None:
+        append(scheme)
+        append(":")
+    if authority is not None:
+        append("//")
+        append(authority)
+    append(path)
+    if query is not None:
+        append("?")
+        append(query)
+    if fragment is not None:
+        append("#")
+        append(fragment)
+    return "".join(r)
+
+def urljoin(base_uri, uri_reference):
+    return urlunsplit(urljoin_parts(urlsplit(base_uri),
+                                    urlsplit(uri_reference)))
+
+# oops, this doesn't do the same thing as the literal translation
+# from the RFC below
+## def urljoin_parts(base_parts, reference_parts):
+##     scheme, authority, path, query, fragment = base_parts
+##     rscheme, rauthority, rpath, rquery, rfragment = reference_parts
+
+##     # compute target URI path
+##     if rpath == "":
+##         tpath = path
+##     else:
+##         tpath = rpath
+##         if not tpath.startswith("/"):
+##             tpath = merge(authority, path, tpath)
+##         tpath = posixpath.normpath(tpath)
+
+##     if rscheme is not None:
+##         return (rscheme, rauthority, tpath, rquery, rfragment)
+##     elif rauthority is not None:
+##         return (scheme, rauthority, tpath, rquery, rfragment)
+##     elif rpath == "":
+##         if rquery is not None:
+##             tquery = rquery
+##         else:
+##             tquery = query
+##         return (scheme, authority, tpath, tquery, rfragment)
+##     else:
+##         return (scheme, authority, tpath, rquery, rfragment)
+
+def urljoin_parts(base_parts, reference_parts):
+    scheme, authority, path, query, fragment = base_parts
+    rscheme, rauthority, rpath, rquery, rfragment = reference_parts
+
+    if rscheme == scheme:
+        rscheme = None
+
+    if rscheme is not None:
+        tscheme, tauthority, tpath, tquery = (
+            rscheme, rauthority, remove_dot_segments(rpath), rquery)
+    else:
+        if rauthority is not None:
+            tauthority, tpath, tquery = (
+                rauthority, remove_dot_segments(rpath), rquery)
+        else:
+            if rpath == "":
+                tpath = path
+                if rquery is not None:
+                    tquery = rquery
+                else:
+                    tquery = query
+            else:
+                if rpath.startswith("/"):
+                    tpath = remove_dot_segments(rpath)
+                else:
+                    tpath = merge(authority, path, rpath)
+                    tpath = remove_dot_segments(tpath)
+                tquery = rquery
+            tauthority = authority
+        tscheme = scheme
+    tfragment = rfragment
+    return (tscheme, tauthority, tpath, tquery, tfragment)
+
+# um, something *vaguely* like this is what I want, but I have to generate
+# lots of test cases first, if only to understand what it is that
+# remove_dot_segments really does...
+## def remove_dot_segments(path):
+##     if path == '':
+##         return ''
+##     comps = path.split('/')
+##     new_comps = []
+##     for comp in comps:
+##         if comp in ['.', '']:
+##             if not new_comps or new_comps[-1]:
+##                 new_comps.append('')
+##             continue
+##         if comp != '..':
+##             new_comps.append(comp)
+##         elif new_comps:
+##             new_comps.pop()
+##     return '/'.join(new_comps)
+
+
+def remove_dot_segments(path):
+    r = []
+    while path:
+        # A
+        if path.startswith("../"):
+            path = path[3:]
+            continue
+        if path.startswith("./"):
+            path = path[2:]
+            continue
+        # B
+        if path.startswith("/./"):
+            path = path[2:]
+            continue
+        if path == "/.":
+            path = "/"
+            continue
+        # C
+        if path.startswith("/../"):
+            path = path[3:]
+            if r:
+                r.pop()
+            continue
+        if path == "/..":
+            path = "/"
+            if r:
+                r.pop()
+            continue
+        # D
+        if path == ".":
+            path = path[1:]
+            continue
+        if path == "..":
+            path = path[2:]
+            continue
+        # E
+        start = 0
+        if path.startswith("/"):
+            start = 1
+        ii = path.find("/", start)
+        if ii < 0:
+            ii = None
+        r.append(path[:ii])
+        if ii is None:
+            break
+        path = path[ii:]
+    return "".join(r)
+
+def merge(base_authority, base_path, ref_path):
+    # XXXX Oddly, the sample Perl implementation of this by Roy Fielding
+    # doesn't even take base_authority as a parameter, despite the wording in
+    # the RFC suggesting otherwise.  Perhaps I'm missing some obvious identity.
+    #if base_authority is not None and base_path == "":
+    if base_path == "":
+        return "/" + ref_path
+    ii = base_path.rfind("/")
+    if ii >= 0:
+        return base_path[:ii+1] + ref_path
+    return ref_path
+
+if __name__ == "__main__":
+    import doctest
+    doctest.testmod()


Property changes on: Zope3/branches/adamg-mechanize-update/src/mechanize/_rfc3986.py
___________________________________________________________________
Name: svn:keywords
   + Date Author Id Revision
Name: svn:eol-style
   + native

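A few worked examples of the new RFC 3986 helpers, based only on the functions defined above (the URLs are invented):

    from mechanize._rfc3986 import urlsplit, urljoin, remove_dot_segments, clean_url

    assert urlsplit("http://example.com/a/b?q=1#frag") == \
           ("http", "example.com", "/a/b", "q=1", "frag")

    # relative reference resolution (RFC 3986 section 5)
    assert urljoin("http://example.com/a/b/c", "../d") == "http://example.com/a/d"
    assert urljoin("http://example.com/a/b", "?q=2") == "http://example.com/a/b?q=2"

    # dot-segment removal, as used by urljoin_parts()
    assert remove_dot_segments("/a/./b/../c") == "/a/c"

    # clean_url() percent-encodes characters outside the allowed URI set
    assert clean_url("http://example.com/a b", "latin-1") == "http://example.com/a%20b"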
Added: Zope3/branches/adamg-mechanize-update/src/mechanize/_seek.py
===================================================================
--- Zope3/branches/adamg-mechanize-update/src/mechanize/_seek.py	                        (rev 0)
+++ Zope3/branches/adamg-mechanize-update/src/mechanize/_seek.py	2007-07-13 13:21:22 UTC (rev 77856)
@@ -0,0 +1,16 @@
+from urllib2 import BaseHandler
+from _util import deprecation
+from _response import response_seek_wrapper
+
+
+class SeekableProcessor(BaseHandler):
+    """Deprecated: Make responses seekable."""
+
+    def __init__(self):
+        deprecation(
+            "See http://wwwsearch.sourceforge.net/mechanize/doc.html#seekable")
+
+    def any_response(self, request, response):
+        if not hasattr(response, "seek"):
+            return response_seek_wrapper(response)
+        return response


Property changes on: Zope3/branches/adamg-mechanize-update/src/mechanize/_seek.py
___________________________________________________________________
Name: svn:keywords
   + Date Author Id Revision
Name: svn:eol-style
   + native

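SeekableProcessor is kept only as a deprecated shim: it wraps any response lacking .seek() in response_seek_wrapper.  A small sketch of that behaviour, not part of the commit (the URL and body are invented):

    from cStringIO import StringIO
    from mechanize._seek import SeekableProcessor
    from mechanize._response import closeable_response, make_headers

    proc = SeekableProcessor()      # emits the deprecation warning on construction
    raw = closeable_response(StringIO("body"), make_headers([]),
                             "http://example.com/", 200, "OK")
    r = proc.any_response(None, raw)   # wrapped, since raw has no .seek()
    assert r.read() == "body"
    r.seek(0)
    assert r.read() == "body"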
Added: Zope3/branches/adamg-mechanize-update/src/mechanize/_upgrade.py
===================================================================
--- Zope3/branches/adamg-mechanize-update/src/mechanize/_upgrade.py	                        (rev 0)
+++ Zope3/branches/adamg-mechanize-update/src/mechanize/_upgrade.py	2007-07-13 13:21:22 UTC (rev 77856)
@@ -0,0 +1,40 @@
+from urllib2 import BaseHandler
+
+from _request import Request
+from _response import upgrade_response
+from _util import deprecation
+
+
+class HTTPRequestUpgradeProcessor(BaseHandler):
+    # upgrade urllib2.Request to this module's Request
+    # yuck!
+    handler_order = 0  # before anything else
+
+    def http_request(self, request):
+        if not hasattr(request, "add_unredirected_header"):
+            newrequest = Request(request._Request__original, request.data,
+                                 request.headers)
+            try: newrequest.origin_req_host = request.origin_req_host
+            except AttributeError: pass
+            try: newrequest.unverifiable = request.unverifiable
+            except AttributeError: pass
+            try: newrequest.visit = request.visit
+            except AttributeError: pass
+            request = newrequest
+        return request
+
+    https_request = http_request
+
+
+class ResponseUpgradeProcessor(BaseHandler):
+    # upgrade responses to be .close()able without becoming unusable
+    handler_order = 0  # before anything else
+
+    def __init__(self):
+        deprecation(
+            "See http://wwwsearch.sourceforge.net/mechanize/doc.html#seekable")
+
+    def any_response(self, request, response):
+        if not hasattr(response, 'closeable_response'):
+            response = upgrade_response(response)
+        return response


Property changes on: Zope3/branches/adamg-mechanize-update/src/mechanize/_upgrade.py
___________________________________________________________________
Name: svn:keywords
   + Date Author Id Revision
Name: svn:eol-style
   + native

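HTTPRequestUpgradeProcessor simply converts urllib2.Request objects into mechanize Requests early in the handler chain.  A sketch of calling it directly (not how it is normally invoked, which is via the opener):

    import urllib2
    from mechanize._upgrade import HTTPRequestUpgradeProcessor

    proc = HTTPRequestUpgradeProcessor()
    plain = urllib2.Request("http://example.com/")
    req = proc.http_request(plain)
    # On Pythons whose urllib2.Request already has add_unredirected_header the
    # request is passed through unchanged; on older Pythons it is copied into a
    # mechanize Request.  Either way the upgraded interface is usable afterwards:
    req.add_unredirected_header("Referer", "http://example.com/prev")
    assert req.has_header("Referer")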
Modified: Zope3/branches/adamg-mechanize-update/src/mechanize/_urllib2.py
===================================================================
--- Zope3/branches/adamg-mechanize-update/src/mechanize/_urllib2.py	2007-07-13 11:38:53 UTC (rev 77855)
+++ Zope3/branches/adamg-mechanize-update/src/mechanize/_urllib2.py	2007-07-13 13:21:22 UTC (rev 77856)
@@ -3,51 +3,60 @@
 from urllib2 import \
      URLError, \
      HTTPError, \
-     GopherError, \
+     GopherError
+# ...and from mechanize
+from _opener import OpenerDirector, \
+     SeekableResponseOpener, \
+     build_opener, install_opener, urlopen
+from _auth import \
      HTTPPasswordMgr, \
      HTTPPasswordMgrWithDefaultRealm, \
      AbstractBasicAuthHandler, \
-     AbstractDigestAuthHandler
-# ...and from mechanize
-from _opener import OpenerDirector
-from _auth import \
+     AbstractDigestAuthHandler, \
      HTTPProxyPasswordMgr, \
      ProxyHandler, \
      ProxyBasicAuthHandler, \
      ProxyDigestAuthHandler, \
      HTTPBasicAuthHandler, \
-     HTTPDigestAuthHandler
-from _urllib2_support import \
-     Request, \
-     build_opener, install_opener, urlopen, \
-     OpenerFactory, urlretrieve, \
+     HTTPDigestAuthHandler, \
+     HTTPSClientCertMgr
+from _request import \
+     Request
+from _http import \
      RobotExclusionError
 
 # handlers...
 # ...from urllib2...
 from urllib2 import \
      BaseHandler, \
-     HTTPDefaultErrorHandler, \
      UnknownHandler, \
      FTPHandler, \
      CacheFTPHandler, \
      FileHandler, \
      GopherHandler
 # ...and from mechanize
-from _urllib2_support import \
+from _http import \
      HTTPHandler, \
+     HTTPDefaultErrorHandler, \
      HTTPRedirectHandler, \
-     HTTPRequestUpgradeProcessor, \
      HTTPEquivProcessor, \
-     SeekableProcessor, \
      HTTPCookieProcessor, \
      HTTPRefererProcessor, \
      HTTPRefreshProcessor, \
      HTTPErrorProcessor, \
+     HTTPRobotRulesProcessor
+from _upgrade import \
+     HTTPRequestUpgradeProcessor, \
+     ResponseUpgradeProcessor
+from _debug import \
      HTTPResponseDebugProcessor, \
-     HTTPRedirectDebugProcessor, \
-     HTTPRobotRulesProcessor
+     HTTPRedirectDebugProcessor
+from _seek import \
+     SeekableProcessor
+# crap ATM
+## from _gzip import \
+##      HTTPGzipProcessor
 import httplib
 if hasattr(httplib, 'HTTPS'):
-    from _urllib2_support import HTTPSHandler
+    from _http import HTTPSHandler
 del httplib

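The intent of the reshuffle above is that mechanize._urllib2 keeps exporting the same flat namespace while the implementations move into the new per-topic modules (_http, _upgrade, _debug, _seek, _opener).  A quick sanity check, assuming those modules define the names the import lines require:

    from mechanize import _urllib2
    from mechanize._http import HTTPHandler, HTTPRefreshProcessor
    from mechanize._upgrade import HTTPRequestUpgradeProcessor
    from mechanize._seek import SeekableProcessor

    # re-exported names are the very same objects as in their home modules
    assert _urllib2.HTTPHandler is HTTPHandler
    assert _urllib2.HTTPRefreshProcessor is HTTPRefreshProcessor
    assert _urllib2.HTTPRequestUpgradeProcessor is HTTPRequestUpgradeProcessor
    assert _urllib2.SeekableProcessor is SeekableProcessor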
Deleted: Zope3/branches/adamg-mechanize-update/src/mechanize/_urllib2_support.py
===================================================================
--- Zope3/branches/adamg-mechanize-update/src/mechanize/_urllib2_support.py	2007-07-13 11:38:53 UTC (rev 77855)
+++ Zope3/branches/adamg-mechanize-update/src/mechanize/_urllib2_support.py	2007-07-13 13:21:22 UTC (rev 77856)
@@ -1,718 +0,0 @@
-"""Integration with Python standard library module urllib2.
-
-Also includes a redirection bugfix, support for parsing HTML HEAD blocks for
-the META HTTP-EQUIV tag contents, and following Refresh header redirects.
-
-Copyright 2002-2006 John J Lee <jjl at pobox.com>
-
-This code is free software; you can redistribute it and/or modify it
-under the terms of the BSD or ZPL 2.1 licenses (see the file
-COPYING.txt included with the distribution).
-
-"""
-
-import copy, time, tempfile, htmlentitydefs, re, logging, types, \
-       string, socket, urlparse, urllib2, urllib, httplib, sgmllib
-from urllib2 import URLError, HTTPError, BaseHandler
-from cStringIO import StringIO
-try:
-    import threading as _threading
-except ImportError:
-    import dummy_threading as _threading
-
-import _opener
-from _request import Request
-from _util import isstringlike, startswith, \
-     getheaders, closeable_response, response_seek_wrapper
-from _html import unescape, unescape_charref
-from _headersutil import is_html
-from _clientcookie import CookieJar, request_host
-
-debug = logging.getLogger("mechanize.cookies").debug
-
-
-CHUNK = 1024  # size of chunks fed to HTML HEAD parser, in bytes
-DEFAULT_ENCODING = 'latin-1'
-
-
-# This fixes a bug in urllib2 as of Python 2.1.3 and 2.2.2
-#  (http://www.python.org/sf/549151)
-# 2.2.3 is broken here (my fault!), 2.3 is fixed.
-class HTTPRedirectHandler(BaseHandler):
-    # maximum number of redirections to any single URL
-    # this is needed because of the state that cookies introduce
-    max_repeats = 4
-    # maximum total number of redirections (regardless of URL) before
-    # assuming we're in a loop
-    max_redirections = 10
-
-    # Implementation notes:
-
-    # To avoid the server sending us into an infinite loop, the request
-    # object needs to track what URLs we have already seen.  Do this by
-    # adding a handler-specific attribute to the Request object.  The value
-    # of the dict is used to count the number of times the same URL has
-    # been visited.  This is needed because visiting the same URL twice
-    # does not necessarily imply a loop, thanks to state introduced by
-    # cookies.
-
-    # Always unhandled redirection codes:
-    # 300 Multiple Choices: should not handle this here.
-    # 304 Not Modified: no need to handle here: only of interest to caches
-    #     that do conditional GETs
-    # 305 Use Proxy: probably not worth dealing with here
-    # 306 Unused: what was this for in the previous versions of protocol??
-
-    def redirect_request(self, newurl, req, fp, code, msg, headers):
-        """Return a Request or None in response to a redirect.
-
-        This is called by the http_error_30x methods when a redirection
-        response is received.  If a redirection should take place, return a
-        new Request to allow http_error_30x to perform the redirect;
-        otherwise, return None to indicate that an HTTPError should be
-        raised.
-
-        """
-        if code in (301, 302, 303, "refresh") or \
-               (code == 307 and not req.has_data()):
-            # Strictly (according to RFC 2616), 301 or 302 in response to
-            # a POST MUST NOT cause a redirection without confirmation
-            # from the user (of urllib2, in this case).  In practice,
-            # essentially all clients do redirect in this case, so we do
-            # the same.
-            return Request(newurl,
-                           headers=req.headers,
-                           origin_req_host=req.get_origin_req_host(),
-                           unverifiable=True)
-        else:
-            raise HTTPError(req.get_full_url(), code, msg, headers, fp)
-
-    def http_error_302(self, req, fp, code, msg, headers):
-        # Some servers (incorrectly) return multiple Location headers
-        # (so probably same goes for URI).  Use first header.
-        if headers.has_key('location'):
-            newurl = getheaders(headers, 'location')[0]
-        elif headers.has_key('uri'):
-            newurl = getheaders(headers, 'uri')[0]
-        else:
-            return
-        newurl = urlparse.urljoin(req.get_full_url(), newurl)
-
-        # XXX Probably want to forget about the state of the current
-        # request, although that might interact poorly with other
-        # handlers that also use handler-specific request attributes
-        new = self.redirect_request(newurl, req, fp, code, msg, headers)
-        if new is None:
-            return
-
-        # loop detection
-        # .redirect_dict has a key url if url was previously visited.
-        if hasattr(req, 'redirect_dict'):
-            visited = new.redirect_dict = req.redirect_dict
-            if (visited.get(newurl, 0) >= self.max_repeats or
-                len(visited) >= self.max_redirections):
-                raise HTTPError(req.get_full_url(), code,
-                                self.inf_msg + msg, headers, fp)
-        else:
-            visited = new.redirect_dict = req.redirect_dict = {}
-        visited[newurl] = visited.get(newurl, 0) + 1
-
-        # Don't close the fp until we are sure that we won't use it
-        # with HTTPError.  
-        fp.read()
-        fp.close()
-
-        return self.parent.open(new)
-
-    http_error_301 = http_error_303 = http_error_307 = http_error_302
-    http_error_refresh = http_error_302
-
-    inf_msg = "The HTTP server returned a redirect error that would " \
-              "lead to an infinite loop.\n" \
-              "The last 30x error message was:\n"
-
-
-class HTTPRequestUpgradeProcessor(BaseHandler):
-    # upgrade urllib2.Request to this module's Request
-    # yuck!
-    handler_order = 0  # before anything else
-
-    def http_request(self, request):
-        if not hasattr(request, "add_unredirected_header"):
-            newrequest = Request(request._Request__original, request.data,
-                                 request.headers)
-            try: newrequest.origin_req_host = request.origin_req_host
-            except AttributeError: pass
-            try: newrequest.unverifiable = request.unverifiable
-            except AttributeError: pass
-            request = newrequest
-        return request
-
-    https_request = http_request
-
-# XXX would self.reset() work, instead of raising this exception?
-class EndOfHeadError(Exception): pass
-class AbstractHeadParser:
-    # only these elements are allowed in or before HEAD of document
-    head_elems = ("html", "head",
-                  "title", "base",
-                  "script", "style", "meta", "link", "object")
-    _entitydefs = htmlentitydefs.name2codepoint
-    _encoding = DEFAULT_ENCODING
-
-    def __init__(self):
-        self.http_equiv = []
-
-    def start_meta(self, attrs):
-        http_equiv = content = None
-        for key, value in attrs:
-            if key == "http-equiv":
-                http_equiv = self.unescape_attr_if_required(value)
-            elif key == "content":
-                content = self.unescape_attr_if_required(value)
-        if http_equiv is not None:
-            self.http_equiv.append((http_equiv, content))
-
-    def end_head(self):
-        raise EndOfHeadError()
-
-    def handle_entityref(self, name):
-        #debug("%s", name)
-        self.handle_data(unescape(
-            '&%s;' % name, self._entitydefs, self._encoding))
-
-    def handle_charref(self, name):
-        #debug("%s", name)
-        self.handle_data(unescape_charref(name, self._encoding))
-
-    def unescape_attr(self, name):
-        #debug("%s", name)
-        return unescape(name, self._entitydefs, self._encoding)
-
-    def unescape_attrs(self, attrs):
-        #debug("%s", attrs)
-        escaped_attrs = {}
-        for key, val in attrs.items():
-            escaped_attrs[key] = self.unescape_attr(val)
-        return escaped_attrs
-
-    def unknown_entityref(self, ref):
-        self.handle_data("&%s;" % ref)
-
-    def unknown_charref(self, ref):
-        self.handle_data("&#%s;" % ref)
-
-
-try:
-    import HTMLParser
-except ImportError:
-    pass
-else:
-    class XHTMLCompatibleHeadParser(AbstractHeadParser,
-                                    HTMLParser.HTMLParser):
-        def __init__(self):
-            HTMLParser.HTMLParser.__init__(self)
-            AbstractHeadParser.__init__(self)
-
-        def handle_starttag(self, tag, attrs):
-            if tag not in self.head_elems:
-                raise EndOfHeadError()
-            try:
-                method = getattr(self, 'start_' + tag)
-            except AttributeError:
-                try:
-                    method = getattr(self, 'do_' + tag)
-                except AttributeError:
-                    pass # unknown tag
-                else:
-                    method(attrs)
-            else:
-                method(attrs)
-
-        def handle_endtag(self, tag):
-            if tag not in self.head_elems:
-                raise EndOfHeadError()
-            try:
-                method = getattr(self, 'end_' + tag)
-            except AttributeError:
-                pass # unknown tag
-            else:
-                method()
-
-        def unescape(self, name):
-            # Use the entitydefs passed into constructor, not
-            # HTMLParser.HTMLParser's entitydefs.
-            return self.unescape_attr(name)
-
-        def unescape_attr_if_required(self, name):
-            return name  # HTMLParser.HTMLParser already did it
-
-class HeadParser(AbstractHeadParser, sgmllib.SGMLParser):
-
-    def _not_called(self):
-        assert False
-
-    def __init__(self):
-        sgmllib.SGMLParser.__init__(self)
-        AbstractHeadParser.__init__(self)
-
-    def handle_starttag(self, tag, method, attrs):
-        if tag not in self.head_elems:
-            raise EndOfHeadError()
-        if tag == "meta":
-            method(attrs)
-
-    def unknown_starttag(self, tag, attrs):
-        self.handle_starttag(tag, self._not_called, attrs)
-
-    def handle_endtag(self, tag, method):
-        if tag in self.head_elems:
-            method()
-        else:
-            raise EndOfHeadError()
-
-    def unescape_attr_if_required(self, name):
-        return self.unescape_attr(name)
-
-def parse_head(fileobj, parser):
-    """Return a list of key, value pairs."""
-    while 1:
-        data = fileobj.read(CHUNK)
-        try:
-            parser.feed(data)
-        except EndOfHeadError:
-            break
-        if len(data) != CHUNK:
-            # this should only happen if there is no HTML body, or if
-            # CHUNK is big
-            break
-    return parser.http_equiv
-
-class HTTPEquivProcessor(BaseHandler):
-    """Append META HTTP-EQUIV headers to regular HTTP headers."""
-
-    handler_order = 300  # before handlers that look at HTTP headers
-
-    def __init__(self, head_parser_class=HeadParser,
-                 i_want_broken_xhtml_support=False,
-                 ):
-        self.head_parser_class = head_parser_class
-        self._allow_xhtml = i_want_broken_xhtml_support
-
-    def http_response(self, request, response):
-        if not hasattr(response, "seek"):
-            response = response_seek_wrapper(response)
-        headers = response.info()
-        url = response.geturl()
-        ct_hdrs = getheaders(response.info(), "content-type")
-        if is_html(ct_hdrs, url, self._allow_xhtml):
-            try:
-                try:
-                    html_headers = parse_head(response, self.head_parser_class())
-                finally:
-                    response.seek(0)
-            except (HTMLParser.HTMLParseError,
-                    sgmllib.SGMLParseError):
-                pass
-            else:
-                for hdr, val in html_headers:
-                    # rfc822.Message interprets this as appending, not clobbering
-                    headers[hdr] = val
-        return response
-
-    https_response = http_response
-
-class SeekableProcessor(BaseHandler):
-    """Make responses seekable."""
-
-    def any_response(self, request, response):
-        if not hasattr(response, "seek"):
-            return response_seek_wrapper(response)
-        return response
-
-class HTTPCookieProcessor(BaseHandler):
-    """Handle HTTP cookies.
-
-    Public attributes:
-
-    cookiejar: CookieJar instance
-
-    """
-    def __init__(self, cookiejar=None):
-        if cookiejar is None:
-            cookiejar = CookieJar()
-        self.cookiejar = cookiejar
-
-    def http_request(self, request):
-        self.cookiejar.add_cookie_header(request)
-        return request
-
-    def http_response(self, request, response):
-        self.cookiejar.extract_cookies(response, request)
-        return response
-
-    https_request = http_request
-    https_response = http_response
-
-try:
-    import robotparser
-except ImportError:
-    pass
-else:
-    class RobotExclusionError(urllib2.HTTPError):
-        def __init__(self, request, *args):
-            apply(urllib2.HTTPError.__init__, (self,)+args)
-            self.request = request
-
-    class HTTPRobotRulesProcessor(BaseHandler):
-        # before redirections, after everything else
-        handler_order = 800
-
-        try:
-            from httplib import HTTPMessage
-        except:
-            from mimetools import Message
-            http_response_class = Message
-        else:
-            http_response_class = HTTPMessage
-
-        def __init__(self, rfp_class=robotparser.RobotFileParser):
-            self.rfp_class = rfp_class
-            self.rfp = None
-            self._host = None
-
-        def http_request(self, request):
-            host = request.get_host()
-            scheme = request.get_type()
-            if host != self._host:
-                self.rfp = self.rfp_class()
-                self.rfp.set_url(scheme+"://"+host+"/robots.txt")
-                self.rfp.read()
-                self._host = host
-
-            ua = request.get_header("User-agent", "")
-            if self.rfp.can_fetch(ua, request.get_full_url()):
-                return request
-            else:
-                msg = "request disallowed by robots.txt"
-                raise RobotExclusionError(
-                    request,
-                    request.get_full_url(),
-                    403, msg,
-                    self.http_response_class(StringIO()), StringIO(msg))
-
-        https_request = http_request
-
-class HTTPRefererProcessor(BaseHandler):
-    """Add Referer header to requests.
-
-    This only makes sense if you use each RefererProcessor for a single
-    chain of requests only (so, for example, if you use a single
-    HTTPRefererProcessor to fetch a series of URLs extracted from a single
-    page, this will break).
-
-    There's a proper implementation of this in module mechanize.
-
-    """
-    def __init__(self):
-        self.referer = None
-
-    def http_request(self, request):
-        if ((self.referer is not None) and
-            not request.has_header("Referer")):
-            request.add_unredirected_header("Referer", self.referer)
-        return request
-
-    def http_response(self, request, response):
-        self.referer = response.geturl()
-        return response
-
-    https_request = http_request
-    https_response = http_response
-
-class HTTPResponseDebugProcessor(BaseHandler):
-    handler_order = 900  # before redirections, after everything else
-
-    def http_response(self, request, response):
-        if not hasattr(response, "seek"):
-            response = response_seek_wrapper(response)
-        info = getLogger("mechanize.http_responses").info
-        try:
-            info(response.read())
-        finally:
-            response.seek(0)
-        info("*****************************************************")
-        return response
-
-    https_response = http_response
-
-class HTTPRedirectDebugProcessor(BaseHandler):
-    def http_request(self, request):
-        if hasattr(request, "redirect_dict"):
-            info = getLogger("mechanize.http_redirects").info
-            info("redirecting to %s", request.get_full_url())
-        return request
-
-class HTTPRefreshProcessor(BaseHandler):
-    """Perform HTTP Refresh redirections.
-
-    Note that if a non-200 HTTP code has occurred (for example, a 30x
-    redirect), this processor will do nothing.
-
-    By default, only zero-time Refresh headers are redirected.  Use the
-    max_time attribute / constructor argument to allow Refresh with longer
-    pauses.  Use the honor_time attribute / constructor argument to control
-    whether the requested pause is honoured (with a time.sleep()) or
-    skipped in favour of immediate redirection.
-
-    Public attributes:
-
-    max_time: see above
-    honor_time: see above
-
-    """
-    handler_order = 1000
-
-    def __init__(self, max_time=0, honor_time=True):
-        self.max_time = max_time
-        self.honor_time = honor_time
-
-    def http_response(self, request, response):
-        code, msg, hdrs = response.code, response.msg, response.info()
-
-        if code == 200 and hdrs.has_key("refresh"):
-            refresh = getheaders(hdrs, "refresh")[0]
-            ii = string.find(refresh, ";")
-            if ii != -1:
-                pause, newurl_spec = float(refresh[:ii]), refresh[ii+1:]
-                jj = string.find(newurl_spec, "=")
-                if jj != -1:
-                    key, newurl = newurl_spec[:jj], newurl_spec[jj+1:]
-                if key.strip().lower() != "url":
-                    debug("bad Refresh header: %r" % refresh)
-                    return response
-            else:
-                pause, newurl = float(refresh), response.geturl()
-            if (self.max_time is None) or (pause <= self.max_time):
-                if pause > 1E-3 and self.honor_time:
-                    time.sleep(pause)
-                hdrs["location"] = newurl
-                # hardcoded http is NOT a bug
-                response = self.parent.error(
-                    "http", request, response,
-                    "refresh", msg, hdrs)
-
-        return response
-
-    https_response = http_response
-
-class HTTPErrorProcessor(BaseHandler):
-    """Process HTTP error responses.
-
-    The purpose of this handler is to to allow other response processors a
-    look-in by removing the call to parent.error() from
-    AbstractHTTPHandler.
-
-    For non-200 error codes, this just passes the job on to the
-    Handler.<proto>_error_<code> methods, via the OpenerDirector.error
-    method.  Eventually, urllib2.HTTPDefaultErrorHandler will raise an
-    HTTPError if no other handler handles the error.
-
-    """
-    handler_order = 1000  # after all other processors
-
-    def http_response(self, request, response):
-        code, msg, hdrs = response.code, response.msg, response.info()
-
-        if code != 200:
-            # hardcoded http is NOT a bug
-            response = self.parent.error(
-                "http", request, response, code, msg, hdrs)
-
-        return response
-
-    https_response = http_response
-
-
-class AbstractHTTPHandler(BaseHandler):
-
-    def __init__(self, debuglevel=0):
-        self._debuglevel = debuglevel
-
-    def set_http_debuglevel(self, level):
-        self._debuglevel = level
-
-    def do_request_(self, request):
-        host = request.get_host()
-        if not host:
-            raise URLError('no host given')
-
-        if request.has_data():  # POST
-            data = request.get_data()
-            if not request.has_header('Content-type'):
-                request.add_unredirected_header(
-                    'Content-type',
-                    'application/x-www-form-urlencoded')
-
-        scheme, sel = urllib.splittype(request.get_selector())
-        sel_host, sel_path = urllib.splithost(sel)
-        if not request.has_header('Host'):
-            request.add_unredirected_header('Host', sel_host or host)
-        for name, value in self.parent.addheaders:
-            name = string.capitalize(name)
-            if not request.has_header(name):
-                request.add_unredirected_header(name, value)
-
-        return request
-
-    def do_open(self, http_class, req):
-        """Return an addinfourl object for the request, using http_class.
-
-        http_class must implement the HTTPConnection API from httplib.
-        The addinfourl return value is a file-like object.  It also
-        has methods and attributes including:
-            - info(): return a mimetools.Message object for the headers
-            - geturl(): return the original request URL
-            - code: HTTP status code
-        """
-        host = req.get_host()
-        if not host:
-            raise URLError('no host given')
-
-        h = http_class(host) # will parse host:port
-        h.set_debuglevel(self._debuglevel)
-
-        headers = req.headers.copy()
-        headers.update(req.unredirected_hdrs)
-        # We want to make an HTTP/1.1 request, but the addinfourl
-        # class isn't prepared to deal with a persistent connection.
-        # It will try to read all remaining data from the socket,
-        # which will block while the server waits for the next request.
-        # So make sure the connection gets closed after the (only)
-        # request.
-        headers["Connection"] = "close"
-        try:
-            h.request(req.get_method(), req.get_selector(), req.data, headers)
-            r = h.getresponse()
-        except socket.error, err: # XXX what error?
-            raise URLError(err)
-
-        # Pick apart the HTTPResponse object to get the addinfourl
-        # object initialized properly.
-
-        # Wrap the HTTPResponse object in socket's file object adapter
-        # for Windows.  That adapter calls recv(), so delegate recv()
-        # to read().  This weird wrapping allows the returned object to
-        # have readline() and readlines() methods.
-
-        # XXX It might be better to extract the read buffering code
-        # out of socket._fileobject() and into a base class.
-
-        r.recv = r.read
-        fp = socket._fileobject(r, 'rb', -1)
-
-        resp = closeable_response(fp, r.msg, req.get_full_url(),
-                                  r.status, r.reason)
-        return resp
-
-
-class HTTPHandler(AbstractHTTPHandler):
-    def http_open(self, req):
-        return self.do_open(httplib.HTTPConnection, req)
-
-    http_request = AbstractHTTPHandler.do_request_
-
-if hasattr(httplib, 'HTTPS'):
-    class HTTPSHandler(AbstractHTTPHandler):
-        def https_open(self, req):
-            return self.do_open(httplib.HTTPSConnection, req)
-
-        https_request = AbstractHTTPHandler.do_request_
-
-class OpenerFactory:
-    """This class's interface is quite likely to change."""
-
-    default_classes = [
-        # handlers
-        urllib2.ProxyHandler,
-        urllib2.UnknownHandler,
-        HTTPHandler,  # from this module (derived from new AbstractHTTPHandler)
-        urllib2.HTTPDefaultErrorHandler,
-        HTTPRedirectHandler,  # from this module (bugfixed)
-        urllib2.FTPHandler,
-        urllib2.FileHandler,
-        # processors
-        HTTPRequestUpgradeProcessor,
-        HTTPCookieProcessor,
-        HTTPErrorProcessor
-        ]
-    handlers = []
-    replacement_handlers = []
-
-    def __init__(self, klass=_opener.OpenerDirector):
-        self.klass = klass
-
-    def build_opener(self, *handlers):
-        """Create an opener object from a list of handlers and processors.
-
-        The opener will use several default handlers and processors, including
-        support for HTTP and FTP.
-
-        If any of the handlers passed as arguments are subclasses of the
-        default handlers, the default handlers will not be used.
-
-        """
-        opener = self.klass()
-        default_classes = list(self.default_classes)
-        if hasattr(httplib, 'HTTPS'):
-            default_classes.append(HTTPSHandler)
-        skip = []
-        for klass in default_classes:
-            for check in handlers:
-                if type(check) == types.ClassType:
-                    if issubclass(check, klass):
-                        skip.append(klass)
-                elif type(check) == types.InstanceType:
-                    if isinstance(check, klass):
-                        skip.append(klass)
-        for klass in skip:
-            default_classes.remove(klass)
-
-        for klass in default_classes:
-            opener.add_handler(klass())
-        for h in handlers:
-            if type(h) == types.ClassType:
-                h = h()
-            opener.add_handler(h)
-
-        return opener
-
-build_opener = OpenerFactory().build_opener
-
-_opener = None
-urlopen_lock = _threading.Lock()
-def urlopen(url, data=None):
-    global _opener
-    if _opener is None:
-        urlopen_lock.acquire()
-        try:
-            if _opener is None:
-                _opener = build_opener()
-        finally:
-            urlopen_lock.release()
-    return _opener.open(url, data)
-
-def urlretrieve(url, filename=None, reporthook=None, data=None):
-    global _opener
-    if _opener is None:
-        urlopen_lock.acquire()
-        try:
-            if _opener is None:
-                _opener = build_opener()
-        finally:
-            urlopen_lock.release()
-    return _opener.retrieve(url, filename, reporthook, data)
-
-def install_opener(opener):
-    global _opener
-    _opener = opener

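Most of what is deleted here survives elsewhere: per the imports in _urllib2.py above, the handlers and processors now live in _http.py, _upgrade.py, _debug.py and _seek.py, and build_opener/install_opener/urlopen in _opener.py.  For orientation, the Refresh handling removed from this file boils down to parsing header values of the form "<pause>; url=<target>"; a standalone sketch of that parse, not taken verbatim from the new module:

    def parse_refresh(value, current_url):
        # Return (pause_seconds, target_url) for a Refresh header value.
        ii = value.find(";")
        if ii == -1:
            # a bare pause, e.g. "5", means: re-fetch the current URL
            return float(value), current_url
        pause, spec = float(value[:ii]), value[ii + 1:]
        jj = spec.find("=")
        if jj == -1 or spec[:jj].strip().lower() != "url":
            raise ValueError("bad Refresh header: %r" % value)
        return pause, spec[jj + 1:].strip()

    assert parse_refresh("0; url=http://example.com/next", "http://example.com/") == \
           (0.0, "http://example.com/next")
    assert parse_refresh("5", "http://example.com/") == (5.0, "http://example.com/")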
Modified: Zope3/branches/adamg-mechanize-update/src/mechanize/_useragent.py
===================================================================
--- Zope3/branches/adamg-mechanize-update/src/mechanize/_useragent.py	2007-07-13 11:38:53 UTC (rev 77855)
+++ Zope3/branches/adamg-mechanize-update/src/mechanize/_useragent.py	2007-07-13 13:21:22 UTC (rev 77856)
@@ -13,40 +13,31 @@
 
 import sys, warnings, urllib2
 
-from _opener import OpenerDirector
-
+import _opener
 import _urllib2
 import _auth
 import _gzip
+import _response
 
 
-class HTTPRefererProcessor(_urllib2.BaseHandler):
-    def http_request(self, request):
-        # See RFC 2616 14.36.  The only times we know the source of the
-        # request URI has a URI associated with it are redirect, and
-        # Browser.click() / Browser.submit() / Browser.follow_link().
-        # Otherwise, it's the user's job to add any Referer header before
-        # .open()ing.
-        if hasattr(request, "redirect_dict"):
-            request = self.parent._add_referer_header(
-                request, origin_request=False)
-        return request
-
-    https_request = http_request
-
-
-class UserAgent(OpenerDirector):
+class UserAgentBase(_opener.OpenerDirector):
     """Convenient user-agent class.
 
     Do not use .add_handler() to add a handler for something already dealt with
     by this code.
 
+    The only reason at present for the distinction between UserAgent and
+    UserAgentBase is so that classes that depend on .seek()able responses
+    (e.g. mechanize.Browser) can inherit from UserAgentBase.  The subclass
+    UserAgent exposes a .set_seekable_responses() method that allows switching
+    off the adding of a .seek() method to responses.
+
     Public attributes:
 
     addheaders: list of (name, value) pairs specifying headers to send with
      every request, unless they are overridden in the Request instance.
 
-     >>> ua = UserAgent()
+     >>> ua = UserAgentBase()
      >>> ua.addheaders = [
      ...  ("User-agent", "Mozilla/5.0 (compatible)"),
      ...  ("From", "responsible.person at example.com")]
@@ -74,9 +65,7 @@
         "_redirect": _urllib2.HTTPRedirectHandler,
         "_cookies": _urllib2.HTTPCookieProcessor,
         "_refresh": _urllib2.HTTPRefreshProcessor,
-        "_referer": HTTPRefererProcessor,  # from this module, note
         "_equiv": _urllib2.HTTPEquivProcessor,
-        "_seek": _urllib2.SeekableProcessor,
         "_proxy": _urllib2.ProxyHandler,
         "_proxy_basicauth": _urllib2.ProxyBasicAuthHandler,
         "_proxy_digestauth": _urllib2.ProxyDigestAuthHandler,
@@ -92,18 +81,18 @@
     default_others = ["_unknown", "_http_error", "_http_request_upgrade",
                       "_http_default_error",
                       ]
-    default_features = ["_redirect", "_cookies", "_referer",
+    default_features = ["_redirect", "_cookies",
                         "_refresh", "_equiv",
                         "_basicauth", "_digestauth",
                         "_proxy", "_proxy_basicauth", "_proxy_digestauth",
-                        "_seek", "_robots",
+                        "_robots",
                         ]
     if hasattr(_urllib2, 'HTTPSHandler'):
         handler_classes["https"] = _urllib2.HTTPSHandler
         default_schemes.append("https")
 
     def __init__(self):
-        OpenerDirector.__init__(self)
+        _opener.OpenerDirector.__init__(self)
 
         ua_handlers = self._ua_handlers = {}
         for scheme in (self.default_schemes+
@@ -116,7 +105,7 @@
 
         # Yuck.
         # Ensure correct default constructor args were passed to
-        # HTTPRefererProcessor and HTTPEquivProcessor.
+        # HTTPRefreshProcessor and HTTPEquivProcessor.
         if "_refresh" in ua_handlers:
             self.set_handle_refresh(True)
         if "_equiv" in ua_handlers:
@@ -130,12 +119,13 @@
             ppm = _auth.HTTPProxyPasswordMgr()
         self.set_password_manager(pm)
         self.set_proxy_password_manager(ppm)
+        # set default certificate manager
+        if "https" in ua_handlers:
+            cm = _urllib2.HTTPSClientCertMgr()
+            self.set_client_cert_manager(cm)
 
-        # special case, requires extra support from mechanize.Browser
-        self._handle_referer = True
-
     def close(self):
-        OpenerDirector.close(self)
+        _opener.OpenerDirector.close(self)
         self._ua_handlers = None
 
     # XXX
@@ -175,10 +165,6 @@
         for scheme in want.keys():
             self._set_handler(scheme, True)
 
-    def _add_referer_header(self, request, origin_request=True):
-        raise NotImplementedError(
-            "this class can't do HTTP Referer: use mechanize.Browser instead")
-
     def set_cookiejar(self, cookiejar):
         """Set a mechanize.CookieJar, or None."""
         self._set_handler("_cookies", obj=cookiejar)
@@ -200,6 +186,25 @@
         self._proxy_password_manager.add_password(
             realm, hostport, user, password)
 
+    def add_client_certificate(self, url, key_file, cert_file):
+        """Add an SSL client certificate, for HTTPS client auth.
+
+        key_file and cert_file must be filenames of the key and certificate
+        files, in PEM format.  You can use e.g. OpenSSL to convert a p12 (PKCS
+        12) file to PEM format:
+
+        openssl pkcs12 -clcerts -nokeys -in cert.p12 -out cert.pem
+        openssl pkcs12 -nocerts -in cert.p12 -out key.pem
+
+
+        Note that client certificate password input is currently very
+        inflexible: it seems to be console-only, which is presumably the
+        default behaviour of OpenSSL.  In future, mechanize may support
+        third-party libraries that (I assume) allow more options here.
+
+        """
+        self._client_cert_manager.add_key_cert(url, key_file, cert_file)
+
     # the following are rarely useful -- use add_password / add_proxy_password
     # instead
     def set_password_manager(self, password_manager):
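
A usage sketch for the client-certificate support added in this hunk; it assumes key.pem and cert.pem were produced with the openssl commands quoted in the docstring, and the URL is only illustrative:

    import mechanize

    ua = mechanize.UserAgent()
    # present this key/certificate pair when connecting to matching HTTPS URLs
    ua.add_client_certificate("https://secure.example.com/", "key.pem", "cert.pem")
    response = ua.open("https://secure.example.com/protected/")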
@@ -212,6 +217,11 @@
         self._proxy_password_manager = password_manager
         self._set_handler("_proxy_basicauth", obj=password_manager)
         self._set_handler("_proxy_digestauth", obj=password_manager)
+    def set_client_cert_manager(self, cert_manager):
+        """Set a mechanize.HTTPClientCertMgr, or None."""
+        self._client_cert_manager = cert_manager
+        handler = self._ua_handlers["https"]
+        handler.client_cert_manager = cert_manager
 
     # these methods all take a boolean parameter
     def set_handle_robots(self, handle):
@@ -227,7 +237,8 @@
     def set_handle_equiv(self, handle, head_parser_class=None):
         """Set whether to treat HTML http-equiv headers like HTTP headers.
 
-        Response objects will be .seek()able if this is set.
+        Response objects may be .seek()able if this is set (currently returned
+        responses are, raised HTTPError exception responses are not).
 
         """
         if head_parser_class is not None:
@@ -235,16 +246,6 @@
         else:
             constructor_kwds={}
         self._set_handler("_equiv", handle, constructor_kwds=constructor_kwds)
-    def set_handle_referer(self, handle):
-        """Set whether to add Referer header to each request.
-
-        This base class does not implement this feature (so don't turn this on
-        if you're using this base class directly), but the subclass
-        mechanize.Browser does.
-
-        """
-        self._set_handler("_referer", handle)
-        self._handle_referer = bool(handle)
     def set_handle_gzip(self, handle):
         """Handle gzip transfer encoding.
 
@@ -284,6 +285,9 @@
 
         See docstring for .set_debug_redirects() for details of logging.
 
+        Response objects may be .seek()able if this is set (currently returned
+        responses are, raised HTTPError exception responses are not).
+
         """
         self._set_handler("_debug_response_body", handle)
     def set_debug_http(self, handle):
@@ -321,3 +325,24 @@
         if newhandler is not None:
             self.add_handler(newhandler)
             self._ua_handlers[name] = newhandler
+
+
+class UserAgent(UserAgentBase):
+
+    def __init__(self):
+        UserAgentBase.__init__(self)
+        self._seekable = False
+
+    def set_seekable_responses(self, handle):
+        """Make response objects .seek()able."""
+        self._seekable = bool(handle)
+
+    def open(self, fullurl, data=None):
+        if self._seekable:
+            def bound_open(fullurl, data=None):
+                return UserAgentBase.open(self, fullurl, data)
+            response = _opener.wrapped_open(
+                bound_open, _response.seek_wrapped_response, fullurl, data)
+        else:
+            response = UserAgentBase.open(self, fullurl, data)
+        return response
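
The only behavioural difference the new UserAgent subclass adds over UserAgentBase is the optional seek-wrapping of responses shown just above.  A short sketch of the intended use (URL illustrative):

    import mechanize

    ua = mechanize.UserAgent()
    ua.set_seekable_responses(True)     # wrap responses so .seek() works
    r = ua.open("http://example.com/")
    first_kb = r.read(1024)
    r.seek(0)                           # rewind and re-read from the start
    whole_body = r.read()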

Modified: Zope3/branches/adamg-mechanize-update/src/mechanize/_util.py
===================================================================
--- Zope3/branches/adamg-mechanize-update/src/mechanize/_util.py	2007-07-13 11:38:53 UTC (rev 77855)
+++ Zope3/branches/adamg-mechanize-update/src/mechanize/_util.py	2007-07-13 13:21:22 UTC (rev 77856)
@@ -1,4 +1,4 @@
-"""Python backwards-compat., date/time routines, seekable file object wrapper.
+"""Utility functions and date/time routines.
 
  Copyright 2002-2006 John J Lee <jjl at pobox.com>
 
@@ -8,32 +8,21 @@
 
 """
 
-import re, string, time, copy, urllib, mimetools
-from types import TupleType
-from cStringIO import StringIO
+import re, string, time, warnings
 
-def startswith(string, initial):
-    if len(initial) > len(string): return False
-    return string[:len(initial)] == initial
+def deprecation(message):
+    warnings.warn(message, DeprecationWarning, stacklevel=3)
+def hide_deprecations():
+    warnings.filterwarnings('ignore', category=DeprecationWarning)
+def reset_deprecations():
+    warnings.filterwarnings('default', category=DeprecationWarning)
 
-def endswith(string, final):
-    if len(final) > len(string): return False
-    return string[-len(final):] == final
 
 def isstringlike(x):
     try: x+""
     except: return False
     else: return True
 
-SPACE_DICT = {}
-for c in string.whitespace:
-    SPACE_DICT[c] = None
-del c
-def isspace(string):
-    for c in string:
-        if not SPACE_DICT.has_key(c): return False
-    return True
-
 ## def caller():
 ##     try:
 ##         raise SyntaxError
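
The deprecation()/hide_deprecations()/reset_deprecations() helpers added at the top of this hunk are thin wrappers over the warnings module; stacklevel=3 makes the emitted DeprecationWarning point at the caller of the deprecated API rather than at the helper itself.  A small sketch with a hypothetical old_api():

    import warnings

    def deprecation(message):
        # same helper as above: blame the caller of the deprecated API
        warnings.warn(message, DeprecationWarning, stacklevel=3)

    def old_api():
        deprecation("old_api() is deprecated; use new_api() instead")

    # mirror reset_deprecations() so the warning is actually displayed
    warnings.filterwarnings('default', category=DeprecationWarning)
    old_api()   # warning is reported against this line, not inside deprecation()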
@@ -42,33 +31,6 @@
 ##     return sys.exc_traceback.tb_frame.f_back.f_back.f_code.co_name
 
 
-# this is here rather than in _HeadersUtil as it's just for
-# compatibility with old Python versions, rather than entirely new code
-def getheaders(msg, name):
-    """Get all values for a header.
-
-    This returns a list of values for headers given more than once; each
-    value in the result list is stripped in the same way as the result of
-    getheader().  If the header is not given, return an empty list.
-    """
-    result = []
-    current = ''
-    have_header = 0
-    for s in msg.getallmatchingheaders(name):
-        if isspace(s[0]):
-            if current:
-                current = "%s\n %s" % (current, string.strip(s))
-            else:
-                current = string.strip(s)
-        else:
-            if have_header:
-                result.append(current)
-            current = string.strip(s[string.find(s, ":") + 1:])
-            have_header = 1
-    if have_header:
-        result.append(current)
-    return result
-
 from calendar import timegm
 
 # Date/time conversion routines for formats used by the HTTP protocol.
@@ -86,7 +48,7 @@
 months = ["Jan", "Feb", "Mar", "Apr", "May", "Jun",
           "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]
 months_lower = []
-for month in months: months_lower.append(string.lower(month))
+for month in months: months_lower.append(month.lower())
 
 
 def time2isoz(t=None):
@@ -144,7 +106,7 @@
     # translate month name to number
     # month numbers start with 1 (January)
     try:
-        mon = months_lower.index(string.lower(mon))+1
+        mon = months_lower.index(mon.lower())+1
     except ValueError:
         # maybe it's already a number
         try:
@@ -185,7 +147,7 @@
         # adjust time using timezone string, to get absolute time since epoch
         if tz is None:
             tz = "UTC"
-        tz = string.upper(tz)
+        tz = tz.upper()
         offset = offset_from_tz_string(tz)
         if offset is None:
             return None
@@ -247,7 +209,7 @@
     m = strict_re.search(text)
     if m:
         g = m.groups()
-        mon = months_lower.index(string.lower(g[1])) + 1
+        mon = months_lower.index(g[1].lower()) + 1
         tt = (int(g[2]), mon, int(g[0]),
               int(g[3]), int(g[4]), float(g[5]))
         return my_timegm(tt)
@@ -255,7 +217,7 @@
     # No, we need some messy parsing...
 
     # clean up
-    text = string.lstrip(text)
+    text = text.lstrip()
     text = wkday_re.sub("", text, 1)  # Useless weekday
 
     # tz is time zone specifier string
@@ -300,7 +262,7 @@
 
     """
     # clean up
-    text = string.lstrip(text)
+    text = text.lstrip()
 
     # tz is time zone specifier string
     day, mon, yr, hr, min, sec, tz = [None]*7
@@ -315,340 +277,3 @@
         return None  # bad format
 
     return _str2time(day, mon, yr, hr, min, sec, tz)
-
-
-# XXX Andrew Dalke kindly sent me a similar class in response to my request on
-# comp.lang.python, which I then proceeded to lose.  I wrote this class
-# instead, but I think he's released his code publicly since, could pinch the
-# tests from it, at least...
-
-# For testing seek_wrapper invariant (note that
-# test_urllib2.HandlerTest.test_seekable is expected to fail when this
-# invariant checking is turned on).  The invariant checking is done by module
-# ipdc, which is available here:
-# http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/436834
-## from ipdbc import ContractBase
-## class seek_wrapper(ContractBase):
-class seek_wrapper:
-    """Adds a seek method to a file object.
-
-    This is only designed for seeking on readonly file-like objects.
-
-    Wrapped file-like object must have a read method.  The readline method is
-    only supported if that method is present on the wrapped object.  The
-    readlines method is always supported.  xreadlines and iteration are
-    supported only for Python 2.2 and above.
-
-    Public attribute: wrapped (the wrapped file object).
-
-    WARNING: All other attributes of the wrapped object (ie. those that are not
-    one of wrapped, read, readline, readlines, xreadlines, __iter__ and next)
-    are passed through unaltered, which may or may not make sense for your
-    particular file object.
-
-    """
-    # General strategy is to check that cache is full enough, then delegate to
-    # the cache (self.__cache, which is a cStringIO.StringIO instance).  A seek
-    # position (self.__pos) is maintained independently of the cache, in order
-    # that a single cache may be shared between multiple seek_wrapper objects.
-    # Copying using module copy shares the cache in this way.
-
-    def __init__(self, wrapped):
-        self.wrapped = wrapped
-        self.__have_readline = hasattr(self.wrapped, "readline")
-        self.__cache = StringIO()
-        self.__pos = 0  # seek position
-
-    def invariant(self):
-        # The end of the cache is always at the same place as the end of the
-        # wrapped file.
-        return self.wrapped.tell() == len(self.__cache.getvalue())
-
-    def __getattr__(self, name):
-        wrapped = self.__dict__.get("wrapped")
-        if wrapped:
-            return getattr(wrapped, name)
-        return getattr(self.__class__, name)
-
-    def seek(self, offset, whence=0):
-        assert whence in [0,1,2]
-
-        # how much data, if any, do we need to read?
-        if whence == 2:  # 2: relative to end of *wrapped* file
-            if offset < 0: raise ValueError("negative seek offset")
-            # since we don't know yet where the end of that file is, we must
-            # read everything
-            to_read = None
-        else:
-            if whence == 0:  # 0: absolute
-                if offset < 0: raise ValueError("negative seek offset")
-                dest = offset
-            else:  # 1: relative to current position
-                pos = self.__pos
-                if pos < offset:
-                    raise ValueError("seek to before start of file")
-                dest = pos + offset
-            end = len(self.__cache.getvalue())
-            to_read = dest - end
-            if to_read < 0:
-                to_read = 0
-
-        if to_read != 0:
-            self.__cache.seek(0, 2)
-            if to_read is None:
-                assert whence == 2
-                self.__cache.write(self.wrapped.read())
-                self.__pos = self.__cache.tell() - offset
-            else:
-                self.__cache.write(self.wrapped.read(to_read))
-                # Don't raise an exception even if we've seek()ed past the end
-                # of .wrapped, since fseek() doesn't complain in that case.
-                # Also like fseek(), pretend we have seek()ed past the end,
-                # i.e. not:
-                #self.__pos = self.__cache.tell()
-                # but rather:
-                self.__pos = dest
-        else:
-            self.__pos = dest
-
-    def tell(self):
-        return self.__pos
-
-    def __copy__(self):
-        cpy = self.__class__(self.wrapped)
-        cpy.__cache = self.__cache
-        return cpy
-
-    def get_data(self):
-        pos = self.__pos
-        try:
-            self.seek(0)
-            return self.read(-1)
-        finally:
-            self.__pos = pos
-
-    def read(self, size=-1):
-        pos = self.__pos
-        end = len(self.__cache.getvalue())
-        available = end - pos
-
-        # enough data already cached?
-        if size <= available and size != -1:
-            self.__cache.seek(pos)
-            self.__pos = pos+size
-            return self.__cache.read(size)
-
-        # no, so read sufficient data from wrapped file and cache it
-        if self.wrapped.read is None:
-            # XXX oops, wrapped file-like-object isn't valid, ignore it
-            return ''
-
-        self.__cache.seek(0, 2)
-        if size == -1:
-            self.__cache.write(self.wrapped.read())
-        else:
-            to_read = size - available
-            assert to_read > 0
-            self.__cache.write(self.wrapped.read(to_read))
-        self.__cache.seek(pos)
-
-        data = self.__cache.read(size)
-        self.__pos = self.__cache.tell()
-        assert self.__pos == pos + len(data)
-        return data
-
-    def readline(self, size=-1):
-        if not self.__have_readline:
-            raise NotImplementedError("no readline method on wrapped object")
-
-        # line we're about to read might not be complete in the cache, so
-        # read another line first
-        pos = self.__pos
-        self.__cache.seek(0, 2)
-        self.__cache.write(self.wrapped.readline())
-        self.__cache.seek(pos)
-
-        data = self.__cache.readline()
-        if size != -1:
-            r = data[:size]
-            self.__pos = pos+size
-        else:
-            r = data
-            self.__pos = pos+len(data)
-        return r
-
-    def readlines(self, sizehint=-1):
-        pos = self.__pos
-        self.__cache.seek(0, 2)
-        self.__cache.write(self.wrapped.read())
-        self.__cache.seek(pos)
-        data = self.__cache.readlines(sizehint)
-        self.__pos = self.__cache.tell()
-        return data
-
-    def __iter__(self): return self
-    def next(self):
-        line = self.readline()
-        if line == "": raise StopIteration
-        return line
-
-    xreadlines = __iter__
-
-    def __repr__(self):
-        return ("<%s at %s whose wrapped object = %r>" %
-                (self.__class__.__name__, hex(id(self)), self.wrapped))
-
-
-class response_seek_wrapper(seek_wrapper):
-
-    """
-    Supports copying response objects and setting response body data.
-
-    """
-
-    def __init__(self, wrapped):
-        seek_wrapper.__init__(self, wrapped)
-        self._headers = self.wrapped.info()
-
-    def __copy__(self):
-        cpy = seek_wrapper.__copy__(self)
-        # copy headers from delegate
-        cpy._headers = copy.copy(self.info())
-        return cpy
-
-    def info(self):
-        return self._headers
-
-    def set_data(self, data):
-        self.seek(0)
-        self.read()
-        self.close()
-        cache = self._seek_wrapper__cache = StringIO()
-        cache.write(data)
-        self.seek(0)
-
-
-class eoffile:
-    # file-like object that always claims to be at end-of-file...
-    def read(self, size=-1): return ""
-    def readline(self, size=-1): return ""
-    def __iter__(self): return self
-    def next(self): return ""
-    def close(self): pass
-
-class eofresponse(eoffile):
-    def __init__(self, url, headers, code, msg):
-        self._url = url
-        self._headers = headers
-        self.code = code
-        self.msg = msg
-    def geturl(self): return self._url
-    def info(self): return self._headers
-
-
-class closeable_response:
-    """Avoids unnecessarily clobbering urllib.addinfourl methods on .close().
-
-    Only supports responses returned by mechanize.HTTPHandler.
-
-    After .close(), the following methods are supported:
-
-    .read()
-    .readline()
-    .readlines()
-    .seek()
-    .tell()
-    .info()
-    .geturl()
-    .__iter__()
-    .next()
-    .close()
-
-    and the following attributes are supported:
-
-    .code
-    .msg
-
-    Also supports pickling (but the stdlib currently does something to prevent
-    it: http://python.org/sf/1144636).
-
-    """
-    # presence of this attr indicates is useable after .close()
-    closeable_response = None
-
-    def __init__(self, fp, headers, url, code, msg):
-        self._set_fp(fp)
-        self._headers = headers
-        self._url = url
-        self.code = code
-        self.msg = msg
-
-    def _set_fp(self, fp):
-        self.fp = fp
-        self.read = self.fp.read
-        self.readline = self.fp.readline
-        if hasattr(self.fp, "readlines"): self.readlines = self.fp.readlines
-        if hasattr(self.fp, "fileno"):
-            self.fileno = self.fp.fileno
-        else:
-            self.fileno = lambda: None
-        if hasattr(self.fp, "__iter__"):
-            self.__iter__ = self.fp.__iter__
-            if hasattr(self.fp, "next"):
-                self.next = self.fp.next
-
-    def __repr__(self):
-        return '<%s at %s whose fp = %r>' % (
-            self.__class__.__name__, hex(id(self)), self.fp)
-
-    def info(self):
-        return self._headers
-
-    def geturl(self):
-        return self._url
-
-    def close(self):
-        wrapped = self.fp
-        wrapped.close()
-        new_wrapped = eofresponse(
-            self._url, self._headers, self.code, self.msg)
-        self._set_fp(new_wrapped)
-
-    def __getstate__(self):
-        # There are three obvious options here:
-        # 1. truncate
-        # 2. read to end
-        # 3. close socket, pickle state including read position, then open
-        #    again on unpickle and use Range header
-        # XXXX um, 4. refuse to pickle unless .close()d.  This is better,
-        #  actually ("errors should never pass silently").  Pickling doesn't
-        #  work anyway ATM, because of http://python.org/sf/1144636 so fix
-        #  this later
-
-        # 2 breaks pickle protocol, because one expects the original object
-        # to be left unscathed by pickling.  3 is too complicated and
-        # surprising (and too much work ;-) to happen in a sane __getstate__.
-        # So we do 1.
-
-        state = self.__dict__.copy()
-        new_wrapped = eofresponse(
-            self._url, self._headers, self.code, self.msg)
-        state["wrapped"] = new_wrapped
-        return state
-
-def make_response(data, headers, url, code, msg):
-    """Convenient factory for objects implementing response interface.
-
-    data: string containing response body data
-    headers: sequence of (name, value) pairs
-    url: URL of response
-    code: integer response code (e.g. 200)
-    msg: string response code message (e.g. "OK")
-
-    """
-    hdr_text = []
-    for name_value in headers:
-        hdr_text.append("%s: %s" % name_value)
-    mime_headers = mimetools.Message(StringIO("\n".join(hdr_text)))
-    r = closeable_response(StringIO(data), mime_headers, url, code, msg)
-    return response_seek_wrapper(r)
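
The seek_wrapper / response_seek_wrapper / closeable_response / make_response code removed above (presumably superseded by the new _response module that _useragent.py now imports) makes read-only responses seekable by caching everything read from the wrapped stream in a StringIO.  A stripped-down sketch of just that caching idea, not a drop-in replacement:

    from cStringIO import StringIO

    class MiniSeekWrapper:
        """Adds seek(0)/tell() to a read-only, non-seekable file-like object."""
        def __init__(self, wrapped):
            self.wrapped = wrapped
            self._cache = StringIO()   # everything read from wrapped so far
            self._pos = 0              # logical read position

        def read(self, size=-1):
            cached = self._cache.getvalue()
            if size < 0:
                # pull the rest of the stream into the cache
                self._cache.write(self.wrapped.read())
                cached = self._cache.getvalue()
                data = cached[self._pos:]
            else:
                missing = self._pos + size - len(cached)
                if missing > 0:
                    self._cache.write(self.wrapped.read(missing))
                    cached = self._cache.getvalue()
                data = cached[self._pos:self._pos + size]
            self._pos += len(data)
            return data

        def tell(self):
            return self._pos

        def seek(self, offset, whence=0):
            if whence != 0:
                raise NotImplementedError("sketch supports absolute seek only")
            self._pos = offset

Wrapping e.g. a socket file object this way lets callers read part of the body, seek(0), and read it again without another network round trip, which is what mechanize's seekable-response handling relies on.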


