[Zope3-checkins] CVS: Zope3/src/zope/textindex - __init__.py:1.2 baseindex.py:1.2 cosineindex.py:1.2 htmlsplitter.py:1.2 iindex.py:1.2 ilexicon.py:1.2 inbest.py:1.2 ipipelineelement.py:1.2 ipipelineelementfactory.py:1.2 iqueryparser.py:1.2 iqueryparsetree.py:1.2 isplitter.py:1.2 lexicon.py:1.2 nbest.py:1.2 okapiindex.py:1.2 parsetree.py:1.2 pipelinefactory.py:1.2 queryparser.py:1.2 ricecode.py:1.2 setops.py:1.2 stopdict.py:1.2 textindexinterfaces.py:1.2 textindexwrapper.py:1.2 widcode.py:1.2

Jim Fulton <jim@zope.com>
Wed, 25 Dec 2002 09:16:07 -0500


Update of /cvs-repository/Zope3/src/zope/textindex
In directory cvs.zope.org:/tmp/cvs-serv20790/src/zope/textindex

Added Files:
	__init__.py baseindex.py cosineindex.py htmlsplitter.py 
	iindex.py ilexicon.py inbest.py ipipelineelement.py 
	ipipelineelementfactory.py iqueryparser.py iqueryparsetree.py 
	isplitter.py lexicon.py nbest.py okapiindex.py parsetree.py 
	pipelinefactory.py queryparser.py ricecode.py setops.py 
	stopdict.py textindexinterfaces.py textindexwrapper.py 
	widcode.py 
Log Message:
Grand renaming:

- Renamed most files (especially python modules) to lower case.

- Moved views and interfaces into separate hierarchies within each
  project, where each top-level directory under the zope package
  is a separate project.

- Moved everything to src from lib/python.

  lib/python will eventually go away. I need access to the cvs
  repository to make this happen, however.

There are probably some bits that are broken. All tests pass
and zope runs, but I haven't tried everything. There are a number
of cleanups I'll work on tomorrow.



=== Zope3/src/zope/textindex/__init__.py 1.1 => 1.2 ===
--- /dev/null	Wed Dec 25 09:16:05 2002
+++ Zope3/src/zope/textindex/__init__.py	Wed Dec 25 09:15:34 2002
@@ -0,0 +1,2 @@
+#
+# This file is necessary to make this directory a package.


=== Zope3/src/zope/textindex/baseindex.py 1.1 => 1.2 ===
--- /dev/null	Wed Dec 25 09:16:05 2002
+++ Zope3/src/zope/textindex/baseindex.py	Wed Dec 25 09:15:34 2002
@@ -0,0 +1,314 @@
+##############################################################################
+#
+# Copyright (c) 2002 Zope Corporation and Contributors.
+# All Rights Reserved.
+#
+# This software is subject to the provisions of the Zope Public License,
+# Version 2.0 (ZPL).  A copy of the ZPL should accompany this distribution.
+# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
+# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
+# FOR A PARTICULAR PURPOSE
+#
+##############################################################################
+
+"""Abstract base class for full text index with relevance ranking."""
+
+
+import math
+
+import zodb
+
+from persistence import Persistent
+
+from zodb.btrees.IOBTree import IOBTree
+from zodb.btrees.IIBTree import IIBTree, IIBucket, IITreeSet
+from zodb.btrees.IIBTree import intersection, difference
+from zodb.btrees import Length
+
+from zope.textindex.iindex import IIndex
+from zope.textindex import widcode
+from zope.textindex.setops import mass_weightedIntersection, \
+                                  mass_weightedUnion
+
+
+# Instead of storing floats, we generally store scaled ints.  Binary pickles
+# can store those more efficiently.  The default SCALE_FACTOR of 1024
+# is large enough to get about 3 decimal digits of fractional info, and
+# small enough so that scaled values should almost always fit in a signed
+# 16-bit int (we're generally storing logs, so a few bits before the radix
+# point goes a long way; on the flip side, for reasonably small numbers x
+# most of the info in log(x) is in the fractional bits, so we do want to
+# save a lot of those).
+SCALE_FACTOR = 1024.0
+
+def scaled_int(f, scale=SCALE_FACTOR):
+    # We expect only positive inputs, so "add a half and chop" is the
+    # same as round().  Surprisingly, calling round() is significantly more
+    # expensive.
+    return int(f * scale + 0.5)
+
+def unique(L):
+    """Return a list of the unique elements in L."""
+    return IITreeSet(L).keys()
+
+class BaseIndex(Persistent):
+
+    __implements__ = IIndex
+
+    def __init__(self, lexicon):
+        self._lexicon = lexicon
+
+        # wid -> {docid -> weight}; t -> D -> w(D, t)
+        # Different indexers have different notions of term weight, but we
+        # expect each indexer to use ._wordinfo to map wids to its notion
+        # of a docid-to-weight map.
+        # There are two kinds of OOV words:  wid 0 is explicitly OOV,
+        # and it's possible that the lexicon will return a non-zero wid
+        # for a word we don't currently know about.  For example, if we
+        # unindex the last doc containing a particular word, that wid
+        # remains in the lexicon, but is no longer in our _wordinfo map;
+        # lexicons can also be shared across indices, and some other index
+        # may introduce a lexicon word we've never seen.
+        # A word is in-vocabulary for this index if and only if
+        # _wordinfo.has_key(wid).  Note that wid 0 must not be a key.
+        self._wordinfo = IOBTree()
+
+        # docid -> weight
+        # Different indexers have different notions of doc weight, but we
+        # expect each indexer to use ._docweight to map docids to its
+        # notion of what a doc weight is.
+        self._docweight = IIBTree()
+
+        # docid -> WidCode'd list of wids
+        # Used for un-indexing, and for phrase search.
+        self._docwords = IOBTree()
+
+        # Use a BTree length for efficient length computation w/o conflicts
+        self.wordCount = Length.Length()
+
+    def wordCount(self):
+        """Return the number of words in the index."""
+        # This is overridden per instance
+        return len(self._wordinfo)
+
+    def documentCount(self):
+        """Return the number of documents in the index."""
+        return len(self._docweight)
+
+    def get_words(self, docid):
+        """Return a list of the wordids for a given docid."""
+        # Note this is overridden in the instance
+        return widcode.decode(self._docwords[docid])
+
+    # A subclass may wish to extend or override this.
+    def index_doc(self, docid, text):
+        if self._docwords.has_key(docid):
+            return self._reindex_doc(docid, text)
+        wids = self._lexicon.sourceToWordIds(text)
+        wid2weight, docweight = self._get_frequencies(wids)
+        self._mass_add_wordinfo(wid2weight, docid)
+        self._docweight[docid] = docweight
+        self._docwords[docid] = widcode.encode(wids)
+        return len(wids)
+
+    # A subclass may wish to extend or override this.  This is for adjusting
+    # to a new version of a doc that already exists.  The goal is to be
+    # faster than simply unindexing the old version in its entirety and then
+    # adding the new version in its entirety.
+    def _reindex_doc(self, docid, text):
+        # Touch as few docid->w(docid, score) maps in ._wordinfo as possible.
+        old_wids = self.get_words(docid)
+        old_wid2w, old_docw = self._get_frequencies(old_wids)
+
+        new_wids = self._lexicon.sourceToWordIds(text)
+        new_wid2w, new_docw = self._get_frequencies(new_wids)
+
+        old_widset = IITreeSet(old_wid2w.keys())
+        new_widset = IITreeSet(new_wid2w.keys())
+
+        in_both_widset = intersection(old_widset, new_widset)
+        only_old_widset = difference(old_widset, in_both_widset)
+        only_new_widset = difference(new_widset, in_both_widset)
+        del old_widset, new_widset
+
+        for wid in only_old_widset.keys():
+            self._del_wordinfo(wid, docid)
+
+        for wid in only_new_widset.keys():
+            self._add_wordinfo(wid, new_wid2w[wid], docid)
+
+        for wid in in_both_widset.keys():
+            # For the Okapi indexer, the "if" will trigger only for words
+            # whose counts have changed.  For the cosine indexer, the "if"
+            # may trigger for every wid, since W(d) probably changed and
+            # W(d) is divided into every score.
+            newscore = new_wid2w[wid]
+            if old_wid2w[wid] != newscore:
+                self._add_wordinfo(wid, newscore, docid)
+
+        self._docweight[docid] = new_docw
+        self._docwords[docid] = widcode.encode(new_wids)
+        return len(new_wids)
+
+    # Subclass must override.
+    def _get_frequencies(self, wids):
+        # Compute term frequencies and a doc weight, whatever those mean
+        # to an indexer.
+        # Return pair:
+    #    {wid0: w(d, wid0), wid1: w(d, wid1), ...},
+        #    docweight
+        # The wid->weight mappings are fed into _add_wordinfo, and docweight
+        # becomes the value of _docweight[docid].
+        raise NotImplementedError
+
+    def has_doc(self, docid):
+        return self._docwords.has_key(docid)
+
+    # A subclass may wish to extend or override this.
+    def unindex_doc(self, docid):
+        for wid in unique(self.get_words(docid)):
+            self._del_wordinfo(wid, docid)
+        del self._docwords[docid]
+        del self._docweight[docid]
+
+    def search(self, term):
+        wids = self._lexicon.termToWordIds(term)
+        if not wids:
+            return None # All docs match
+        wids = self._remove_oov_wids(wids)
+        return mass_weightedUnion(self._search_wids(wids))
+
+    def search_glob(self, pattern):
+        wids = self._lexicon.globToWordIds(pattern)
+        wids = self._remove_oov_wids(wids)
+        return mass_weightedUnion(self._search_wids(wids))
+
+    def search_phrase(self, phrase):
+        wids = self._lexicon.termToWordIds(phrase)
+        cleaned_wids = self._remove_oov_wids(wids)
+        if len(wids) != len(cleaned_wids):
+            # At least one wid was OOV:  can't possibly find it.
+            return IIBTree()
+        scores = self._search_wids(wids)
+        hits = mass_weightedIntersection(scores)
+        if not hits:
+            return hits
+        code = widcode.encode(wids)
+        result = IIBTree()
+        for docid, weight in hits.items():
+            docwords = self._docwords[docid]
+            if docwords.find(code) >= 0:
+                result[docid] = weight
+        return result
+
+    def _remove_oov_wids(self, wids):
+        return filter(self._wordinfo.has_key, wids)
+
+    # Subclass must override.
+    # The workhorse.  Return a list of (IIBucket, weight) pairs, one pair
+    # for each wid t in wids.  The IIBucket, times the weight, maps D to
+    # TF(D,t) * IDF(t) for every docid D containing t.  wids must not
+    # contain any OOV words.
+    def _search_wids(self, wids):
+        raise NotImplementedError
+
+    # Subclass must override.
+    # It's not clear what it should do.  It must return an upper bound on
+    # document scores for the query.  It would be nice if a document score
+    # divided by the query's query_weight gave the probability that a
+    # document was relevant, but nobody knows how to do that.  For
+    # CosineIndex, the ratio is the cosine of the angle between the document
+    # and query vectors.  For OkapiIndex, the ratio is a (probably
+    # unachievable) upper bound with no "intuitive meaning" beyond that.
+    def query_weight(self, terms):
+        raise NotImplementedError
+
+    DICT_CUTOFF = 10
+
+    def _add_wordinfo(self, wid, f, docid):
+        # Store a wordinfo in a dict as long as there are fewer than
+        # DICT_CUTOFF docids in the dict.  Otherwise use an IIBTree.
+
+        # The pickle of a dict is smaller than the pickle of an
+        # IIBTree, substantially so for small mappings.  Thus, we use
+        # a dictionary until the mapping reaches DICT_CUTOFF elements.
+
+        # The cutoff is chosen based on the implementation
+        # characteristics of Python dictionaries.  The dict hashtable
+        # always has 2**N slots and is resized whenever it is 2/3s
+        # full.  A pickled dict with 10 elts is half the size of an
+        # IIBTree with 10 elts, and 10 happens to be 2/3s of 2**4.  So
+        # choose 10 as the cutoff for now.
+
+        # The IIBTree has a smaller in-memory representation than a
+        # dictionary, so pickle size isn't the only consideration when
+        # choosing the threshold.  The pickle of a 500-elt dict is 92%
+        # of the size of the same IIBTree, but the dict uses more
+        # space when it is live in memory.  An IIBTree stores two C
+        # arrays of ints, one for the keys and one for the values.  It
+        # holds up to 120 key-value pairs in a single bucket.
+        doc2score = self._wordinfo.get(wid)
+        if doc2score is None:
+            doc2score = {}
+            self.wordCount.change(1)
+        else:
+            # _add_wordinfo() is called for each update.  If the map
+            # size exceeds the DICT_CUTOFF, convert to an IIBTree.
+            # Obscure:  First check the type.  If it's not a dict, it
+            # can't need conversion, and then we can avoid an expensive
+            # len(IIBTree).
+            if (isinstance(doc2score, type({})) and
+                len(doc2score) == self.DICT_CUTOFF):
+                doc2score = IIBTree(doc2score)
+        doc2score[docid] = f
+        self._wordinfo[wid] = doc2score # not redundant:  Persistency!
+
+    #    self._mass_add_wordinfo(wid2weight, docid)
+    #
+    # is the same as
+    #
+    #    for wid, weight in wid2weight.items():
+    #        self._add_wordinfo(wid, weight, docid)
+    #
+    # except that _mass_add_wordinfo doesn't require so many function calls.
+    def _mass_add_wordinfo(self, wid2weight, docid):
+        dicttype = type({})
+        get_doc2score = self._wordinfo.get
+        new_word_count = 0
+        for wid, weight in wid2weight.items():
+            doc2score = get_doc2score(wid)
+            if doc2score is None:
+                doc2score = {}
+                new_word_count += 1
+            elif (isinstance(doc2score, dicttype) and
+                  len(doc2score) == self.DICT_CUTOFF):
+                doc2score = IIBTree(doc2score)
+            doc2score[docid] = weight
+            self._wordinfo[wid] = doc2score # not redundant:  Persistency!
+        self.wordCount.change(new_word_count)
+
+
+    def _del_wordinfo(self, wid, docid):
+        doc2score = self._wordinfo[wid]
+        del doc2score[docid]
+        numdocs = len(doc2score)
+        if numdocs == 0:
+            del self._wordinfo[wid]
+            self.wordCount.change(-1)
+            return
+        if numdocs == self.DICT_CUTOFF:
+            new = {}
+            for k, v in doc2score.items():
+                new[k] = v
+            doc2score = new
+        self._wordinfo[wid] = doc2score # not redundant:  Persistency!
+
+def inverse_doc_frequency(term_count, num_items):
+    """Return the inverse doc frequency for a term,
+
+    that appears in term_count items in a collection with num_items
+    total items.
+    """
+    # implements IDF(q, t) = log(1 + N/f(t))
+    return math.log(1.0 + float(num_items) / term_count)
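
For a rough sense of how these two helpers combine (an illustrative
sketch, not part of the checked-in file): a term appearing in 3 of 1000
documents gets

    import math
    idf = math.log(1.0 + 1000.0 / 3)   # inverse_doc_frequency(3, 1000), ~5.8
    stored = int(idf * 1024.0 + 0.5)   # scaled_int(idf), about 5952

so the float weight is persisted as a compact scaled integer rather than
a pickled float.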


=== Zope3/src/zope/textindex/cosineindex.py 1.1 => 1.2 ===
--- /dev/null	Wed Dec 25 09:16:05 2002
+++ Zope3/src/zope/textindex/cosineindex.py	Wed Dec 25 09:15:34 2002
@@ -0,0 +1,136 @@
+##############################################################################
+#
+# Copyright (c) 2002 Zope Corporation and Contributors.
+# All Rights Reserved.
+#
+# This software is subject to the provisions of the Zope Public License,
+# Version 2.0 (ZPL).  A copy of the ZPL should accompany this distribution.
+# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
+# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
+# FOR A PARTICULAR PURPOSE
+#
+##############################################################################
+
+"""Full text index with relevance ranking, using a cosine measure."""
+
+import math
+
+from zodb.btrees.IIBTree import IIBucket
+
+from zope.textindex.iindex import IIndex
+from zope.textindex.baseindex import BaseIndex, \
+                                     inverse_doc_frequency, \
+                                     scaled_int, SCALE_FACTOR
+
+class CosineIndex(BaseIndex):
+
+    __implements__ = IIndex
+
+    def __init__(self, lexicon):
+        BaseIndex.__init__(self, lexicon)
+
+        # ._wordinfo for cosine is wid -> {docid -> weight};
+        # t -> D -> w(d, t)/W(d)
+
+        # ._docweight for cosine is
+        # docid -> W(docid)
+
+    # Most of the computation for computing a relevance score for the
+    # document occurs in the _search_wids() method.  The code currently
+    # implements the cosine similarity function described in Managing
+    # Gigabytes, eq. 4.3, p. 187.  The index_object() method
+    # precomputes some values that are independent of the particular
+    # query.
+
+    # The equation is
+    #
+    #                     sum(for t in I(d,q): w(d,t) * w(q,t))
+    #     cosine(d, q) =  -------------------------------------
+    #                                  W(d) * W(q)
+    #
+    # where
+    #    I(d, q) = the intersection of the terms in d and q.
+    #
+    #    w(d, t) = 1 + log f(d, t)
+    #        computed by doc_term_weight(); for a given word t,
+    #        self._wordinfo[t] is a map from d to w(d, t).
+    #
+    #    w(q, t) = log(1 + N/f(t))
+    #        computed by inverse_doc_frequency()
+    #
+    #    W(d) = sqrt(sum(for t in d: w(d, t) ** 2))
+    #        computed by _get_frequencies(), and remembered in
+    #        self._docweight[d]
+    #
+    #    W(q) = sqrt(sum(for t in q: w(q, t) ** 2))
+    #        computed by self.query_weight()
+
+    def _search_wids(self, wids):
+        if not wids:
+            return []
+        N = float(len(self._docweight))
+        L = []
+        DictType = type({})
+        for wid in wids:
+            assert self._wordinfo.has_key(wid)  # caller responsible for OOV
+            d2w = self._wordinfo[wid] # maps docid to w(docid, wid)
+            idf = inverse_doc_frequency(len(d2w), N)  # an unscaled float
+            #print "idf = %.3f" % idf
+            if isinstance(d2w, DictType):
+                d2w = IIBucket(d2w)
+            L.append((d2w, scaled_int(idf)))
+        return L
+
+    def query_weight(self, terms):
+        wids = []
+        for term in terms:
+            wids += self._lexicon.termToWordIds(term)
+        N = float(len(self._docweight))
+        sum = 0.0
+        for wid in self._remove_oov_wids(wids):
+            wt = inverse_doc_frequency(len(self._wordinfo[wid]), N)
+            sum += wt ** 2.0
+        return scaled_int(math.sqrt(sum))
+
+    def _get_frequencies(self, wids):
+        d = {}
+        dget = d.get
+        for wid in wids:
+            d[wid] = dget(wid, 0) + 1
+        Wsquares = 0.0
+        for wid, count in d.items():
+            w = doc_term_weight(count)
+            Wsquares += w * w
+            d[wid] = w
+        W = math.sqrt(Wsquares)
+        #print "W = %.3f" % W
+        for wid, weight in d.items():
+            #print i, ":", "%.3f" % weight,
+            d[wid] = scaled_int(weight / W)
+            #print "->", d[wid]
+        return d, scaled_int(W)
+
+    # The rest are helper methods to support unit tests
+
+    def _get_wdt(self, d, t):
+        wid, = self._lexicon.termToWordIds(t)
+        map = self._wordinfo[wid]
+        return map.get(d, 0) * self._docweight[d] / SCALE_FACTOR
+
+    def _get_Wd(self, d):
+        return self._docweight[d]
+
+    def _get_ft(self, t):
+        wid, = self._lexicon.termToWordIds(t)
+        return len(self._wordinfo[wid])
+
+    def _get_wt(self, t):
+        wid, = self._lexicon.termToWordIds(t)
+        map = self._wordinfo[wid]
+        return scaled_int(math.log(1 + len(self._docweight) / float(len(map))))
+
+def doc_term_weight(count):
+    """Return the doc-term weight for a term that appears count times."""
+    # implements w(d, t) = 1 + log f(d, t)
+    return 1.0 + math.log(count)
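
A small worked example of the weights described above (a sketch using
only the formulas in this file, not part of the check-in): for a
document where "zope" occurs 3 times and "index" once,

    import math

    counts = {"zope": 3, "index": 1}
    weights = {}
    Wsquares = 0.0
    for term, count in counts.items():
        w = 1.0 + math.log(count)          # doc_term_weight(count)
        weights[term] = w
        Wsquares += w * w
    W = math.sqrt(Wsquares)                # W(d), kept in _docweight[docid]
    for term in weights.keys():
        weights[term] = weights[term] / W  # w(d, t)/W(d), kept in _wordinfo

which is exactly what _get_frequencies() computes before scaling the
results with scaled_int().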


=== Zope3/src/zope/textindex/htmlsplitter.py 1.1 => 1.2 ===
--- /dev/null	Wed Dec 25 09:16:05 2002
+++ Zope3/src/zope/textindex/htmlsplitter.py	Wed Dec 25 09:15:34 2002
@@ -0,0 +1,55 @@
+##############################################################################
+#
+# Copyright (c) 2002 Zope Corporation and Contributors.
+# All Rights Reserved.
+#
+# This software is subject to the provisions of the Zope Public License,
+# Version 2.0 (ZPL).  A copy of the ZPL should accompany this distribution.
+# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
+# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
+# FOR A PARTICULAR PURPOSE.
+#
+##############################################################################
+
+import re
+
+from zope.textindex.isplitter import ISplitter
+from zope.textindex.pipelinefactory import element_factory
+
+
+class HTMLWordSplitter:
+
+    __implements__ = ISplitter
+
+    def process(self, text, wordpat=r"(?L)\w+"):
+        splat = []
+        for t in text:
+            splat += self._split(t, wordpat)
+        return splat
+
+    def processGlob(self, text):
+        # see Lexicon.globToWordIds()
+        return self.process(text, r"(?L)\w+[\w*?]*")
+
+    def _split(self, text, wordpat):
+        text = text.lower()
+        remove = [r"<[^<>]*>",
+                  r"&[A-Za-z]+;"]
+        for pat in remove:
+            text = re.sub(pat, " ", text)
+        return re.findall(wordpat, text)
+
+element_factory.registerFactory('Word Splitter',
+                                'HTML aware splitter',
+                                HTMLWordSplitter)
+
+if __name__ == "__main__":
+    import sys
+    splitter = HTMLWordSplitter()
+    for path in sys.argv[1:]:
+        f = open(path, "rb")
+        buf = f.read()
+        f.close()
+        print path
+        print splitter.process([buf])
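
For example (illustrative only), the splitter strips markup and
entities before splitting:

    from zope.textindex.htmlsplitter import HTMLWordSplitter

    splitter = HTMLWordSplitter()
    print splitter.process(["<h1>Zope&nbsp;3</h1> Text Index"])
    # -> ['zope', '3', 'text', 'index']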


=== Zope3/src/zope/textindex/iindex.py 1.1 => 1.2 ===
--- /dev/null	Wed Dec 25 09:16:05 2002
+++ Zope3/src/zope/textindex/iindex.py	Wed Dec 25 09:15:34 2002
@@ -0,0 +1,74 @@
+##############################################################################
+#
+# Copyright (c) 2002 Zope Corporation and Contributors.
+# All Rights Reserved.
+#
+# This software is subject to the provisions of the Zope Public License,
+# Version 2.0 (ZPL).  A copy of the ZPL should accompany this distribution.
+# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
+# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
+# FOR A PARTICULAR PURPOSE.
+#
+##############################################################################
+
+"""Index Interface."""
+
+from zope.interface import Interface
+
+class IIndex(Interface):
+    """Interface for an Index."""
+
+    def wordCount():
+        """Return the number of words in the index."""
+
+    def documentCount():
+        """Return the number of documents in the index."""
+
+    def get_words(docid):
+        """Return a list of wordids for the given docid."""
+
+    def search(term):
+        """Execute a search on a single term given as a string.
+
+        Return an IIBTree mapping docid to score, or None if all docs
+        match due to the lexicon returning no wids for the term (e.g.,
+        if the term is entirely composed of stopwords).
+        """
+
+    def search_phrase(phrase):
+        """Execute a search on a phrase given as a string.
+
+        Return an IIBTree mapping docid to score.
+        """
+
+    def search_glob(pattern):
+        """Execute a pattern search.
+
+        The pattern represents a set of words by using * and ?.  For
+        example, "foo*" represents the set of all words in the lexicon
+        starting with "foo".
+
+        Return an IIBTree mapping docid to score.
+        """
+
+    def query_weight(terms):
+        """Return the weight for a set of query terms.
+
+        'terms' is a sequence of all terms included in the query,
+        excluding negated terms.  If a term appears more than
+        once in a query, it should appear more than once in terms.
+
+        Nothing is defined about what "weight" means, beyond that the
+        result is an upper bound on document scores returned for the
+        query.
+        """
+
+    def index_doc(docid, text):
+        "XXX"
+
+    def unindex_doc(docid):
+        "XXX"
+
+    def has_doc(docid):
+        """Returns true if docid is an id of a document in the index"""


=== Zope3/src/zope/textindex/ilexicon.py 1.1 => 1.2 ===
--- /dev/null	Wed Dec 25 09:16:05 2002
+++ Zope3/src/zope/textindex/ilexicon.py	Wed Dec 25 09:15:34 2002
@@ -0,0 +1,75 @@
+##############################################################################
+#
+# Copyright (c) 2002 Zope Corporation and Contributors.
+# All Rights Reserved.
+#
+# This software is subject to the provisions of the Zope Public License,
+# Version 2.0 (ZPL).  A copy of the ZPL should accompany this distribution.
+# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
+# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
+# FOR A PARTICULAR PURPOSE
+#
+##############################################################################
+
+from zope.interface import Interface
+
+class ILexicon(Interface):
+    """Object responsible for converting text to word identifiers."""
+
+    def termToWordIds(text):
+        """Return a sequence of ids of the words parsed from the text.
+
+        The input text may be either a string or a list of strings.
+
+        Parse the text as search terms, skipping words that aren't
+        in the lexicon.
+        """
+
+    def sourceToWordIds(text):
+        """Return a sequence of ids of the words parsed from the text.
+
+        The input text may be either a string or a list of strings.
+
+        Parse the text as coming from a source document, and
+        create new word ids for words that aren't (yet) in the
+        lexicon.
+        """
+
+    def globToWordIds(pattern):
+        """Return a sequence of ids of words matching the pattern.
+
+        The argument should be a single word using globbing syntax,
+        e.g. 'foo*' meaning anything starting with 'foo'.
+
+        Return the wids for all words in the lexicon that match the
+        pattern.
+        """
+
+    def wordCount():
+        """Return the number of unique terms in the lexicon."""
+
+    def get_word(wid):
+        """Return the word for the given word id.
+
+        Raise KeyError if the word id is not in the lexicon.
+        """
+
+    def get_wid(word):
+        """Return the wird id for the given word.
+
+        Return 0 if the word is not in the lexicon.
+        """
+
+    def parseTerms(text):
+        """Pass the text through the pipeline.
+
+        Return a list of words, normalized by the pipeline
+        (e.g. stopwords removed, case normalized etc.).
+        """
+
+    def isGlob(word):
+        """Return true if the word is a globbing pattern.
+
+        The word should be one of the words returned by parseTerms().
+        """


=== Zope3/src/zope/textindex/inbest.py 1.1 => 1.2 ===
--- /dev/null	Wed Dec 25 09:16:05 2002
+++ Zope3/src/zope/textindex/inbest.py	Wed Dec 25 09:15:34 2002
@@ -0,0 +1,73 @@
+##############################################################################
+#
+# Copyright (c) 2002 Zope Corporation and Contributors.
+# All Rights Reserved.
+#
+# This software is subject to the provisions of the Zope Public License,
+# Version 2.0 (ZPL).  A copy of the ZPL should accompany this distribution.
+# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
+# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
+# FOR A PARTICULAR PURPOSE.
+#
+##############################################################################
+
+"""NBest Interface.
+
+An NBest object remembers the N best-scoring items ever passed to its
+.add(item, score) method.  If .add() is called M times, the worst-case
+number of comparisons performed overall is M * log2(N).
+"""
+
+
+from zope.interface import Interface
+
+class INBest(Interface):
+    """Interface for an N-Best chooser."""
+
+    def add(item, score):
+        """Record that item 'item' has score 'score'.  No return value.
+
+        The N best-scoring items are remembered, where N was passed to
+        the constructor.  'item' can be anything.  'score' should be
+        a number, and larger numbers are considered better.
+        """
+
+    def addmany(sequence):
+        """Like "for item, score in sequence: self.add(item, score)".
+
+        This is simply faster than calling add() len(seq) times.
+        """
+
+    def getbest():
+        """Return the (at most) N best-scoring items as a sequence.
+
+        The return value is a sequence of 2-tuples, (item, score), with
+        the largest score first.  If .add() has been called fewer than
+        N times, this sequence will contain fewer than N pairs.
+        """
+
+    def pop_smallest():
+        """Return and remove the (item, score) pair with lowest score.
+
+        If len(self) is 0, raise IndexError.
+
+        To be clear, this is the lowest score among the N best-scoring
+        items seen so far.  This is most useful if the capacity of the
+        NBest object is never exceeded, in which case pop_smallest() allows
+        using the object as an ordinary smallest-in-first-out priority
+        queue.
+        """
+
+    def __len__():
+        """Return the number of (item, score) pairs currently known.
+
+        This is N (the value passed to the constructor), unless .add()
+        has been called fewer than N times.
+        """
+
+    def capacity():
+        """Return the maximum number of (item, score) pairs.
+
+        This is N (the value passed to the constructor).
+        """


=== Zope3/src/zope/textindex/ipipelineelement.py 1.1 => 1.2 ===
--- /dev/null	Wed Dec 25 09:16:05 2002
+++ Zope3/src/zope/textindex/ipipelineelement.py	Wed Dec 25 09:15:34 2002
@@ -0,0 +1,29 @@
+##############################################################################
+#
+# Copyright (c) 2002 Zope Corporation and Contributors.
+# All Rights Reserved.
+#
+# This software is subject to the provisions of the Zope Public License,
+# Version 2.0 (ZPL).  A copy of the ZPL should accompany this distribution.
+# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
+# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
+# FOR A PARTICULAR PURPOSE
+#
+##############################################################################
+
+from zope.interface import Interface
+
+class IPipelineElement(Interface):
+
+    def process(source):
+        """Provide a text processing step.
+
+        Process a source sequence of words into a result sequence.
+        """
+
+    def processGlob(source):
+        """Process, passing through globbing metacharaters.
+
+        This is an optional method; if it is not provided, process() is used.
+        """


=== Zope3/src/zope/textindex/ipipelineelementfactory.py 1.1 => 1.2 ===
--- /dev/null	Wed Dec 25 09:16:05 2002
+++ Zope3/src/zope/textindex/ipipelineelementfactory.py	Wed Dec 25 09:15:34 2002
@@ -0,0 +1,39 @@
+##############################################################################
+#
+# Copyright (c) 2002 Zope Corporation and Contributors.
+# All Rights Reserved.
+#
+# This software is subject to the provisions of the Zope Public License,
+# Version 2.0 (ZPL).  A copy of the ZPL should accompany this distribution.
+# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
+# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
+# FOR A PARTICULAR PURPOSE
+#
+##############################################################################
+
+from zope.interface import Interface
+
+class IPipelineElementFactory(Interface):
+    """Class for creating pipeline elements by name"""
+
+    def registerFactory(group, name, factory):
+        """Registers a pipeline factory by name and element group.
+
+        Each name can be registered only once for a given group. Duplicate
+        registrations will raise a ValueError
+        """
+
+    def getFactoryGroups():
+        """Returns a sorted list of element group names
+        """
+
+    def getFactoryNames(group):
+        """Returns a sorted list of registered pipeline factory names
+        in the specified element group
+        """
+
+    def instantiate(group, name):
+        """Instantiates a pipeline element by group and name. If name is not
+        registered raise a KeyError.
+        """


=== Zope3/src/zope/textindex/iqueryparser.py 1.1 => 1.2 ===
--- /dev/null	Wed Dec 25 09:16:05 2002
+++ Zope3/src/zope/textindex/iqueryparser.py	Wed Dec 25 09:15:34 2002
@@ -0,0 +1,53 @@
+##############################################################################
+#
+# Copyright (c) 2002 Zope Corporation and Contributors.
+# All Rights Reserved.
+#
+# This software is subject to the provisions of the Zope Public License,
+# Version 2.0 (ZPL).  A copy of the ZPL should accompany this distribution.
+# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
+# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
+# FOR A PARTICULAR PURPOSE.
+#
+##############################################################################
+
+"""Query Parser Interface."""
+
+from zope.interface import Interface
+
+class IQueryParser(Interface):
+    """Interface for Query Parsers."""
+
+    def parseQuery(query):
+        """Parse a query string.
+
+        Return a parse tree (which implements IQueryParseTree).
+
+        Some of the query terms may be ignored because they are
+        stopwords; use getIgnored() to find out which terms were
+        ignored.  But if the entire query consists only of stop words,
+        or of stopwords and one or more negated terms, an exception is
+        raised.
+
+        May raise ParseTree.ParseError.
+        """
+
+    def getIgnored():
+        """Return the list of ignored terms.
+
+        Return the list of terms that were ignored by the most recent
+        call to parseQuery() because they were stopwords.
+
+        If parseQuery() was never called this returns None.
+        """
+
+    def parseQueryEx(query):
+        """Parse a query string.
+
+        Return a tuple (tree, ignored) where 'tree' is the parse tree
+        as returned by parseQuery(), and 'ignored' is a list of
+        ignored terms as returned by getIgnored().
+
+        May raise ParseTree.ParseError.
+        """


=== Zope3/src/zope/textindex/iqueryparsetree.py 1.1 => 1.2 ===
--- /dev/null	Wed Dec 25 09:16:06 2002
+++ Zope3/src/zope/textindex/iqueryparsetree.py	Wed Dec 25 09:15:34 2002
@@ -0,0 +1,52 @@
+##############################################################################
+#
+# Copyright (c) 2002 Zope Corporation and Contributors.
+# All Rights Reserved.
+#
+# This software is subject to the provisions of the Zope Public License,
+# Version 2.0 (ZPL).  A copy of the ZPL should accompany this distribution.
+# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
+# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
+# FOR A PARTICULAR PURPOSE.
+#
+##############################################################################
+
+"""Query Parser Tree Interface."""
+
+from zope.interface import Interface
+
+class IQueryParseTree(Interface):
+    """Interface for parse trees returned by parseQuery()."""
+
+    def nodeType():
+        """Return the node type.
+
+        This is one of 'AND', 'OR', 'NOT', 'ATOM', 'PHRASE' or 'GLOB'.
+        """
+
+    def getValue():
+        """Return a node-type specific value.
+
+        For node type:    Return:
+        'AND'             a list of parse trees
+        'OR'              a list of parse trees
+        'NOT'             a parse tree
+        'ATOM'            a string (representing a single search term)
+        'PHRASE'          a string (representing a search phrase)
+        'GLOB'            a string (representing a pattern, e.g. "foo*")
+        """
+
+    def terms():
+        """Return a list of all terms in this node, excluding NOT subtrees."""
+
+    def executeQuery(index):
+        """Execute the query represented by this node against the index.
+
+        The index argument must implement the IIndex interface.
+
+        Return an IIBucket or IIBTree mapping document ids to scores
+        (higher scores mean better results).
+
+        May raise ParseTree.QueryError.
+        """


=== Zope3/src/zope/textindex/isplitter.py 1.1 => 1.2 ===
--- /dev/null	Wed Dec 25 09:16:06 2002
+++ Zope3/src/zope/textindex/isplitter.py	Wed Dec 25 09:15:34 2002
@@ -0,0 +1,21 @@
+##############################################################################
+#
+# Copyright (c) 2002 Zope Corporation and Contributors.
+# All Rights Reserved.
+#
+# This software is subject to the provisions of the Zope Public License,
+# Version 2.0 (ZPL).  A copy of the ZPL should accompany this distribution.
+# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
+# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
+# FOR A PARTICULAR PURPOSE
+#
+##############################################################################
+
+from zope.interface import Interface
+
+class ISplitter(Interface):
+    """A splitter."""
+
+    def process(text):
+        """Run the splitter over the input text, returning a list of terms."""


=== Zope3/src/zope/textindex/lexicon.py 1.1 => 1.2 ===
--- /dev/null	Wed Dec 25 09:16:06 2002
+++ Zope3/src/zope/textindex/lexicon.py	Wed Dec 25 09:15:34 2002
@@ -0,0 +1,219 @@
+##############################################################################
+#
+# Copyright (c) 2002 Zope Corporation and Contributors.
+# All Rights Reserved.
+#
+# This software is subject to the provisions of the Zope Public License,
+# Version 2.0 (ZPL).  A copy of the ZPL should accompany this distribution.
+# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
+# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
+# FOR A PARTICULAR PURPOSE
+#
+##############################################################################
+
+import re
+
+import zodb
+
+from zodb.btrees.IOBTree import IOBTree
+from zodb.btrees.OIBTree import OIBTree
+
+from persistence import Persistent
+
+from zope.textindex.ilexicon import ILexicon
+from zope.textindex.stopdict import get_stopdict
+from zope.textindex.parsetree import QueryError
+from zope.textindex.pipelinefactory import element_factory
+
+
+class Lexicon(Persistent):
+
+    __implements__ = ILexicon
+
+    def __init__(self, *pipeline):
+        self._wids = OIBTree()  # word -> wid
+        self._words = IOBTree() # wid -> word
+        # wid 0 is reserved for words that aren't in the lexicon (OOV -- out
+        # of vocabulary).  This can happen, e.g., if a query contains a word
+        # we never saw before, and that isn't a known stopword (or otherwise
+        # filtered out).  Returning a special wid value for OOV words is a
+        # way to let clients know when an OOV word appears.
+        self._nextwid = 1
+        self._pipeline = pipeline
+
+        # Keep some statistics about indexing
+        self._nbytes = 0 # Number of bytes indexed (at start of pipeline)
+        self._nwords = 0 # Number of words indexed (after pipeline)
+
+    def wordCount(self):
+        """Return the number of unique terms in the lexicon."""
+        return self._nextwid - 1
+
+    def words(self):
+        return self._wids.keys()
+
+    def wids(self):
+        return self._words.keys()
+
+    def items(self):
+        return self._wids.items()
+
+    def sourceToWordIds(self, text):
+        last = _text2list(text)
+        for t in last:
+            self._nbytes += len(t)
+        for element in self._pipeline:
+            last = element.process(last)
+        self._nwords += len(last)
+        return map(self._getWordIdCreate, last)
+
+    def termToWordIds(self, text):
+        last = _text2list(text)
+        for element in self._pipeline:
+            last = element.process(last)
+        wids = []
+        for word in last:
+            wids.append(self._wids.get(word, 0))
+        return wids
+
+    def parseTerms(self, text):
+        last = _text2list(text)
+        for element in self._pipeline:
+            process = getattr(element, "processGlob", element.process)
+            last = process(last)
+        return last
+
+    def isGlob(self, word):
+        return "*" in word or "?" in word
+
+    def get_word(self, wid):
+        return self._words[wid]
+
+    def get_wid(self, word):
+        return self._wids.get(word, 0)
+
+    def globToWordIds(self, pattern):
+        # Implement * and ? just as in the shell, except the pattern
+        # must not start with either of these
+        prefix = ""
+        while pattern and pattern[0] not in "*?":
+            prefix += pattern[0]
+            pattern = pattern[1:]
+        if not pattern:
+            # There were no globbing characters in the pattern
+            wid = self._wids.get(prefix, 0)
+            if wid:
+                return [wid]
+            else:
+                return []
+        if not prefix:
+            # The pattern starts with a globbing character.
+            # This is too inefficient, so we raise an exception.
+            raise QueryError(
+                "pattern %r shouldn't start with glob character" % pattern)
+        pat = prefix
+        for c in pattern:
+            if c == "*":
+                pat += ".*"
+            elif c == "?":
+                pat += "."
+            else:
+                pat += re.escape(c)
+        pat += "$"
+        prog = re.compile(pat)
+        keys = self._wids.keys(prefix) # Keys starting at prefix
+        wids = []
+        for key in keys:
+            if not key.startswith(prefix):
+                break
+            if prog.match(key):
+                wids.append(self._wids[key])
+        return wids
+
+    def _getWordIdCreate(self, word):
+        wid = self._wids.get(word)
+        if wid is None:
+            wid = self._new_wid()
+            self._wids[word] = wid
+            self._words[wid] = word
+        return wid
+
+    def _new_wid(self):
+        wid = self._nextwid
+        self._nextwid += 1
+        return wid
+
+def _text2list(text):
+    # Helper: splitter input may be a string or a list of strings
+    try:
+        text + u""
+    except:
+        return text
+    else:
+        return [text]
+
+# Sample pipeline elements
+
+class Splitter:
+
+    import re
+    rx = re.compile(r"(?u)\w+")
+    rxGlob = re.compile(r"(?u)\w+[\w*?]*") # See globToWordIds() above
+
+    def process(self, lst):
+        result = []
+        for s in lst:
+            result += self.rx.findall(s)
+        return result
+
+    def processGlob(self, lst):
+        result = []
+        for s in lst:
+            result += self.rxGlob.findall(s)
+        return result
+
+element_factory.registerFactory('Word Splitter',
+                                 'Whitespace splitter',
+                                 Splitter)
+
+class CaseNormalizer:
+
+    def process(self, lst):
+        return [w.lower() for w in lst]
+
+element_factory.registerFactory('Case Normalizer',
+                                'Case Normalizer',
+                                CaseNormalizer)
+
+element_factory.registerFactory('Stop Words',
+                                ' Don\'t remove stop words',
+                                None)
+
+class StopWordRemover:
+
+    dict = get_stopdict().copy()
+
+    try:
+        from zope.textindex.stopper import process as _process
+    except ImportError:
+        def process(self, lst):
+            has_key = self.dict.has_key
+            return [w for w in lst if not has_key(w)]
+    else:
+        def process(self, lst):
+            return self._process(self.dict, lst)
+
+element_factory.registerFactory('Stop Words',
+                                'Remove listed stop words only',
+                                StopWordRemover)
+
+class StopWordAndSingleCharRemover(StopWordRemover):
+
+    dict = get_stopdict().copy()
+    for c in range(255):
+        dict[chr(c)] = None
+
+element_factory.registerFactory('Stop Words',
+                                'Remove listed and single char words',
+                                StopWordAndSingleCharRemover)
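
Putting the sample pipeline elements together (an illustrative sketch;
it assumes "the" is in the stop-word dictionary from stopdict.py):

    from zope.textindex.lexicon import (Lexicon, Splitter,
                                        CaseNormalizer, StopWordRemover)

    lexicon = Lexicon(Splitter(), CaseNormalizer(), StopWordRemover())
    wids = lexicon.sourceToWordIds("The quick brown fox")
    # "the" is dropped by StopWordRemover; the other words get new wids.
    lexicon.termToWordIds("FOX")     # same wid as "fox" (CaseNormalizer)
    lexicon.termToWordIds("weasel")  # [0] -- out of vocabulary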


=== Zope3/src/zope/textindex/nbest.py 1.1 => 1.2 ===
--- /dev/null	Wed Dec 25 09:16:06 2002
+++ Zope3/src/zope/textindex/nbest.py	Wed Dec 25 09:15:34 2002
@@ -0,0 +1,76 @@
+##############################################################################
+#
+# Copyright (c) 2002 Zope Corporation and Contributors.
+# All Rights Reserved.
+#
+# This software is subject to the provisions of the Zope Public License,
+# Version 2.0 (ZPL).  A copy of the ZPL should accompany this distribution.
+# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
+# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
+# FOR A PARTICULAR PURPOSE
+#
+##############################################################################
+"""NBest
+
+An NBest object remembers the N best-scoring items ever passed to its
+.add(item, score) method.  If .add() is called M times, the worst-case
+number of comparisons performed overall is M * log2(N).
+"""
+
+from bisect import bisect_left as bisect
+
+from zope.textindex.inbest import INBest
+
+class NBest:
+    __implements__ = INBest
+
+    def __init__(self, N):
+        "Build an NBest object to remember the N best-scoring objects."
+
+        if N < 1:
+            raise ValueError("NBest() argument must be at least 1")
+        self._capacity = N
+
+        # This does a very simple thing with sorted lists.  For large
+        # N, a min-heap can be unboundedly better in terms of data
+        # movement time.
+        self._scores = []
+        self._items = []
+
+    def __len__(self):
+        return len(self._scores)
+
+    def capacity(self):
+        return self._capacity
+
+    def add(self, item, score):
+        self.addmany([(item, score)])
+
+    def addmany(self, sequence):
+        scores, items, capacity = self._scores, self._items, self._capacity
+        n = len(scores)
+        for item, score in sequence:
+            # When we're in steady-state, the usual case is that we're filled
+            # to capacity, and that an incoming item is worse than any of
+            # the best-seen so far.
+            if n >= capacity and score <= scores[0]:
+                continue
+            i = bisect(scores, score)
+            scores.insert(i, score)
+            items.insert(i, item)
+            if n == capacity:
+                del items[0], scores[0]
+            else:
+                n += 1
+        assert n == len(scores)
+
+    def getbest(self):
+        result = zip(self._items, self._scores)
+        result.reverse()
+        return result
+
+    def pop_smallest(self):
+        if self._scores:
+            return self._items.pop(0), self._scores.pop(0)
+        raise IndexError("pop_smallest() called on empty NBest object")


=== Zope3/src/zope/textindex/okapiindex.py 1.1 => 1.2 ===
--- /dev/null	Wed Dec 25 09:16:06 2002
+++ Zope3/src/zope/textindex/okapiindex.py	Wed Dec 25 09:15:34 2002
@@ -0,0 +1,343 @@
+##############################################################################
+#
+# Copyright (c) 2002 Zope Corporation and Contributors.
+# All Rights Reserved.
+#
+# This software is subject to the provisions of the Zope Public License,
+# Version 2.0 (ZPL).  A copy of the ZPL should accompany this distribution.
+# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
+# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
+# FOR A PARTICULAR PURPOSE
+#
+##############################################################################
+
+"""Full text index with relevance ranking, using an Okapi BM25 rank."""
+
+# Lots of comments are at the bottom of this file.  Read them to
+# understand what's going on.
+
+from zodb.btrees.IIBTree import IIBucket
+
+from zope.textindex.iindex import IIndex
+from zope.textindex.baseindex import \
+     BaseIndex, inverse_doc_frequency, scaled_int
+
+class OkapiIndex(BaseIndex):
+
+    __implements__ = IIndex
+
+    # BM25 free parameters.
+    K1 = 1.2
+    B  = 0.75
+    assert K1 >= 0.0
+    assert 0.0 <= B <= 1.0
+
+    def __init__(self, lexicon):
+        BaseIndex.__init__(self, lexicon)
+
+        # ._wordinfo for Okapi is
+        # wid -> {docid -> frequency}; t -> D -> f(D, t)
+
+        # ._docweight for Okapi is
+        # docid -> # of words in the doc
+        # This is just len(self._docwords[docid]), but _docwords is stored
+        # in compressed form, so uncompressing it just to count the list
+        # length would be ridiculously expensive.
+
+        # sum(self._docweight.values()), the total # of words in all docs
+        # This is a long for "better safe than sorry" reasons.  It isn't
+        # used often enough that speed should matter.
+        self._totaldoclen = 0L
+
+    def index_doc(self, docid, text):
+        count = BaseIndex.index_doc(self, docid, text)
+        self._totaldoclen += count
+        return count
+
+    def _reindex_doc(self, docid, text):
+        self._totaldoclen -= self._docweight[docid]
+        return BaseIndex._reindex_doc(self, docid, text)
+
+    def unindex_doc(self, docid):
+        self._totaldoclen -= self._docweight[docid]
+        BaseIndex.unindex_doc(self, docid)
+
+    # The workhorse.  Return a list of (IIBucket, weight) pairs, one pair
+    # for each wid t in wids.  The IIBucket, times the weight, maps D to
+    # TF(D,t) * IDF(t) for every docid D containing t.
+    # As currently written, the weights are always 1, and the IIBucket maps
+    # D to TF(D,t)*IDF(t) directly, where the product is computed as a float
+    # but stored as a scaled_int.
+    # NOTE:  This may be overridden below, by a function that computes the
+    # same thing but with the inner scoring loop in C.
+    def _search_wids(self, wids):
+        if not wids:
+            return []
+        N = float(len(self._docweight))  # total # of docs
+        meandoclen = self._totaldoclen / N
+        K1 = self.K1
+        B = self.B
+        K1_plus1 = K1 + 1.0
+        B_from1 = 1.0 - B
+
+        #                           f(D, t) * (k1 + 1)
+        #   TF(D, t) =  -------------------------------------------
+        #               f(D, t) + k1 * ((1-b) + b*len(D)/E(len(D)))
+
+        L = []
+        docid2len = self._docweight
+        for t in wids:
+            d2f = self._wordinfo[t] # map {docid -> f(docid, t)}
+            idf = inverse_doc_frequency(len(d2f), N)  # an unscaled float
+            result = IIBucket()
+            for docid, f in d2f.items():
+                lenweight = B_from1 + B * docid2len[docid] / meandoclen
+                tf = f * K1_plus1 / (f + K1 * lenweight)
+                result[docid] = scaled_int(tf * idf)
+            L.append((result, 1))
+        return L
+
+        # Note about the above:  the result is tf * idf.  tf is small -- it
+        # can't be larger than k1+1 = 2.2.  idf is formally unbounded, but
+        # is less than 14 for a term that appears in only 1 of a million
+        # documents.  So the product is probably less than 32, or 5 bits
+        # before the radix point.  If we did the scaled-int business on
+        # both of them, we'd be up to 25 bits.  Add 64 of those and we'd
+        # be in overflow territory.  That's pretty unlikely, so we *could*
+        # just store scaled_int(tf) in result[docid], and use scaled_int(idf)
+        # as an invariant weight across the whole result.  But besides
+        # skating near the edge, it's not a speed cure, since the computation
+        # of tf would still be done at Python speed, and it's a lot more
+        # work than just multiplying by idf.
+
+    # The same function as _search_wids above, but with the inner scoring
+    # loop written in C (module okascore, function score()).
+    # Cautions:  okascore hardcodes the values of K1, B, and the scaled_int
+    # function.
+    def _search_wids_NOTYET(self, wids):
+        if not wids:
+            return []
+        N = float(len(self._docweight))  # total # of docs
+        meandoclen = self._totaldoclen / N
+        #K1 = self.K1
+        #B = self.B
+        #K1_plus1 = K1 + 1.0
+        #B_from1 = 1.0 - B
+
+        #                           f(D, t) * (k1 + 1)
+        #   TF(D, t) =  -------------------------------------------
+        #               f(D, t) + k1 * ((1-b) + b*len(D)/E(len(D)))
+
+        L = []
+        docid2len = self._docweight
+        for t in wids:
+            d2f = self._wordinfo[t] # map {docid -> f(docid, t)}
+            idf = inverse_doc_frequency(len(d2f), N)  # an unscaled float
+            result = IIBucket()
+            score(result, d2f.items(), docid2len, idf, meandoclen)
+            L.append((result, 1))
+        return L
+
+    def query_weight(self, terms):
+        # Get the wids.
+        wids = []
+        for term in terms:
+            termwids = self._lexicon.termToWordIds(term)
+            wids.extend(termwids)
+        # The max score for term t is the maximum value of
+        #     TF(D, t) * IDF(Q, t)
+        # We can compute IDF directly, and as noted in the comments below
+        # TF(D, t) is bounded above by 1+K1.
+        N = float(len(self._docweight))
+        tfmax = 1.0 + self.K1
+        sum = 0
+        for t in self._remove_oov_wids(wids):
+            idf = inverse_doc_frequency(len(self._wordinfo[t]), N)
+            sum += scaled_int(idf * tfmax)
+        return sum
+
+    def _get_frequencies(self, wids):
+        d = {}
+        dget = d.get
+        for wid in wids:
+            d[wid] = dget(wid, 0) + 1
+        return d, len(wids)
+
+"""
+"Okapi" (much like "cosine rule" also) is a large family of scoring gimmicks.
+It's based on probability arguments about how words are distributed in
+documents, not on an abstract vector space model.  A long paper by its
+principal inventors gives an excellent overview of how it was derived:
+
+    A probabilistic model of information retrieval:  development and status
+    K. Sparck Jones, S. Walker, S.E. Robertson
+    http://citeseer.nj.nec.com/jones98probabilistic.html
+
+Spellings that ignore relevance information (which we don't have) are of this
+high-level form:
+
+    score(D, Q) = sum(for t in D&Q: TF(D, t) * IDF(Q, t))
+
+where
+
+    D         a specific document
+
+    Q         a specific query
+
+    t         a term (word, atomic phrase, whatever)
+
+    D&Q       the terms common to D and Q
+
+    TF(D, t)  a measure of t's importance in D -- a kind of term frequency
+              weight
+
+    IDF(Q, t) a measure of t's importance in the query and in the set of
+              documents as a whole -- a kind of inverse document frequency
+              weight
+
+The IDF(Q, t) here is identical to the one used for our cosine measure.
+Since queries are expected to be short, it ignores Q entirely:
+
+   IDF(Q, t) = log(1.0 + N / f(t))
+
+where
+
+   N        the total number of documents
+   f(t)     the number of documents in which t appears
+
+Most Okapi literature seems to use log(N/f(t)) instead.  We don't, because
+that becomes 0 for a term that's in every document, and, e.g., if someone
+is searching for "documentation" on python.org (a term that may well show
+up on every page, due to the top navigation bar), we still want to find the
+pages that use the word a lot (which is TF's job to find, not IDF's -- we
+just want to stop IDF from considering this t to be irrelevant).
+
+The TF(D, t) spellings are more interesting.  With lots of variations, the
+most basic spelling is of the form
+
+                   f(D, t)
+    TF(D, t) = ---------------
+                f(D, t) + K(D)
+
+where
+
+    f(D, t)   the number of times t appears in D
+    K(D)      a measure of the length of D, normalized to mean doc length
+
+The functional *form* f/(f+K) is clever.  It's a gross approximation to a
+mixture of two distinct Poisson distributions, based on the idea that t
+probably appears in D for one of two reasons:
+
+1. More or less at random.
+
+2. Because it's important to D's purpose in life ("eliteness" in papers).
+
+Note that f/(f+K) is always between 0 and 1.  If f is very large compared to
+K, it approaches 1.  If K is very large compared to f, it approaches 0.  If
+t appears in D more or less "for random reasons", f is likely to be small,
+and so K will dominate unless it's a very small doc, and the ratio will be
+small.  OTOH, if t appears a lot in D, f will dominate unless it's a very
+large doc, and the ratio will be close to 1.
+
+We use a variation on that simple theme, a simplification of what's called
+BM25 in the literature (it was the 25th stab at a Best Match function from
+the Okapi group; "a simplification" means we're setting some of BM25's more
+esoteric free parameters to 0):
+
+                f(D, t) * (k1 + 1)
+    TF(D, t) = --------------------
+                f(D, t) + k1 * K(D)
+
+where
+
+    k1      a "tuning factor", typically between 1.0 and 2.0.  We use 1.2,
+            the usual default value.  This constant adjusts the curve to
+            look more like a theoretical 2-Poisson curve.
+
+Note that as f(D, t) increases, TF(D, t) increases monotonically, approaching
+an asymptote of k1+1 from below.
+
+Finally, we use
+
+    K(D) = (1-b) + b * len(D)/E(len(D))
+
+where
+
+    b           is another free parameter, discussed below.  We use 0.75.
+
+    len(D)      the length of D in words
+
+    E(len(D))   the expected value of len(D) across the whole document set;
+                or, IOW, the average document length
+
+b is a free parameter between 0.0 and 1.0, and adjusts for the expected effect
+of the "Verbosity Hypothesis".  Suppose b is 1, and some word t appears
+10 times as often in document d2 as in document d1.  If document d2 is
+also 10 times as long as d1, TF(d1, t) and TF(d2, t) are identical:
+
+                     f(d2, t) * (k1 + 1)
+   TF(d2, t) = --------------------------------- =
+                f(d2, t) + k1 * len(d2)/E(len(D))
+
+                            10 * f(d1, t) * (k1 + 1)
+               ----------------------------------------------- = TF(d1, t)
+                10 * f(d1, t) + k1 * (10 * len(d1))/E(len(D))
+
+because the 10's cancel out.  This is appropriate if we believe that a word
+appearing 10x more often in a doc 10x as long is simply due to the
+longer doc being more verbose.  If we do believe that, the longer doc and the
+shorter doc are probably equally relevant.  OTOH, it *could* be that the
+longer doc is talking about t in greater depth too, in which case it's
+probably more relevant than the shorter doc.
+
+At the other extreme, if we set b to 0, the len(D)/E(len(D)) term vanishes
+completely, and a doc scores higher for having more occurrences of a word
+regardless of the doc's length.
+
+Reality is between these extremes, and probably varies by document and word
+too.  Reports in the literature suggest that b=0.75 is a good compromise "in
+general", favoring the "verbosity hypothesis" end of the scale.
+
+Putting it all together, the final TF function is
+
+                           f(D, t) * (k1 + 1)
+    TF(D, t) = --------------------------------------------
+                f(D, t) + k1 * ((1-b) + b*len(D)/E(len(D)))
+
+with k1=1.2 and b=0.75.
+
+
+Query Term Weighting
+--------------------
+
+I'm ignoring the query adjustment part of Okapi BM25 because I expect our
+queries are very short.  Full BM25 takes them into account by adding the
+following to every score(D, Q); it depends on the lengths of D and Q, but
+not on the specific words in Q, or even on whether they appear in D(!):
+
+                   E(len(D)) - len(D)
+    k2 * len(Q) * -------------------
+                   E(len(D)) + len(D)
+
+Here k2 is another "tuning constant", len(Q) is the number of words in Q, and
+len(D) & E(len(D)) were defined above. The Okapi group set k2 to 0 in TREC-9,
+so it apparently doesn't do much good (or may even hurt).
+
+Full BM25 *also* multiplies the following factor into IDF(Q, t):
+
+    f(Q, t) * (k3 + 1)
+    ------------------
+       f(Q, t) + k3
+
+where k3 is yet another free parameter, and f(Q,t) is the number of times t
+appears in Q.  Since we're using short "web style" queries, I expect f(Q,t)
+to always be 1, and then that quotient is
+
+     1 * (k3 + 1)
+     ------------ = 1
+        1 + k3
+
+regardless of k3's value.  So, in a trivial sense, we are incorporating
+this measure (and optimizing it by not bothering to multiply by 1 <wink>).
+"""


=== Zope3/src/zope/textindex/parsetree.py 1.1 => 1.2 ===
--- /dev/null	Wed Dec 25 09:16:06 2002
+++ Zope3/src/zope/textindex/parsetree.py	Wed Dec 25 09:15:34 2002
@@ -0,0 +1,132 @@
+##############################################################################
+#
+# Copyright (c) 2002 Zope Corporation and Contributors.
+# All Rights Reserved.
+#
+# This software is subject to the provisions of the Zope Public License,
+# Version 2.0 (ZPL).  A copy of the ZPL should accompany this distribution.
+# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
+# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
+# FOR A PARTICULAR PURPOSE.
+#
+##############################################################################
+
+"""Generic parser support: exception and parse tree nodes."""
+
+from zodb.btrees.IIBTree import difference
+
+from zope.textindex.iqueryparsetree import IQueryParseTree
+from zope.textindex.setops import mass_weightedIntersection, \
+                                  mass_weightedUnion
+
+class QueryError(Exception):
+    pass
+
+class ParseError(Exception):
+    pass
+
+class ParseTreeNode:
+
+    __implements__ = IQueryParseTree
+
+    _nodeType = None
+
+    def __init__(self, value):
+        self._value = value
+
+    def nodeType(self):
+        return self._nodeType
+
+    def getValue(self):
+        return self._value
+
+    def __repr__(self):
+        return "%s(%r)" % (self.__class__.__name__, self.getValue())
+
+    def terms(self):
+        t = []
+        for v in self.getValue():
+            t.extend(v.terms())
+        return t
+
+    def executeQuery(self, index):
+        raise NotImplementedError
+
+class NotNode(ParseTreeNode):
+
+    _nodeType = "NOT"
+
+    def terms(self):
+        return []
+
+    def executeQuery(self, index):
+        raise QueryError, "NOT parse tree node cannot be executed directly"
+
+class AndNode(ParseTreeNode):
+
+    _nodeType = "AND"
+
+    def executeQuery(self, index):
+        L = []
+        Nots = []
+        for subnode in self.getValue():
+            if subnode.nodeType() == "NOT":
+                r = subnode.getValue().executeQuery(index)
+                # If None, technically it matches every doc, but we treat
+                # it as if it matched none (we want
+                #     real_word AND NOT stop_word
+                # to act like plain real_word).
+                if r is not None:
+                    Nots.append((r, 1))
+            else:
+                r = subnode.executeQuery(index)
+                # If None, technically it matches every doc, so needn't be
+                # included.
+                if r is not None:
+                    L.append((r, 1))
+        set = mass_weightedIntersection(L)
+        if Nots:
+            notset = mass_weightedUnion(Nots)
+            set = difference(set, notset)
+        return set
+
+class OrNode(ParseTreeNode):
+
+    _nodeType = "OR"
+
+    def executeQuery(self, index):
+        weighted = []
+        for node in self.getValue():
+            r = node.executeQuery(index)
+            # If None, technically it matches every doc, but we treat
+            # it as if it matched none (we want
+            #     real_word OR stop_word
+            # to act like plain real_word).
+            if r is not None:
+                weighted.append((r, 1))
+        return mass_weightedUnion(weighted)
+
+class AtomNode(ParseTreeNode):
+
+    _nodeType = "ATOM"
+
+    def terms(self):
+        return [self.getValue()]
+
+    def executeQuery(self, index):
+        return index.search(self.getValue())
+
+class PhraseNode(AtomNode):
+
+    _nodeType = "PHRASE"
+
+    def executeQuery(self, index):
+        return index.search_phrase(self.getValue())
+
+class GlobNode(AtomNode):
+
+    _nodeType = "GLOB"
+
+    def executeQuery(self, index):
+        return index.search_glob(self.getValue())
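
A minimal sketch of building and inspecting a parse tree by hand (executing it
requires an index whose search methods return IIBucket-style score maps, as the
Okapi and cosine indexes above do):

    from zope.textindex import parsetree

    tree = parsetree.AndNode([
        parsetree.AtomNode(u"persistence"),
        parsetree.NotNode(parsetree.AtomNode(u"java")),
    ])
    print tree.nodeType()   # AND
    print tree.terms()      # [u'persistence'] -- NOT branches contribute no terms
    # tree.executeQuery(index) would intersect the positive results and
    # subtract the union of the NOT results, as AndNode above shows.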


=== Zope3/src/zope/textindex/pipelinefactory.py 1.1 => 1.2 ===
--- /dev/null	Wed Dec 25 09:16:06 2002
+++ Zope3/src/zope/textindex/pipelinefactory.py	Wed Dec 25 09:15:34 2002
@@ -0,0 +1,52 @@
+##############################################################################
+#
+# Copyright (c) 2002 Zope Corporation and Contributors.
+# All Rights Reserved.
+#
+# This software is subject to the provisions of the Zope Public License,
+# Version 2.0 (ZPL).  A copy of the ZPL should accompany this distribution.
+# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
+# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
+# FOR A PARTICULAR PURPOSE
+#
+##############################################################################
+
+from zope.textindex.ipipelineelementfactory \
+     import IPipelineElementFactory
+
+class PipelineElementFactory:
+
+    __implements__ = IPipelineElementFactory
+
+    def __init__(self):
+        self._groups = {}
+
+    def registerFactory(self, group, name, factory):
+        if self._groups.has_key(group) and \
+           self._groups[group].has_key(name):
+            raise ValueError('ZCTextIndex lexicon element "%s" '
+                             'already registered in group "%s"'
+                             % (name, group))
+
+        elements = self._groups.get(group)
+        if elements is None:
+            elements = self._groups[group] = {}
+        elements[name] = factory
+
+    def getFactoryGroups(self):
+        groups = self._groups.keys()
+        groups.sort()
+        return groups
+
+    def getFactoryNames(self, group):
+        names = self._groups[group].keys()
+        names.sort()
+        return names
+
+    def instantiate(self, group, name):
+        factory = self._groups[group][name]
+        if factory is not None:
+            return factory()
+
+element_factory = PipelineElementFactory()
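
Usage sketch for the factory registry; the group and element names here are
invented, and the element class is a stand-in for something providing
IPipelineElement:

    from zope.textindex.pipelinefactory import element_factory

    class UppercaseNormalizer:
        # hypothetical pipeline element; real elements provide
        # IPipelineElement.process(source) -> list of words
        def process(self, words):
            return [w.upper() for w in words]

    element_factory.registerFactory('Word Transforms', 'uppercase',
                                    UppercaseNormalizer)
    print element_factory.getFactoryGroups()       # includes 'Word Transforms'
    print element_factory.getFactoryNames('Word Transforms')   # ['uppercase']
    normalizer = element_factory.instantiate('Word Transforms', 'uppercase')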


=== Zope3/src/zope/textindex/queryparser.py 1.1 => 1.2 ===
--- /dev/null	Wed Dec 25 09:16:06 2002
+++ Zope3/src/zope/textindex/queryparser.py	Wed Dec 25 09:15:34 2002
@@ -0,0 +1,243 @@
+##############################################################################
+#
+# Copyright (c) 2002 Zope Corporation and Contributors.
+# All Rights Reserved.
+#
+# This software is subject to the provisions of the Zope Public License,
+# Version 2.0 (ZPL).  A copy of the ZPL should accompany this distribution.
+# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
+# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
+# FOR A PARTICULAR PURPOSE.
+#
+##############################################################################
+
+"""Query Parser.
+
+This particular parser recognizes the following syntax:
+
+Start = OrExpr
+OrExpr = AndExpr ('OR' AndExpr)*
+AndExpr = Term ('AND' NotExpr)*
+NotExpr = ['NOT'] Term
+Term = '(' OrExpr ')' | ATOM+
+
+The key words (AND, OR, NOT) are recognized in any mixture of case.
+
+An ATOM is one of the following:
+
++ A sequence of characters not containing whitespace or parentheses or
+  double quotes, and not equal (ignoring case) to one of the key words
+  'AND', 'OR', 'NOT'; or
+
++ A non-empty string enclosed in double quotes.  The interior of the
+  string can contain whitespace, parentheses and key words, but not
+  quotes.
+
++ A hyphen followed by one of the two forms above, meaning that it
+  must not be present.
+
+An unquoted ATOM may also contain globbing characters.  Globbing
+syntax is defined by the lexicon; for example "foo*" could mean any
+word starting with "foo".
+
+When multiple consecutive ATOMs are found at the leaf level, they are
+connected by an implied AND operator, and an unquoted leading hyphen
+is interpreted as a NOT operator.
+
+Summarizing the default operator rules:
+
+- a sequence of words without operators implies AND, e.g. ``foo bar''
+- double-quoted text implies phrase search, e.g. ``"foo bar"''
+- words connected by punctuation implies phrase search, e.g. ``foo-bar''
+- a leading hyphen implies NOT, e.g. ``foo -bar''
+- these can be combined, e.g. ``foo -"foo bar"'' or ``foo -foo-bar''
+- * and ? are used for globbing (i.e. prefix search), e.g. ``foo*''
+"""
+
+import re
+
+from zope.textindex.iqueryparser import IQueryParser
+from zope.textindex import parsetree
+
+# Create unique symbols for token types.
+_AND    = intern("AND")
+_OR     = intern("OR")
+_NOT    = intern("NOT")
+_LPAREN = intern("(")
+_RPAREN = intern(")")
+_ATOM   = intern("ATOM")
+_EOF    = intern("EOF")
+
+# Map keyword string to token type.
+_keywords = {
+    _AND:       _AND,
+    _OR:        _OR,
+    _NOT:       _NOT,
+    _LPAREN:    _LPAREN,
+    _RPAREN:    _RPAREN,
+}
+
+# Regular expression to tokenize.
+_tokenizer_regex = re.compile(r"""
+    # a paren
+    [()]
+    # or an optional hyphen
+|   -?
+    # followed by
+    (?:
+        # a string inside double quotes (and not containing these)
+        " [^"]* "
+        # or a non-empty stretch w/o whitespace, parens or double quotes
+    |    [^()\s"]+
+    )
+""", re.VERBOSE)
+
+class QueryParser:
+
+    __implements__ = IQueryParser
+
+    # This class is not thread-safe;
+    # each thread should have its own instance
+
+    def __init__(self, lexicon):
+        self._lexicon = lexicon
+        self._ignored = None
+
+    # Public API methods
+
+    def parseQuery(self, query):
+        # Lexical analysis.
+        tokens = _tokenizer_regex.findall(query)
+        self._tokens = tokens
+        # classify tokens
+        self._tokentypes = [_keywords.get(token.upper(), _ATOM)
+                            for token in tokens]
+        # add _EOF
+        self._tokens.append(_EOF)
+        self._tokentypes.append(_EOF)
+        self._index = 0
+
+        # Syntactical analysis.
+        self._ignored = [] # Ignored words in the query, for parseQueryEx
+        tree = self._parseOrExpr()
+        self._require(_EOF)
+        if tree is None:
+            raise parsetree.ParseError(
+                "Query contains only common words: %s" % repr(query))
+        return tree
+
+    def getIgnored(self):
+        return self._ignored
+
+    def parseQueryEx(self, query):
+        tree = self.parseQuery(query)
+        ignored = self.getIgnored()
+        return tree, ignored
+
+    # Recursive descent parser
+
+    def _require(self, tokentype):
+        if not self._check(tokentype):
+            t = self._tokens[self._index]
+            msg = "Token %r required, %r found" % (tokentype, t)
+            raise parsetree.ParseError, msg
+
+    def _check(self, tokentype):
+        if self._tokentypes[self._index] is tokentype:
+            self._index += 1
+            return 1
+        else:
+            return 0
+
+    def _peek(self, tokentype):
+        return self._tokentypes[self._index] is tokentype
+
+    def _get(self, tokentype):
+        t = self._tokens[self._index]
+        self._require(tokentype)
+        return t
+
+    def _parseOrExpr(self):
+        L = []
+        L.append(self._parseAndExpr())
+        while self._check(_OR):
+            L.append(self._parseAndExpr())
+        L = filter(None, L)
+        if not L:
+            return None # Only stopwords
+        elif len(L) == 1:
+            return L[0]
+        else:
+            return parsetree.OrNode(L)
+
+    def _parseAndExpr(self):
+        L = []
+        t = self._parseTerm()
+        if t is not None:
+            L.append(t)
+        Nots = []
+        while self._check(_AND):
+            t = self._parseNotExpr()
+            if t is None:
+                continue
+            if isinstance(t, parsetree.NotNode):
+                Nots.append(t)
+            else:
+                L.append(t)
+        if not L:
+            return None # Only stopwords
+        L.extend(Nots)
+        if len(L) == 1:
+            return L[0]
+        else:
+            return parsetree.AndNode(L)
+
+    def _parseNotExpr(self):
+        if self._check(_NOT):
+            t = self._parseTerm()
+            if t is None:
+                return None # Only stopwords
+            return parsetree.NotNode(t)
+        else:
+            return self._parseTerm()
+
+    def _parseTerm(self):
+        if self._check(_LPAREN):
+            tree = self._parseOrExpr()
+            self._require(_RPAREN)
+        else:
+            nodes = [self._parseAtom()]
+            while self._peek(_ATOM):
+                nodes.append(self._parseAtom())
+            nodes = filter(None, nodes)
+            if not nodes:
+                return None # Only stopwords
+            structure = [(isinstance(nodes[i], parsetree.NotNode), i, nodes[i])
+                         for i in range(len(nodes))]
+            structure.sort()
+            nodes = [node for (bit, index, node) in structure]
+            if isinstance(nodes[0], parsetree.NotNode):
+                raise parsetree.ParseError(
+                    "a term must have at least one positive word")
+            if len(nodes) == 1:
+                return nodes[0]
+            tree = parsetree.AndNode(nodes)
+        return tree
+
+    def _parseAtom(self):
+        term = self._get(_ATOM)
+        words = self._lexicon.parseTerms(term)
+        if not words:
+            self._ignored.append(term)
+            return None
+        if len(words) > 1:
+            tree = parsetree.PhraseNode(words)
+        elif self._lexicon.isGlob(words[0]):
+            tree = parsetree.GlobNode(words[0])
+        else:
+            tree = parsetree.AtomNode(words[0])
+        if term[0] == "-":
+            tree = parsetree.NotNode(tree)
+        return tree
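
Sketch of driving the parser directly.  QueryParser needs a lexicon providing
parseTerms() and isGlob(); the default pipeline from lexicon.py (also used by
TextIndexWrapper below) works:

    from zope.textindex.lexicon import \
         Lexicon, Splitter, CaseNormalizer, StopWordRemover
    from zope.textindex.queryparser import QueryParser

    parser = QueryParser(Lexicon(Splitter(), CaseNormalizer(),
                                 StopWordRemover()))
    tree = parser.parseQuery(u'zope AND NOT "page template"')
    print tree.nodeType()   # AND
    print tree.terms()      # positive terms only, e.g. [u'zope']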


=== Zope3/src/zope/textindex/ricecode.py 1.1 => 1.2 ===
--- /dev/null	Wed Dec 25 09:16:06 2002
+++ Zope3/src/zope/textindex/ricecode.py	Wed Dec 25 09:15:34 2002
@@ -0,0 +1,208 @@
+##############################################################################
+#
+# Copyright (c) 2002 Zope Corporation and Contributors.
+# All Rights Reserved.
+#
+# This software is subject to the provisions of the Zope Public License,
+# Version 2.0 (ZPL).  A copy of the ZPL should accompany this distribution.
+# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
+# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
+# FOR A PARTICULAR PURPOSE.
+#
+##############################################################################
+
+"""Rice coding (a variation of Golomb coding)
+
+Based on a Java implementation by Glen McCluskey described in a Usenix
+ ;login: article at
+http://www.usenix.org/publications/login/2000-4/features/java.html
+
+McCluskey's article explains the approach as follows.  The encoding
+for a value x is represented as a unary part and a binary part.  The
+unary part is a sequence of 1 bits followed by a 0 bit.  The binary
+part encodes some of the lower bits of x-1.
+
+The encoding is parameterized by a value m that describes how many
+bits to store in the binary part.  If most of the values are smaller
+than 2**m then they can be stored in only m+1 bits.
+
+Compute the length of the unary part, q, where
+   q = math.floor((x-1) / 2**m)
+
+   Emit q 1 bits followed by a 0 bit.
+
+Emit the lower m bits of x-1, treating x-1 as a binary value.
+"""
+
+import array
+
+class BitArray:
+
+    def __init__(self, buf=None):
+        self.bytes = array.array('B')
+        self.nbits = 0
+        self.bitsleft = 0
+        self.tostring = self.bytes.tostring
+
+    def __getitem__(self, i):
+        byte, offset = divmod(i, 8)
+        mask = 2 ** offset
+        if self.bytes[byte] & mask:
+            return 1
+        else:
+            return 0
+
+    def __setitem__(self, i, val):
+        byte, offset = divmod(i, 8)
+        mask = 2 ** offset
+        if val:
+            self.bytes[byte] |= mask
+        else:
+            self.bytes[byte] &= ~mask
+
+    def __len__(self):
+        return self.nbits
+
+    def append(self, bit):
+        """Append a 1 if bit is true or 1 if it is false."""
+        if self.bitsleft == 0:
+            self.bytes.append(0)
+            self.bitsleft = 8
+        self.__setitem__(self.nbits, bit)
+        self.nbits += 1
+        self.bitsleft -= 1
+
+    def __getstate__(self):
+        return self.nbits, self.bitsleft, self.tostring()
+
+    def __setstate__(self, (nbits, bitsleft, s)):
+        self.bytes = array.array('B', s)
+        self.nbits = nbits
+        self.bitsleft = bitsleft
+
+class RiceCode:
+    def __init__(self, m):
+        """Constructor a RiceCode for m-bit values."""
+        if not (0 <= m <= 16):
+            raise ValueError, "m must be between 0 and 16"
+        self.init(m)
+        self.bits = BitArray()
+        self.len = 0
+
+    def init(self, m):
+        self.m = m
+        self.lower = (1 << m) - 1
+        self.mask = 1 << (m - 1)
+
+    def append(self, val):
+        """Append an item to the list."""
+        if val < 1:
+            raise ValueError, "value >= 1 expected, got %s" % `val`
+        val -= 1
+        # emit the unary part of the code
+        q = val >> self.m
+        for i in range(q):
+            self.bits.append(1)
+        self.bits.append(0)
+        # emit the binary part
+        r = val & self.lower
+        mask = self.mask
+        while mask:
+            self.bits.append(r & mask)
+            mask >>= 1
+        self.len += 1
+
+    def __len__(self):
+        return self.len
+
+    def tolist(self):
+        """Return the items as a list."""
+        l = []
+        i = 0 # bit offset
+        binary_range = range(self.m)
+        for j in range(self.len):
+            unary = 0
+            while self.bits[i] == 1:
+                unary += 1
+                i += 1
+            assert self.bits[i] == 0
+            i += 1
+            binary = 0
+            for k in binary_range:
+                binary = (binary << 1) | self.bits[i]
+                i += 1
+            l.append((unary << self.m) + (binary + 1))
+        return l
+
+    def tostring(self):
+        """Return a binary string containing the encoded data.
+
+        The binary string may contain some extra zeros at the end.
+        """
+        return self.bits.tostring()
+
+    def __getstate__(self):
+        return self.m, self.bits
+
+    def __setstate__(self, (m, bits)):
+        self.init(m)
+        self.bits = bits
+
+def encode(m, l):
+    c = RiceCode(m)
+    for elt in l:
+        c.append(elt)
+    assert c.tolist() == l
+    return c
+
+def encode_deltas(l):
+    if len(l) == 1:
+        return l[0], []
+    deltas = RiceCode(6)
+    deltas.append(l[1] - l[0])
+    for i in range(2, len(l)):
+        deltas.append(l[i] - l[i - 1])
+    return l[0], deltas
+
+def decode_deltas(start, enc_deltas):
+    deltas = enc_deltas.tolist()
+    l = [start]
+    for d in deltas:
+        # each decoded value is the previous value plus the next gap
+        l.append(l[-1] + d)
+    return l
+
+def test():
+    import random
+    for size in [10, 20, 50, 100, 200]:
+        l = [random.randint(1, size) for i in range(50)]
+        c = encode(random.randint(1, 16), l)
+        assert c.tolist() == l
+    for size in [10, 20, 50, 100, 200]:
+        l = range(random.randint(1, size), size + random.randint(1, size))
+        t = encode_deltas(l)
+        l2 = decode_deltas(*t)
+        assert l == l2
+        if l != l2:
+            print l
+            print l2
+
+def pickle_efficiency():
+    import pickle
+    import random
+    for m in [4, 8, 12]:
+        for size in [10, 20, 50, 100, 200, 500, 1000, 2000, 5000]:
+            for elt_range in [10, 20, 50, 100, 200, 500, 1000]:
+                l = [random.randint(1, elt_range) for i in range(size)]
+                raw = pickle.dumps(l, 1)
+                enc = pickle.dumps(encode(m, l), 1)
+                print "m=%2d size=%4d range=%4d" % (m, size, elt_range),
+                print "%5d %5d" % (len(raw), len(enc)),
+                if len(raw) > len(enc):
+                    print "win"
+                else:
+                    print "lose"
+
+if __name__ == "__main__":
+    test()
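
Quick usage check for the encoder; with m=6, every value from 1 through 64
costs exactly 7 bits (a 0 stop bit plus 6 binary bits), so the four values
below fit in 4 bytes:

    from zope.textindex.ricecode import encode

    c = encode(6, [3, 1, 9, 17])
    print c.tolist()            # [3, 1, 9, 17]
    print len(c.tostring())     # 4 (28 bits, rounded up to whole bytes)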


=== Zope3/src/zope/textindex/setops.py 1.1 => 1.2 ===
--- /dev/null	Wed Dec 25 09:16:06 2002
+++ Zope3/src/zope/textindex/setops.py	Wed Dec 25 09:15:34 2002
@@ -0,0 +1,63 @@
+##############################################################################
+#
+# Copyright (c) 2002 Zope Corporation and Contributors.
+# All Rights Reserved.
+#
+# This software is subject to the provisions of the Zope Public License,
+# Version 2.0 (ZPL).  A copy of the ZPL should accompany this distribution.
+# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
+# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
+# FOR A PARTICULAR PURPOSE
+#
+##############################################################################
+
+"""SetOps -- Weighted intersections and unions applied to many inputs."""
+
+from zodb.btrees.IIBTree import \
+     IIBucket, weightedIntersection, weightedUnion
+
+from zope.textindex.nbest import NBest
+
+def mass_weightedIntersection(L):
+    "A list of (mapping, weight) pairs -> their weightedIntersection IIBucket."
+    L = [(x, wx) for (x, wx) in L if x is not None]
+    if len(L) < 2:
+        return _trivial(L)
+    # Intersect with smallest first.  We expect the input maps to be
+    # IIBuckets, so it doesn't hurt to get their lengths repeatedly
+    # (len(Bucket) is fast; len(BTree) is slow).
+    L.sort(lambda x, y: cmp(len(x[0]), len(y[0])))
+    (x, wx), (y, wy) = L[:2]
+    dummy, result = weightedIntersection(x, y, wx, wy)
+    for x, wx in L[2:]:
+        dummy, result = weightedIntersection(result, x, 1, wx)
+    return result
+
+def mass_weightedUnion(L):
+    "A list of (mapping, weight) pairs -> their weightedUnion IIBucket."
+    if len(L) < 2:
+        return _trivial(L)
+    # Balance unions as closely as possible, smallest to largest.
+    merge = NBest(len(L))
+    for x, weight in L:
+        merge.add((x, weight), len(x))
+    while len(merge) > 1:
+        # Merge the two smallest so far, and add back to the queue.
+        (x, wx), dummy = merge.pop_smallest()
+        (y, wy), dummy = merge.pop_smallest()
+        dummy, z = weightedUnion(x, y, wx, wy)
+        merge.add((z, 1), len(z))
+    (result, weight), dummy = merge.pop_smallest()
+    return result
+
+def _trivial(L):
+    # L is empty or has only one (mapping, weight) pair.  If there is a
+    # pair, we may still need to multiply the mapping by its weight.
+    assert len(L) <= 1
+    if len(L) == 0:
+        return IIBucket()
+    [(result, weight)] = L
+    if weight != 1:
+        dummy, result = weightedUnion(IIBucket(), result, 0, weight)
+    return result
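
Usage sketch (assumes the zodb.btrees package from this tree is importable);
the weights multiply each mapping's scores before combining:

    from zodb.btrees.IIBTree import IIBucket
    from zope.textindex.setops import \
         mass_weightedUnion, mass_weightedIntersection

    a = IIBucket(); a[1] = 10; a[2] = 20
    b = IIBucket(); b[2] = 5;  b[3] = 7

    print list(mass_weightedUnion([(a, 1), (b, 2)]).items())
    # [(1, 10), (2, 30), (3, 14)]
    print list(mass_weightedIntersection([(a, 1), (b, 1)]).items())
    # [(2, 25)]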


=== Zope3/src/zope/textindex/stopdict.py 1.1 => 1.2 ===
--- /dev/null	Wed Dec 25 09:16:06 2002
+++ Zope3/src/zope/textindex/stopdict.py	Wed Dec 25 09:15:34 2002
@@ -0,0 +1,36 @@
+##############################################################################
+#
+# Copyright (c) 2002 Zope Corporation and Contributors.
+# All Rights Reserved.
+#
+# This software is subject to the provisions of the Zope Public License,
+# Version 2.0 (ZPL).  A copy of the ZPL should accompany this distribution.
+# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
+# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
+# FOR A PARTICULAR PURPOSE.
+#
+##############################################################################
+
+"""Provide a default list of stop words for the index.
+
+The specific splitter and lexicon are customizable, but the default
+ZCTextIndex should do something useful.
+"""
+
+def get_stopdict():
+    """Return a dictionary of stopwords."""
+    return _dict
+
+# This list of English stopwords comes from Lucene
+_words = [
+    "a", "and", "are", "as", "at", "be", "but", "by",
+    "for", "if", "in", "into", "is", "it",
+    "no", "not", "of", "on", "or", "such",
+    "that", "the", "their", "then", "there", "these",
+    "they", "this", "to", "was", "will", "with"
+]
+
+_dict = {}
+for w in _words:
+    _dict[w] = None


=== Zope3/src/zope/textindex/textindexinterfaces.py 1.1 => 1.2 ===
--- /dev/null	Wed Dec 25 09:16:06 2002
+++ Zope3/src/zope/textindex/textindexinterfaces.py	Wed Dec 25 09:15:34 2002
@@ -0,0 +1,68 @@
+##############################################################################
+#
+# Copyright (c) 2002 Zope Corporation and Contributors.
+# All Rights Reserved.
+#
+# This software is subject to the provisions of the Zope Public License,
+# Version 2.0 (ZPL).  A copy of the ZPL should accompany this distribution.
+# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
+# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
+# FOR A PARTICULAR PURPOSE.
+#
+##############################################################################
+"""Interfaces for a text index.
+
+$Id$
+"""
+
+from zope.interface import Interface
+
+
+class IInjection(Interface):
+    """Interface for injecting documents into an index."""
+
+    def index_doc(docid, texts):
+        """Add a document to the index.
+
+        docid: int, identifying the document
+        texts: list of unicode, the text to be indexed in a list
+        return: None
+
+        This can also be used to reindex documents.
+        """
+
+    def unindex_doc(docid):
+        """Remove a document from the index.
+
+        docid: int, identifying the document
+        return: None
+
+        If docid does not exist, KeyError is raised.
+        """
+
+class IQuerying(Interface):
+
+    def query(querytext, start=0, count=None):
+        """Execute a query.
+
+        querytext: unicode, the query expression
+        start: the first result to return (0-based)
+        count: the maximum number of results to return (default: all)
+        return: ([(docid, rank), ...], total)
+
+        The return value is a pair:
+            matches: list of (int, float) tuples, docid and rank
+            total: int, the total number of matches
+
+        The matches list represents the requested batch.  The ranks
+        are floats between 0 and 1 (inclusive).
+        """
+
+class IStatistics(Interface):
+
+    def documentCount():
+        """Return the number of documents currently indexed."""
+
+    def wordCount():
+        """Return the number of words currently indexed."""


=== Zope3/src/zope/textindex/textindexwrapper.py 1.1 => 1.2 ===
--- /dev/null	Wed Dec 25 09:16:06 2002
+++ Zope3/src/zope/textindex/textindexwrapper.py	Wed Dec 25 09:15:34 2002
@@ -0,0 +1,88 @@
+##############################################################################
+#
+# Copyright (c) 2002 Zope Corporation and Contributors.
+# All Rights Reserved.
+#
+# This software is subject to the provisions of the Zope Public License,
+# Version 2.0 (ZPL).  A copy of the ZPL should accompany this distribution.
+# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
+# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
+# FOR A PARTICULAR PURPOSE.
+#
+##############################################################################
+"""Text index wrapper.
+
+This exists to implement IInjection and IQuerying.
+
+$Id$
+"""
+
+from persistence import Persistent
+
+from zope.textindex.okapiindex import OkapiIndex
+from zope.textindex.lexicon import Lexicon
+from zope.textindex.lexicon import Splitter, CaseNormalizer, StopWordRemover
+from zope.textindex.queryparser import QueryParser
+from zope.textindex.nbest import NBest
+
+from zope.textindex.textindexinterfaces import \
+     IInjection, IQuerying, IStatistics
+
+class TextIndexWrapper(Persistent):
+
+    __implements__ = (IInjection, IQuerying, IStatistics)
+
+    def __init__(self, lexicon=None, index=None):
+        """Provisional constructor.
+
+        This creates the lexicon and index if not passed in."""
+        if lexicon is None:
+            lexicon = Lexicon(Splitter(), CaseNormalizer(), StopWordRemover())
+        if index is None:
+            index = OkapiIndex(lexicon)
+        self.lexicon = lexicon
+        self.index = index
+
+    # Methods implementing IInjection
+
+    def index_doc(self, docid, text):
+        self.index.index_doc(docid, text)
+        self._p_changed = 1 # XXX why is this needed?
+
+    def unindex_doc(self, docid):
+        self.index.unindex_doc(docid)
+        self._p_changed = 1 # XXX why is this needed?
+
+    # Methods implementing IQuerying
+
+    def query(self, querytext, start=0, count=None):
+        parser = QueryParser(self.lexicon)
+        tree = parser.parseQuery(querytext)
+        results = tree.executeQuery(self.index)
+        if not results:
+            return [], 0
+        if count is None:
+            count = max(0, len(results) - start)
+        chooser = NBest(start + count)
+        chooser.addmany(results.items())
+        batch = chooser.getbest()
+        batch = batch[start:]
+        if batch:
+            qw = self.index.query_weight(tree.terms())
+            # Hack to avoid ZeroDivisionError
+            if qw == 0:
+                qw = batch[0][1] or 1
+            qw *= 1.0
+            batch = [(docid, score/qw) for docid, score in batch]
+        return batch, len(results)
+
+    # Methods implementing IStatistics
+
+    def documentCount(self):
+        """Return the number of documents in the index."""
+        return self.index.documentCount()
+
+    def wordCount(self):
+        """Return the number of words in the index."""
+        return self.index.wordCount()
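
End-to-end sketch with the default lexicon and Okapi index (docids and text are
invented; per IInjection, the text is passed as a list of unicode strings):

    from zope.textindex.textindexwrapper import TextIndexWrapper

    index = TextIndexWrapper()
    index.index_doc(1, [u"the quick brown fox jumps over the lazy dog"])
    index.index_doc(2, [u"pack my box with five dozen liquor jugs"])

    matches, total = index.query(u"quick fox", start=0, count=5)
    print total                 # number of matching documents
    for docid, rank in matches:
        print docid, rank       # scores divided by query_weight, roughly 0..1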


=== Zope3/src/zope/textindex/widcode.py 1.1 => 1.2 ===
--- /dev/null	Wed Dec 25 09:16:07 2002
+++ Zope3/src/zope/textindex/widcode.py	Wed Dec 25 09:15:34 2002
@@ -0,0 +1,131 @@
+##############################################################################
+#
+# Copyright (c) 2002 Zope Corporation and Contributors.
+# All Rights Reserved.
+#
+# This software is subject to the provisions of the Zope Public License,
+# Version 2.0 (ZPL).  A copy of the ZPL should accompany this distribution.
+# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
+# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
+# FOR A PARTICULAR PURPOSE.
+#
+##############################################################################
+
+# A byte-aligned encoding for lists of non-negative ints, using fewer bytes
+# for smaller ints.  This is intended for lists of word ids (wids).  The
+# ordinary string .find() method can be used to find the encoded form of a
+# desired wid-string in an encoded wid-string.  As in UTF-8, the initial byte
+# of an encoding can't appear in the interior of an encoding, so find() can't
+# be fooled into starting a match "in the middle" of an encoding. Unlike
+# UTF-8, the initial byte does not tell you how many continuation bytes
+# follow; and there's no ASCII superset property.
+
+# Details:
+#
+# + Only the first byte of an encoding has the sign bit set.
+#
+# + The first byte has 7 bits of data.
+#
+# + Bytes beyond the first in an encoding have the sign bit clear, followed
+#   by 7 bits of data.
+#
+# + The first byte doesn't tell you how many continuation bytes are
+#   following.  You can tell by searching for the next byte with the
+#   high bit set (or the end of the string).
+#
+# The int to be encoded can contain no more than 28 bits.
+#
+# If it contains no more than 7 bits, 0abcdefg, the encoding is
+#     1abcdefg
+#
+# If it contains 8 thru 14 bits,
+#     00abcdef ghijkLmn
+# the encoding is
+#     1abcdefg 0hijkLmn
+#
+# Static tables _encoding and _decoding capture all encodes and decodes for
+# 14 or fewer bits.
+#
+# If it contains 15 thru 21 bits,
+#    000abcde fghijkLm nopqrstu
+# the encoding is
+#    1abcdefg 0hijkLmn 0opqrstu
+#
+# If it contains 22 thru 28 bits,
+#    0000abcd efghijkL mnopqrst uvwxyzAB
+# the encoding is
+#    1abcdefg 0hijkLmn 0opqrstu 0vwxyzAB
+
+assert 0x80**2 == 0x4000
+assert 0x80**4 == 0x10000000
+
+import re
+
+def encode(wids):
+    # Encode a list of wids as a string.
+    wid2enc = _encoding
+    n = len(wid2enc)
+    return "".join([w < n and wid2enc[w] or _encode(w) for w in wids])
+
+_encoding = [None] * 0x4000 # Filled later, and converted to a tuple
+
+def _encode(w):
+    assert 0x4000 <= w < 0x10000000
+    b, c = divmod(w, 0x80)
+    a, b = divmod(b, 0x80)
+    s = chr(b) + chr(c)
+    if a < 0x80:    # no more than 21 data bits
+        return chr(a + 0x80) + s
+    a, b = divmod(a, 0x80)
+    assert a < 0x80, (w, a, b, s)  # else more than 28 data bits
+    return (chr(a + 0x80) + chr(b)) + s
+
+_prog = re.compile(r"[\x80-\xFF][\x00-\x7F]*")
+
+def decode(code):
+    # Decode a string into a list of wids.
+    get = _decoding.get
+    # Obscure:  while _decoding does have the key '\x80', its value is 0,
+    # so the "or" here calls _decode('\x80') anyway.
+    return [get(p) or _decode(p) for p in _prog.findall(code)]
+
+_decoding = {} # Filled later
+
+def _decode(s):
+    if s == '\x80':
+        # See comment in decode().  This is here to allow a trick to work.
+        return 0
+    if len(s) == 3:
+        a, b, c = map(ord, s)
+        assert a & 0x80 == 0x80 and not b & 0x80 and not c & 0x80
+        return ((a & 0x7F) << 14) | (b << 7) | c
+    assert len(s) == 4, `s`
+    a, b, c, d = map(ord, s)
+    assert a & 0x80 == 0x80 and not b & 0x80 and not c & 0x80 and not d & 0x80
+    return ((a & 0x7F) << 21) | (b << 14) | (c << 7) | d
+
+def _fill():
+    global _encoding
+    for i in range(0x80):
+        s = chr(i + 0x80)
+        _encoding[i] = s
+        _decoding[s] = i
+    for i in range(0x80, 0x4000):
+        hi, lo = divmod(i, 0x80)
+        s = chr(hi + 0x80) + chr(lo)
+        _encoding[i] = s
+        _decoding[s] = i
+    _encoding = tuple(_encoding)
+
+_fill()
+
+def test():
+    for i in range(2**20):
+        if i % 1000 == 0: print i
+        wids = [i]
+        code = encode(wids)
+        assert decode(code) == wids, (wids, code, decode(code))
+
+if __name__ == "__main__":
+    test()
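
A by-hand check of the layout described at the top of widcode.py: 300 needs 9
data bits, so it encodes as two bytes, the first with the high bit set.

    from zope.textindex import widcode

    code = widcode.encode([0, 5, 300, 20000])
    print repr(code)            # 300 -> '\x82,' (0x82 = 0x80 | (300 >> 7),
                                #                 0x2c = 300 & 0x7f)
    print widcode.decode(code)  # [0, 5, 300, 20000]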