[Zope-CVS] CVS: Products/ZCTextIndex - Index.py:1.1.2.31

Guido van Rossum <guido@python.org>
Mon, 6 May 2002 09:13:43 -0400


Update of /cvs-repository/Products/ZCTextIndex
In directory cvs.zope.org:/tmp/cvs-serv15806

Modified Files:
      Tag: TextIndexDS9-branch
	Index.py 
Log Message:
This is the new Index.py.  It stores scaled_int(w(d, t) / W(d)) in
_wordinfo[t] (keyed by docid), so that the inner per-document loop in
search() can be replaced by a single constant scale factor per word,
applied by weightedUnion.
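
Concretely: at index time the division by W(d) is folded into the
stored value, and at search time each word contributes its whole
docid->weight map in one weightedUnion() call, scaled by
scaled_int(idf).  Here is a rough sketch of what that computes, using
plain dicts instead of IIBuckets/IIBTrees and an assumed SCALE_FACTOR
of 1024.0 (the helper names are illustrative, not the exact code in
the diff below):

    import math

    SCALE_FACTOR = 1024.0  # assumed; see scaled_int() in Index.py

    def scaled_int(f, scale=SCALE_FACTOR):
        return int(f * scale + 0.5)

    # Index time: for each word t and document d we now store
    #     _wordinfo[t][d] = scaled_int(w(d, t) / W(d))
    # so the per-document normalization is already baked in.

    def search_sketch(wordinfo, docweight, wids):
        N = float(len(docweight))
        result = {}
        for wid in wids:
            d2w = wordinfo[wid]                 # docid -> scaled w(d,t)/W(d)
            idf = math.log(1.0 + N / len(d2w))  # w(q, t), unscaled
            scale = scaled_int(idf)
            # weightedUnion(result, d2w, 1, scale) performs this merge
            # in C; spelled out here in plain Python:
            for docid, w in d2w.items():
                result[docid] = result.get(docid, 0) + w * scale
        return result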

With a small change to _get_wdt() and one change in ZCTest (because
the results are now scaled differently), this now passes the test
suite, so it can't be all bad. :-)

On a corpus of 73420 email messages, a search for an uncommon term is
now immeasurably fast (less than 10 msec).  A search for a very common
term (39379 occurrences) takes 250 msec, an even more common term
(52286 hits) takes 320 msec, and combining those two takes roughly the
sum of those times.

Tim brought up some good issues with the new approach, so it needs
more tweaking, especially to avoid losing too much precision: the
values stored in _wordinfo are small, which is good because it makes
for small pickles, but they are often very small, which leaves very
little precision to work with (and for collections with large
documents this will be even worse).  But first I'd like to refactor
how the query engine and the query parser connect to the index.
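
To make the precision worry concrete, here is a rough illustration
(again assuming SCALE_FACTOR is 1024.0; a different value only moves
the point at which the resolution runs out):

    import math

    SCALE_FACTOR = 1024.0  # assumed

    def scaled_int(f, scale=SCALE_FACTOR):
        return int(f * scale + 0.5)

    def doc_term_weight(count):
        return 1.0 + math.log(count)

    # Pretend every term in the document occurs once, so W(d) is just
    # sqrt(number of distinct terms).  The bigger the document, the
    # fewer distinct integers the stored scaled_int(w(d, t) / W(d))
    # values can take:
    for nterms in (100, 10000, 1000000):
        W = math.sqrt(nterms)
        w1 = scaled_int(doc_term_weight(1) / W)  # term occurring once
        w5 = scaled_int(doc_term_weight(5) / W)  # term occurring 5 times
        print(nterms, w1, w5)
    # values: 100 -> 102, 267;  10000 -> 10, 27;  1000000 -> 1, 3
    # (for a million-term document there is almost nothing left to
    # rank on)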



=== Products/ZCTextIndex/Index.py 1.1.2.30 => 1.1.2.31 ===
 #
 ##############################################################################
-"""Text Index.
 
-Plugin text index for ZCatalog, with relevance ranking.
-"""
+"""Full text index with relevance ranking."""
 
 import math
 
 from BTrees.IOBTree import IOBTree
-from BTrees.IIBTree import IIBTree, IIBucket, IISet
+from BTrees.IIBTree import IIBTree, IIBucket, IISet, weightedUnion
 
 from Products.ZCTextIndex.IIndex import IIndex
 
@@ -40,6 +38,7 @@
     return int(f * scale + 0.5)
 
 class Index:
+
     __implements__ = IIndex
 
     def __init__(self, lexicon):
@@ -55,6 +54,9 @@
         # used for un-indexing
         self._docwords = IOBTree()
 
+    def length(self):
+        return len(self._docweight)
+
     # Most of the computation for computing a relevance score for the
     # document occurs in the search() method.  The code currently
     # implements the cosine similarity function described in Managing
@@ -73,9 +75,7 @@
     #
     #    w(d, t) = 1 + log f(d, t)
     #        computed by doc_term_weight(); for a given word t,
-    #        self._wordinfo[t] is a map from d to w(d, t).  To save space
-    #        in binary pickles, we actually store f(d, t) if that's < 256.
-    #        In that case, WDT[f(d, t)] retrieves the correct w(d, t).
+    #        self._wordinfo[t] is a map from d to w(d, t).
     #
     #    w(q, t) = log(1 + N/f(t))
     #        computed by query_term_weight()
@@ -104,17 +104,21 @@
 
     def search(self, term):
         wids = self._lexicon.termToWordIds(term)
-        result = IIBucket()
+        if not wids:
+            return IIBucket()
         N = float(len(self._docweight))
+        L = []
         for wid in wids:
             d2w = self._wordinfo[wid] # maps docid to w(docid, wid)
             idf = query_term_weight(len(d2w), N)  # this is an unscaled float
-            for docid, tf in d2w.items():
-                if tf < 256:
-                    tf = WDT[tf]
-                # scaled int * unscaled float / scaled int -> unscaled float
-                w = tf * idf / self._docweight[docid]
-                result[docid] = result.get(docid, 0) + scaled_int(w)
+            #print "idf = %.3f" % idf
+            if isinstance(d2w, type({})):
+                d2w = IIBucket(d2w)
+            L.append((d2w, scaled_int(idf)))
+        L.sort(lambda x, y: cmp(len(x[0]), len(y[0])))
+        result = IIBTree()
+        for d2w, weight in L:
+            dummy, result = weightedUnion(result, d2w, 1, weight)
         return result
 
     def query_weight(self, terms):
@@ -122,10 +126,10 @@
         for term in terms:
             wids += self._lexicon.termToWordIds(term)
         N = float(len(self._docweight))
-        sum = 0.
+        sum = 0.0
         for wid in wids:
-            wt = math.log(1. + N / len(self._wordinfo[wid]))
-            sum += wt ** 2.
+            wt = math.log(1.0 + N / len(self._wordinfo[wid]))
+            sum += wt ** 2.0
         return scaled_int(math.sqrt(sum))
 
     def _get_frequencies(self, wids):
@@ -138,17 +142,20 @@
         d = {}
         for wid in wids:
             d[wid] = d.get(wid, 0) + 1
-        Wsquares = 0.
+        Wsquares = 0.0
         weights = []
         push = weights.append
         for count in d.values():
             w = doc_term_weight(count)
             Wsquares += w * w
-            if count < 256:
-                push(count)         # 1 byte in a binary pickle
-            else:
-                push(scaled_int(w)) # 2 bytes in a binary pickle, and rare
-        return d.keys(), weights, scaled_int(math.sqrt(Wsquares))
+            push(w)
+        W = math.sqrt(Wsquares)
+        #print "W = %.3f" % W
+        for i in xrange(len(weights)):
+            #print i, ":", "%.3f" % weights[i],
+            weights[i] = scaled_int(weights[i] / W)
+            #print "->", weights[i]
+        return d.keys(), weights, scaled_int(W)
 
     DICT_CUTOFF = 10
 
@@ -177,34 +184,26 @@
         try:
             map = self._wordinfo[wid]
         except KeyError:
-            map = {}
-        # _add_wordinfo() is called for each update.  If the map size
-        # exceeds the DICT_CUTOFF, convert to an IIBTree.
-        if len(map) == self.DICT_CUTOFF:
-            map = IIBTree(map)
+            self._wordinfo[wid] = map = {}
+        else:
+            # _add_wordinfo() is called for each update.  If the map
+            # size exceeds the DICT_CUTOFF, convert to an IIBTree.
+            if len(map) == self.DICT_CUTOFF:
+                self._wordinfo[wid] = map = IIBTree(map)
         map[docid] = f
-        self._wordinfo[wid] = map
 
     def _del_wordinfo(self, wid, docid):
         map = self._wordinfo[wid]
-        if len(map) == 1:
-            assert map.has_key(docid)
-            del self._wordinfo[wid]
-            return
         del map[docid]
-        self._wordinfo[wid] = map
+        if len(map) == 0:
+            del self._wordinfo[wid]
+
+    # The rest are helper methods to support unit tests
 
-    # some helper method included to support unit tests
     def _get_wdt(self, d, t):
         wid, = self._lexicon.termToWordIds(t)
         map = self._wordinfo[wid]
-        w = map.get(d, 0)
-        if w == 0:
-            return 0
-        elif w < 256:
-            return WDT[w]
-        else:
-            return w
+        return map.get(d, 0) * self._docweight[d] / SCALE_FACTOR
 
     def _get_Wd(self, d):
         return self._docweight[d]
@@ -221,7 +220,7 @@
 def doc_term_weight(count):
     """Return the doc-term weight for a term that appears count times."""
     # implements w(d, t) = 1 + log f(d, t)
-    return 1. + math.log(count)
+    return 1.0 + math.log(count)
 
 def query_term_weight(term_count, num_items):
     """Return the query-term weight for a term,
@@ -230,17 +229,4 @@
     total items.
     """
     # implements w(q, t) = log(1 + N/f(t))
-    return math.log(1. + float(num_items) / term_count)
-
-# For 0 < i < 256, WDT[i] is the integer scaled value of doc_term_weight(i).
-# WDT[0] is None, because doc_term_weight(0) should never happen, and
-# returning None then should quickly lead to an exception.
-WDT = [scaled_int(doc_term_weight(i)) for i in range(1, 256)]
-del i
-WDT.insert(0, None)
-
-# Upon reading up a "w(d, t)" value, we need to know whether it's a true
-# scaled w(d,t), or a raw count that must be used to index WDT[] to get
-# the true scaled w(d,t).  The key is that for raw counts >= 256, their
-# scaled w(d,t) values are greater than 255.
-assert scaled_int(doc_term_weight(256)) > 255
+    return math.log(1.0 + float(num_items) / term_count)