[Zope-CVS] CVS: Products/ZCTextIndex - BaseIndex.py:1.4 CosineIndex.py:1.8 OkapiIndex.py:1.15

Tim Peters tim.one@comcast.net
Fri, 17 May 2002 01:36:09 -0400


Update of /cvs-repository/Products/ZCTextIndex
In directory cvs.zope.org:/tmp/cvs-serv686

Modified Files:
	BaseIndex.py CosineIndex.py OkapiIndex.py 
Log Message:
Refactor/combine _docweight/_doclen.


=== Products/ZCTextIndex/BaseIndex.py 1.3 => 1.4 ===
         # wid -> {docid -> weight}; t -> D -> w(D, t)
         # Different indexers have different notions of term weight, but we
-        # expect all indexers to use ._wordinfo to map wids to its notion
+        # expect each indexer to use ._wordinfo to map wids to its notion
         # of a docid-to-weight map.
         # There are two kinds of OOV words:  wid 0 is explicitly OOV,
         # and it's possible that the lexicon will return a non-zero wid
@@ -63,6 +63,12 @@
         # this index if and only if _wordinfo.has_key(wid).  Note that
         # wid 0 must not be a key in _wordinfo.
         self._wordinfo = IOBTree()
+
+        # docid -> weight
+        # Different indexers have different notions of doc weight, but we
+        # expect each indexer to use ._docweight to map docids to its
+        # notion of what a doc weight is.
+        self._docweight = IIBTree()
 
         # docid -> WidCode'd list of wids
         # Used for un-indexing, and for phrase search.


=== Products/ZCTextIndex/CosineIndex.py 1.7 => 1.8 ===
         # t -> D -> w(d, t)/W(d)
 
+        # ._docweight for Cosine is
         # docid -> W(docid)
-        self._docweight = IIBTree()
 
     # Most of the computation for computing a relevance score for the
     # document occurs in the search() method.  The code currently


=== Products/ZCTextIndex/OkapiIndex.py 1.14 => 1.15 ===
         # wid -> {docid -> frequency}; t -> D -> f(D, t)
 
+        # ._docweight for Okapi is
         # docid -> # of words in the doc
         # This is just len(self._docwords[docid]), but _docwords is stored
         # in compressed form, so uncompressing it just to count the list
         # length would be ridiculously expensive.
-        self._doclen = IIBTree()
 
-        # sum(self._doclen.values()), the total # of words in all docs
+        # sum(self._docweight.values()), the total # of words in all docs
         # This is a long for "better safe than sorry" reasons.  It isn't
         # used often enough that speed should matter.
         self._totaldoclen = 0L
 
     def index_doc(self, docid, text):
         wids = self._lexicon.sourceToWordIds(text)
-        self._doclen[docid] = len(wids)
+        self._docweight[docid] = len(wids)
         self._totaldoclen += len(wids)
 
         wid2count = self._get_frequencies(wids)
@@ -92,8 +92,8 @@
 
         del self._docwords[docid]
 
-        count = self._doclen[docid]
-        del self._doclen[docid]
+        count = self._docweight[docid]
+        del self._docweight[docid]
         self._totaldoclen -= count
 
     # The workhorse.  Return a list of (IIBucket, weight) pairs, one pair
@@ -105,7 +105,7 @@
     def _search_wids(self, wids):
         if not wids:
             return []
-        N = float(len(self._doclen))  # total # of docs
+        N = float(len(self._docweight))  # total # of docs
         meandoclen = self._totaldoclen / N
         K1 = self.K1
         B = self.B
@@ -117,7 +117,7 @@
         #               f(D, t) + k1 * ((1-b) + b*len(D)/E(len(D)))
 
         L = []
-        docid2len = self._doclen
+        docid2len = self._docweight
         for t in wids:
             assert self._wordinfo.has_key(t)  # caller responsible for OOV
             d2f = self._wordinfo[t] # map {docid -> f(docid, t)}