[Zope-CVS] CVS: Products/ZCTextIndex - WidCode.py:1.1.2.1 IQueryParser.py:1.1.2.5 Index.py:1.1.2.38 ParseTree.py:1.1.2.3 QueryParser.py:1.1.2.14

Guido van Rossum guido@python.org
Fri, 10 May 2002 21:00:36 -0400


Update of /cvs-repository/Products/ZCTextIndex
In directory cvs.zope.org:/tmp/cvs-serv16638

Modified Files:
      Tag: TextIndexDS9-branch
	IQueryParser.py Index.py ParseTree.py QueryParser.py 
Added Files:
      Tag: TextIndexDS9-branch
	WidCode.py 
Log Message:
Add primitive phrase search.

This uses a "compression" scheme for lists of ints that I dubbed
"WidCode", and which uses an encoding somewhat similar to UTF-8.  It
only saves about 20 percent in pickle size over the (binary) pickle of
the list, but has the special property that you can use the string's
find() method to verify if a phrase occurs in a document.  Because
docwords now records *all* wids, in document order, rather than only a
list of unique wids, the WidCode-encoded list is probably *longer*
than what was stored in docwords before, but i hope it's not that much
longer.  The performance of WidCode could be improved by doing a
pre-scan over part of the corpus and assigning wids by frequency of
occurrence (the most frequent word gets wid 1, and so on).
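
For illustration, here is a tiny usage sketch (not part of this
checkin; the wid values are made up) of the property described above:

    from Products.ZCTextIndex import WidCode

    doc_wids    = [12, 9, 503, 7, 12000, 3]  # all wids, in document order
    phrase_wids = [503, 7]                   # wids of a two-word phrase

    doc_code    = WidCode.encode(doc_wids)
    phrase_code = WidCode.encode(phrase_wids)

    assert WidCode.decode(doc_code) == doc_wids         # round-trips
    assert doc_code.find(phrase_code) >= 0              # phrase occurs in the doc
    assert doc_code.find(WidCode.encode([7, 503])) < 0  # wrong order, no match

Because lead bytes always have the high bit set and continuation bytes
never do, a find() match always lines up on code boundaries, so a hit
really means the wid sequence occurs in the document.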

Also still to do: change the query parser to recognize "words in
quotes" for phrase search.  It currently takes any sequence of words
without operators as a phrase search, i.e., the default operator is now
phrase search, but that's probably not what you want!  I did add a new
parse tree node, PhraseNode, which encodes a phrase search; there's
also a new Index method, search_phrase().
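
To illustrate (this snippet is not part of the checkin, and "index"
stands for any Index instance), a multi-word query with no operators
now ends up roughly like this:

    from Products.ZCTextIndex import ParseTree

    tree = ParseTree.PhraseNode("foo bar")  # what the parser now builds
    assert tree.getValue() == "foo bar"
    results = tree.executeQuery(index)      # calls index.search_phrase("foo bar")

search_phrase() runs search_wids() on the phrase's wids and then keeps
only those hits whose encoded docwords entry contains the encoded
phrase.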



=== Added File Products/ZCTextIndex/WidCode.py ===
# An efficient(?) encoding for lists of positive ints with many small ones
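#
# Byte layout (derived from _encode/_decode/_fill below):
#
#   wid range            code
#   0x00   - 0x3F        1 byte:  chr(0x80 + wid)
#   0x40   - 0xFFF       2 bytes: chr(0xC0 + (wid >> 7)), then wid & 0x7F
#   0x1000 - 0x3FFFF     3 bytes: chr(0xE0 + (wid >> 14)), then two 7-bit bytes
#   0x40000 and up       4 bytes: chr(0xF0 + (wid >> 21)), then three 7-bit bytes
#
# The lead byte of each code has its high bit set; continuation bytes are
# all < 0x80.  That is what lets the decoding regexp below split a string
# back into codes, and what keeps find() matches aligned on code boundaries.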

import re

def encode(wids):
    # Encode a list of wids as a string
    get = _encoding.get
    return "".join([get(w) or _encode(w) for w in wids])

_encoding = {} # Filled later

def _encode(w):
    assert 0x1000 <= w < 0x1000000
    b, c = divmod(w, 0x80)
    a, b = divmod(b, 0x80)
    s = chr(b) + chr(c)
    if a < 0x10:
        return chr(a + 0xE0) + s
    a, b = divmod(a, 0x80)
    assert a < 0x4, (w, a, b, s)
    return (chr(a + 0xF0) + chr(b)) + s

_prog = re.compile(r"[\x80-\xFF][\x00-\x7F]*")

def decode(code):
    # Decode a string into a list of wids
    get = _decoding.get
    return [get(p) or _decode(p) for p in _prog.findall(code)]

_decoding = {} # Filled later

def _decode(s):
    if s == '\x80':
        return 0
    if len(s) == 3:
        a = ord(s[0])
        b = ord(s[1])
        c = ord(s[2])
        assert a&0xF0 == 0xE0 and not b&0x80 and not c&0x80
        return ((a&0xF) << 14) | (b<<7) | c
    assert len(s) == 4, `s`
    a = ord(s[0])
    b = ord(s[1])
    c = ord(s[2])
    d = ord(s[3])
    assert a&0xF8 == 0xF0 and not b&0x80 and not c&0x80 and not d&0x80
    return ((a&0x7) << 21) | (b<<14) | (c<<7) | d

def _fill():
    for i in range(0x40):
        s = chr(i + 0x80)
        _encoding[i] = s
        _decoding[s] = i
    for i in range(0x40, 0x1000):
        hi, lo = divmod(i, 0x80)
        s = chr(hi + 0xC0) + chr(lo)
        _encoding[i] = s
        _decoding[s] = i

_fill()

def test():
    for i in range(2**20):
        if i % 1000 == 0: print i
        wids = [i]
        code = encode(wids)
        assert decode(code) == wids, (wids, code, decode(code))

if __name__ == "__main__":
    test()


=== Products/ZCTextIndex/IQueryParser.py 1.1.2.4 => 1.1.2.5 ===
         """Return the node type.
 
-        This is one of 'AND', 'OR', 'NOT', or 'ATOM'.
+        This is one of 'AND', 'OR', 'NOT', 'ATOM' or 'PHRASE'.
         """
 
     def getValue():
@@ -44,6 +44,7 @@
         'OR'              a list of parse trees
         'NOT'             a parse tree
         'ATOM'            a string (representing a single search term)
+        'PHRASE'          a string (representing a search phrase)
         """
 
     def terms():


=== Products/ZCTextIndex/Index.py 1.1.2.37 => 1.1.2.38 ===
 
 from BTrees.IOBTree import IOBTree
-from BTrees.IIBTree import IIBTree, IIBucket, IISet, weightedUnion
+from BTrees.IIBTree import IIBTree, IIBucket, IISet
+from BTrees.IIBTree import weightedIntersection, weightedUnion
 
 from Products.ZCTextIndex.IIndex import IIndex
+from Products.ZCTextIndex import WidCode
 
 # Instead of storing floats, we generally store scaled ints.  Binary pickles
 # can store those more efficiently.  The default SCALE_FACTOR of 1024
@@ -94,7 +96,7 @@
         for i in range(len(uniqwids)):
             self._add_wordinfo(uniqwids[i], freqs[i], docid)
         self._docweight[docid] = docweight
-        self._add_undoinfo(docid, uniqwids)
+        self._add_undoinfo(docid, wids)
 
     def unindex_doc(self, docid):
         for wid in self._get_undoinfo(docid):
@@ -104,6 +106,9 @@
 
     def search(self, term):
         wids = self._lexicon.termToWordIds(term)
+        return self.search_wids(wids)
+
+    def search_wids(self, wids):
         if not wids:
             return IIBucket()
         N = float(len(self._docweight))
@@ -117,9 +122,23 @@
                 d2w = IIBucket(d2w)
             L.append((d2w, scaled_int(idf)))
         L.sort(lambda x, y: cmp(len(x[0]), len(y[0])))
+        d2w, weight = L[0]
+        dummy, result = weightedUnion(IIBTree(), d2w, 1, weight)
+        for d2w, weight in L[1:]:
+            dummy, result = weightedIntersection(result, d2w, 1, weight)
+        return result
+
+    def search_phrase(self, phrase):
+        wids = self._lexicon.termToWordIds(phrase)
+        hits = self.search_wids(wids)
+        if not hits:
+            return hits
+        code = WidCode.encode(wids)
         result = IIBTree()
-        for d2w, weight in L:
-            dummy, result = weightedUnion(result, d2w, 1, weight)
+        for docid, weight in hits.items():
+            docwords = self._docwords[docid]
+            if docwords.find(code) >= 0:
+                result[docid] = weight
         return result
 
     def query_weight(self, terms):
@@ -196,8 +215,11 @@
         self._wordinfo[wid] = map # Not redundant, because of Persistency!
 
     def _del_wordinfo(self, wid, docid):
-        map = self._wordinfo[wid]
-        del map[docid]
+        try:
+            map = self._wordinfo[wid]
+            del map[docid]
+        except KeyError:
+            return
         if len(map) == 0:
             del self._wordinfo[wid]
             return
@@ -209,10 +231,10 @@
         self._wordinfo[wid] = map # Not redundant, because of Persistency!
 
     def _add_undoinfo(self, docid, wids):
-        self._docwords[docid] = wids
+        self._docwords[docid] = WidCode.encode(wids)
 
     def _get_undoinfo(self, docid):
-        return self._docwords[docid]
+        return WidCode.decode(self._docwords[docid])
 
     # The rest are helper methods to support unit tests
 


=== Products/ZCTextIndex/ParseTree.py 1.1.2.2 => 1.1.2.3 ===
 
     def executeQuery(self, index):
-            return index.search(self.getValue())
+        return index.search(self.getValue())
+
+class PhraseNode(AtomNode):
+
+    _nodeType = "PHRASE"
+
+    def executeQuery(self, index):
+        return index.search_phrase(self.getValue())


=== Products/ZCTextIndex/QueryParser.py 1.1.2.13 => 1.1.2.14 ===
                 tree = ParseTree.AtomNode(atoms[0])
             else:
-                tree = ParseTree.OrNode([ParseTree.AtomNode(t) for t in atoms])
+                tree = ParseTree.PhraseNode(" ".join(atoms))
         return tree