[Zope-Checkins] CVS: Zope/lib/python/Products/PluginIndexes/TextIndexNG - TextIndexNG.py:1.2.2.6

Andreas Jung andreas@zope.com
Mon, 7 Jan 2002 14:18:20 -0500


Update of /cvs-repository/Zope/lib/python/Products/PluginIndexes/TextIndexNG
In directory cvs.zope.org:/tmp/cvs-serv18973

Modified Files:
      Tag: ajung-textindexng-branch
	TextIndexNG.py 
Log Message:
+ works now with ZCatalog
+ support for either internal storage of word positions or on-the-fly
  lookup of word positions when a document is in the result list



=== Zope/lib/python/Products/PluginIndexes/TextIndexNG/TextIndexNG.py 1.2.2.5 => 1.2.2.6 ===
     _valid_default_operators    =  ('and','or')
  
+
     def __init__(self 
                  , id 
                  , extra= None
@@ -68,17 +69,42 @@
 
         self.id            = id
 
+        # reference to catalog (can we get rid of that ?)
+        self.catalog       = caller
+
+        # name of splitter        
         self.useSplitter   = getattr(extra,'useSplitter',   'ZopeSplitter')
+
+        # name of stemmer or None
         self.useStemmer    = getattr(extra,'useStemmer',    None)
+
+        # default operator to combine queries
         self.useOperator   = getattr(extra,'useOperator',   'and')
+
+        # support globbing: 1/0
         self.useGlobbing   = getattr(extra,'useGlobbing',   1)
+
+        # lexicon to be used (name, reference or None(internal))
         self.lexicon       = getattr(extra,'lexicon',       None)
+
+        # support near search: 1/0 (requires more storage)
         self.useNearSearch = getattr(extra,'useNearSearch', 1)
+
+        # default maximum distance for words with near search
         self.nearDistance  = getattr(extra,'nearDistance',  5)
+
+        # use proximity algorithm
         self.useProximity  = getattr(extra,'useProximity',  None)
 
-        if self.lexicon == 'None':     self.lexicon    = None
+        # storage of positions for near search ('internal','documentLookup')
+        self.nearStorage   = getattr(extra,'nearStorage',  'internal')
+
+        if self.lexicon == 'None':    self.lexicon    = None
         if self.useStemmer == 'None': self.useStemmer = None
+    
+        if not self.nearStorage in ('internal','documentLookup'):
+            raise ValueError,'nearStorage must be either "internal"'\
+                             ' or "documentLookup"'
 
         self.clear()
                         
@@ -88,6 +114,15 @@
         self._IDX     = IOBTree()
         self._invIDX  = IOBTree()
 
+        # near Search
+        if self.nearStorage == 'internal':
+            self.positions = self.positionsFromInternalStorage
+            self.insertForwardEntry = self.insertForwardEntryInternal
+        else:
+            self.positions = self.positionsFromDocumentLookup
+            self.insertForwardEntry = self.insertForwardEntryDocumentLookup
+
+    
         # get splitter function
         self._splitterfunc = self._stemmerfunc = None
 
@@ -147,8 +182,10 @@
             idx[documentId].update(widLst)
 
 
-    def insertForwardEntry(self,wordId,pos,documentId):
-        """ insert entries for forward index """
+    def insertForwardEntryInternal(self,wordId,pos,documentId):
+        """ insert entries for forward index. This function stores
+            the word positions internally.
+        """
 
         # self._IDX is a mapping:
         # wordId -> documentId -> [positions]
@@ -165,6 +202,23 @@
         tree[documentId].insert(pos)
 
 
+    def insertForwardEntryDocumentLookup(self,wordId,pos,documentId):
+        """ insert entries for forward index. This function does not store
+            word positions. Word positions are calculated when document is in the 
+            hitlist.
+        """
+
+        # self._IDX is a mapping:
+        # wordId -> documentId -> [positions]
+        
+        idx = self._IDX
+
+        if idx.has_key(wordId) == 0:
+            idx[wordId] = IISet()
+
+        idx[wordId].insert(documentId)
+
+
     def _printIndex(self):
 
         for wordId in self._IDX.keys():
@@ -223,7 +277,9 @@
 
             pos+=1
 
-        self.insertBackwardEntries(self,widLst,documentId)
+        self.insertBackwardEntries(widLst,documentId)
+
+        return len(widLst)
 
 
     def unindex_object(self, documentId): 
@@ -281,7 +337,7 @@
 
             else:
                 r={}
-
+        
             return ResultListNG(r, (word,), self)
 
 
@@ -323,7 +379,7 @@
             if not key:
                 continue
 
-            b = self.query(key, query_operator)
+            b = self.query(key, query_operator).keys()
             w, r = weightedIntersection(r, b)
 
         if r is not None:
@@ -332,7 +388,41 @@
         return (IIBucket(), (self.id,))
 
 
-    def positions(self,docId, words):
+    def positionsByDocumentLookup(self,docId, words):
+        """ search all positions for a list of words for
+            a given document given by its documentId.
+            positions() returns a mapping word to
+            list of positions of the word inside the document.
+        """
+
+        debug('searching positions docid: %s, words: %s' % (docId,words))
+
+        res = OOBTree()
+
+        # obtain object from ZCatalog
+        # this implementation must be changed for ZCatalog usage
+        # (for testing purposes we read from the filesystem)
+
+        uid = self.catalog.paths[docId]
+        data = open(uid).read()   # CHANGE THAT !!!
+
+        # Split retrieved document and obtain list of word positions
+
+        SP = self._splitterfunc(data)
+
+        for word in words:
+            posLst = SP.indexes(word)        
+
+            res[word] = IISet(posLst)
+      
+        for k,v in  res.items():
+            debug(k,v)
+
+        return res
+
+
+
+    def positionsFromInternalStorage(self,docId, words):
         """ search all positions for a list of words for
             a given document given by its documentId.
             positions() returns a mapping word to