[Zope-Checkins] CVS: Zope/lib/python/Products/PluginIndexes/TextIndexNG - TextIndexNG.py:1.2.2.8

Andreas Jung <andreas@zope.com>
Tue, 8 Jan 2002 11:56:40 -0500


Update of /cvs-repository/Zope/lib/python/Products/PluginIndexes/TextIndexNG
In directory cvs.zope.org:/tmp/cvs-serv15264

Modified Files:
      Tag: ajung-textindexng-branch
	TextIndexNG.py 
Log Message:
added support for proximity indexing
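
Background: proximity indexing maps every word to a phonetic key at
indexing time, so a query term can also match words that merely sound
alike. The only algorithm wired up in this commit is soundex, provided
by the Python "Proximity" extension. As a rough sketch of the idea (not
the extension's actual implementation), a simplified pure-Python soundex
that ignores the full algorithm's h/w collapsing rule looks like:

    SOUNDEX_CODES = {'b':'1','f':'1','p':'1','v':'1',
                     'c':'2','g':'2','j':'2','k':'2','q':'2',
                     's':'2','x':'2','z':'2',
                     'd':'3','t':'3','l':'4','m':'5','n':'5','r':'6'}

    def soundex(word):
        # keep the first letter, encode the rest as digits, drop
        # vowels and collapse adjacent duplicate codes
        word = word.lower()
        key  = word[0].upper()
        prev = SOUNDEX_CODES.get(word[0], '')
        for char in word[1:]:
            digit = SOUNDEX_CODES.get(char, '')
            if digit and digit != prev:
                key = key + digit
            prev = digit
        return (key + '000')[:4]

    # soundex('Robert') == soundex('Rupert') == 'R163', so a search
    # for one spelling finds documents containing the other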


=== Zope/lib/python/Products/PluginIndexes/TextIndexNG/TextIndexNG.py 1.2.2.7 => 1.2.2.8 ===
 from Products.PluginIndexes.TextIndex.Lexicon import Lexicon
 from Products.PluginIndexes.TextIndex.GlobbingLexicon import GlobbingLexicon
-
 from Products.PluginIndexes.TextIndex import Splitter
+from ProximityLexicon import ProximityLexicon
 
 
 from types import IntType, StringType, UnicodeType, InstanceType
@@ -43,6 +43,7 @@
 from TextIndexCommon import *
 
 import Stemmer
+import Proximity
 import Thesaurus, StopWords
 
 
@@ -71,6 +72,8 @@
 
         self.id            = id
 
+        debug(extra)
+
         # reference to catalog (can we get rid of that ?)
         self.catalog       = caller
 
@@ -78,7 +81,7 @@
         self.useSplitter   = getattr(extra,'useSplitter',   'ZopeSplitter')
 
         # name of stemmer or None
-        self.useStemmer    = getattr(extra,'useStemmer',    None)
+        self.useStemmer    = getattr(extra,'useStemmer',    None) or None
 
         # default operator to combine queries
         self.useOperator   = getattr(extra,'useOperator',   'and')
@@ -87,7 +90,7 @@
         self.useGlobbing   = getattr(extra,'useGlobbing',   1)
 
         # lexicon to be used (name, reference or None(internal))
-        self.lexicon       = getattr(extra,'lexicon',       None)
+        self.lexicon       = getattr(extra,'lexicon',       None) or None
 
         # support near search: 1/0 (requires more storage)
         self.useNearSearch = getattr(extra,'useNearSearch', 1)
@@ -96,19 +99,17 @@
         self.nearDistance  = getattr(extra,'nearDistance',  5)
 
         # use proximity algorithm
-        self.useProximity  = getattr(extra,'useProximity',  None)
+        self.useProximity  = getattr(extra,'useProximity',  None) or None
 
         # storage of positions for near search ('internal','documentLookup')
         self.nearStorage   = getattr(extra,'nearStorage',  'internal')
 
         # Stopwords: either filename or StopWord object
-        self.stopWords     = getattr(extra,'stopWords',    None)
+        self.stopWords     = getattr(extra,'stopWords',    None) or None
      
         # Thesaurus: either filename or StopWord object
-        self.thesaurus     = getattr(extra,'thesaurus',    None)
+        self.thesaurus     = getattr(extra,'thesaurus',    None) or None
 
-        if self.lexicon == 'None':    self.lexicon    = None
-        if self.useStemmer == 'None': self.useStemmer = None
     
         if not self.nearStorage in ('internal','documentLookup'):
             raise ValueError,'nearStorage must be either "internal" '\
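
For context: "extra" is the usual PluginIndexes record object whose
attributes are read via getattr() above. A hypothetical configuration
enabling proximity indexing could look like the following; the Empty
record class and the constructor call are illustrative stand-ins, since
the exact signature is not visible in this diff:

    class Empty: pass                    # stand-in for the options record

    extra = Empty()
    extra.useSplitter  = 'ZopeSplitter'
    extra.useProximity = 'soundex'       # the only supported algorithm
    extra.nearStorage  = 'internal'      # or 'documentLookup'
    extra.nearDistance = 5

    index = TextIndexNG('text', extra, catalog)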
@@ -122,7 +123,7 @@
         self._IDX        = IOBTree()
         self._invIDX     = IOBTree()
         self._thesaurus  = None
-        self._stopwords  = StopWords.StopWords()
+        self._stopwords  = None
 
 
         # Thesaurus
@@ -131,31 +132,54 @@
         elif isinstance(self.thesaurus, InstanceType):
             self._thesaurus = self.thesaurus
 
+
         # StopWords
 
-        if isinstance(self.stopWords, StringType):
-            self._stopwords =  StopWords.StopWords(self.stopWords)
-        elif isinstance(self.stopWords, InstanceType):
-            self._stopwords = self.stopWords
+        if self.stopWords:
+            if isinstance(self.stopWords, StringType):
+                self._stopwords =  StopWords.StopWords(self.stopWords)
+            elif isinstance(self.stopWords, InstanceType):
+                self._stopwords = self.stopWords
+        else:
+            self._stopwords = {}
+
+    
+        # Proximity
+
+        if self.useProximity:
+            self._PROX_LEX    = ProximityLexicon(algorithm=self.useProximity)
+            self._PROX_IDX    = IOBTree()
+            self._invPROX_IDX = IOBTree()
+
+            # the selection of the proximity function must be made more
+            # general in the future. This requires some more work on the
+            # Python Proximity extension.
+
+            if self.useProximity=='soundex':
+                self._v_proximityfunc = Proximity.soundex
+            else:
+                raise RuntimeError,'unsupported proximity algorithm: %s' % self.useProximity
+
 
         # near Search
         if self.nearStorage == 'internal':
-            self.positions = self.positionsFromInternalStorage
-            self.insertForwardEntry = self.insertForwardEntryInternal
+            self._v_positions = self.positionsFromInternalStorage
+            self._v_insertForwardEntry = self.insertForwardEntryInternal
         else:
-            self.positions = self.positionsFromDocumentLookup
-            self.insertForwardEntry = self.insertForwardEntryDocumentLookup
+            self._v_positions = self.positionsFromDocumentLookup
+            self._v_insertForwardEntry = self.insertForwardEntryDocumentLookup
 
-    
         # get splitter function
-        self._splitterfunc = self._stemmerfunc = None
+        self._v_splitterfunc = self._v_stemmerfunc = None
 
         if self.useSplitter:
-            self._splitterfunc = Splitter.getSplitter(self.useSplitter)
+            self._v_splitterfunc = Splitter.getSplitter(self.useSplitter)
+
 
         # stemmer function
+
         if self.useStemmer:
-            self._stemmerfunc = Stemmer.Stemmer(self.useStemmer).stem
+            self._v_stemmerfunc = Stemmer.Stemmer(self.useStemmer).stem
 
         if self.lexicon:
 
@@ -169,15 +193,16 @@
                 self._LEXICON = GlobbingLexicon()
                 debug('created new globbing lexicon')
 
-                if self._stemmerfunc:
+                if self._v_stemmerfunc:
                     debug('stemming disabled because globbing enabled')
-                    self._stemmerfunc = None
+                    self._v_stemmerfunc = None
 
             else:
                 self._LEXICON = Lexicon()
                 debug('created new lexicon')
 
 
+        self._v_getWordIdList  = self._LEXICON.getWordIdList
         self._v_getWordId      = self._LEXICON.getWordId
         self._v_getWordById    = self._LEXICON.getWord
         self._v_getIdByWord    = self._LEXICON.get
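
A note on the _v_ renames above: in ZODB, attribute names that start
with _v_ mark volatile attributes. They are never pickled with the
persistent object and vanish whenever the object is deactivated, which
keeps unpicklable values such as bound methods and function references
out of the database. A minimal sketch of the convention, assuming the
Zope 2 Persistent base class:

    from Persistence import Persistent

    class Example(Persistent):
        def __init__(self):
            self.useSplitter = 'ZopeSplitter'   # stored in ZODB
            # volatile: never written to the database and silently
            # dropped on deactivation, so it must be recreated on
            # demand before use
            self._v_splitterfunc = None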
@@ -206,41 +231,83 @@
             idx[documentId].update(widLst)
 
 
-    def insertForwardEntryInternal(self,wordId,pos,documentId):
+    def insertForwardEntryInternal(self,wordIds,pos,documentId):
         """ insert entries for forward index. This function stores
             the word positions internally.
+
+            wordIds is either a single integer or a list of integers
         """
 
         # self._IDX is a mapping:
         # wordId -> documentId -> [positions]
-        
+
         idx = self._IDX
 
-        if idx.has_key(wordId) == 0:
-            idx[wordId] = IOBTree()
+        _single = 0
+        if isinstance(wordIds,IntType): 
+            wordIds = [wordIds]
+            _single = 1
 
-        tree = idx[wordId] 
-        if tree.has_key(documentId) == 0:
-            tree[documentId] = IISet() 
+        for i in range(len(wordIds)):
+            wordId = wordIds[i]
+        
+            if idx.has_key(wordId) == 0:
+                idx[wordId] = IOBTree()
 
-        tree[documentId].insert(pos)
+            tree = idx[wordId] 
+            if tree.has_key(documentId) == 0:
+                tree[documentId] = IISet() 
 
+            if _single:
+                tree[documentId].insert(pos)
+            else:
+                tree[documentId].insert(i)
 
-    def insertForwardEntryDocumentLookup(self,wordId,pos,documentId):
+
+    def insertForwardEntryDocumentLookup(self,wordIds,pos,documentId):
         """ insert entries for forward index. This function does not store
             word positions. Word positions are calculated when the document
             is in the hit list.
+
+            wordIds is either a single integer or a list of integers
         """
 
         # self._IDX is a mapping:
-        # wordId -> documentId -> [positions]
+        # wordId -> documentId 
         
         idx = self._IDX
 
-        if idx.has_key(wordId) == 0:
-            idx[wordId] = IISet()
+        if isinstance(wordIds,IntType): wordIds = [wordIds]
+
+        for wordId in wordIds:
+
+            if idx.has_key(wordId) == 0:
+                idx[wordId] = IISet()
 
-        idx[wordId].insert(documentId)
+            idx[wordId].insert(documentId)
+
+
+    def insertProximityEntries(self,wordIds,documentId):
+        """ insert forward *and* backword entries for proximity indexes """
+
+        idx = self._PROX_IDX
+        invidx = self._invPROX_IDX
+
+        if isinstance(wordIds,IntType): wordIds = [wordIds]
+
+        for wordId in wordIds:
+            
+            if idx.has_key(wordId) == 0:
+                idx[wordId] = IISet()
+
+            idx[wordId].insert(documentId)
+
+
+        if invidx.has_key(documentId)==0:
+            invidx[documentId] = IISet(wordIds)
+        else:
+            invidx[documentId].update(wordIds)
+        
 
 
     def _printIndex(self):
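
The "internal" storage scheme used by insertForwardEntryInternal() nests
two BTree levels. A standalone sketch of the resulting structure, with
made-up word and document ids:

    from BTrees.IOBTree import IOBTree
    from BTrees.IIBTree import IISet

    idx = IOBTree()        # wordId -> (documentId -> IISet of positions)
    wordId, documentId = 42, 7

    if not idx.has_key(wordId):
        idx[wordId] = IOBTree()
    tree = idx[wordId]
    if not tree.has_key(documentId):
        tree[documentId] = IISet()
    tree[documentId].insert(3)   # word 42 occurs at position 3 in doc 7

    # the "documentLookup" variant keeps only wordId -> IISet(documentIds)
    # and recomputes positions at query time (see positionsFrom* below)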
@@ -279,34 +346,43 @@
         # For performance reasons it might be better if the
         # splitter returned the list of split words.
 
-        words = self._splitterfunc(source,encoding=encoding)
+        words = self._v_splitterfunc(source,encoding=encoding).split()
 
-        # we collect all wordIds for performance reasons in a list
-        # and update the backward index once instead of inserting
-        # every single wordId
+        # apply the stopword list
+        # (maybe this should go into a C extension for performance reasons)
 
-        widLst = []
         isStopWord = self._stopwords.has_key
+        words =  filter(lambda x,f=isStopWord: f(x)==0, words)   
 
-        pos = 0    
-        for word in words:
 
-            if isStopWord(word):  continue
+        # Check if we want proximity searches. If so, we need to create
+        # a list containing the proximity representations of the words.
 
-            # stem the single word        
-            if self._stemmerfunc:
-                word = self._stemmerfunc(word)
+        if self.useProximity:
+            proximity_words =  self._v_proximityfunc(words)
+            proximity_widList = self._PROX_LEX.getWordIdList(proximity_words)
+            assert len(proximity_words)==len(proximity_widList)
 
-            # get (new) wordId for word
-            wid = self._v_getWordId(word)
-            widLst.append(wid)
+            self.insertProximityEntries(proximity_widList,documentId)
+       
+        # Stem all words in one run
 
-            # and insert the wordId, its position and the documentId 
-            # in the index
-            self.insertForwardEntry(wid,pos,documentId)
+        if self._v_stemmerfunc:
+            words = self._v_stemmerfunc(words)
 
-            pos+=1
+        # we collect all wordIds for performance reasons in a list
+        # and update the backward index once instead of inserting
+        # every single wordId
+
+
+        widLst = self._v_getWordIdList(words)
+        assert len(widLst)==len(words)
 
+        # insert forward entries 
+        self._v_insertForwardEntry(widLst,None,documentId)  
+
+        # insert backward entries
         self.insertBackwardEntries(widLst,documentId)
 
         return len(widLst)
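
The stopword filter above uses the classic default-argument trick to
make has_key a local name inside the lambda. A minimal standalone
example of the idiom:

    stopwords = {'and': None, 'the': None}
    words = ['the', 'quick', 'fox', 'and', 'hound']

    # binding f as a default argument turns each dict lookup into a
    # cheap local-variable access inside the lambda (a common
    # Python 1.x/2.x micro-optimization)
    isStopWord = stopwords.has_key
    words = filter(lambda x, f=isStopWord: f(x) == 0, words)
    # -> ['quick', 'fox', 'hound']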
@@ -353,8 +429,8 @@
 
             # We need to stem
 
-            if self._stemmerfunc:
-                word = self._stemmerfunc(word)
+            if self._v_stemmerfunc:
+                word = self._v_stemmerfunc(word)
 
             wids = self._v_getIdByWord(word)
 
@@ -371,7 +447,6 @@
                     for docId in res:
                         r[docId] = IISet()
 
-
             else:
                 r={}
         
@@ -445,7 +520,7 @@
 
         # Split retrieved document and obtain list of word positions
 
-        SP = self._splitterfunc(data)
+        SP = self._v_splitterfunc(data)
 
         for word in words:
             posLst = SP.indexes(word)
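
SP.indexes(word) recovers the positions of a query word by rescanning
the freshly split document. Assuming the splitter result behaves like a
plain list of words (a sketch, not the real Splitter API), the lookup
amounts to:

    def indexes(words, word):
        # every position at which `word` occurs in the split document
        posLst = []
        for pos in range(len(words)):
            if words[pos] == word:
                posLst.append(pos)
        return posLst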