[Zope-Checkins] CVS: Zope/lib/python/Products/PluginIndexes/TextIndexNG - GlobbingLexiconNG.py:1.1.2.1 LexiconNG.py:1.1.2.1

Andreas Jung andreas@digicool.com
Wed, 9 Jan 2002 11:25:00 -0500


Update of /cvs-repository/Zope/lib/python/Products/PluginIndexes/TextIndexNG
In directory cvs.zope.org:/tmp/cvs-serv24005

Added Files:
      Tag: ajung-textindexng-branch
	GlobbingLexiconNG.py LexiconNG.py 
Log Message:
created local copies to get rid of old crap. Splitter and stop words handling 
now occurs in the TextIndex.


=== Added File Zope/lib/python/Products/PluginIndexes/TextIndexNG/GlobbingLexiconNG.py ===
##############################################################################
# 
# Copyright (c) 2001 Zope Corporation and Contributors. All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.0 (ZPL).  A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE
# 
#############################################################################

import re, string

from LexiconNG import LexiconNG
from Products.PluginIndexes.TextIndex.randid import randid

from BTrees.IIBTree import IISet, union, IITreeSet
from BTrees.OIBTree import OIBTree
from BTrees.IOBTree import IOBTree
from BTrees.OOBTree import OOBTree

from TextOperators import Or,Op


class GlobbingLexiconNG(LexiconNG):
    """Lexicon which supports basic globbing function ('*' and '?').

    This lexicon keeps several data structures around that are useful
    for searching. They are:

      '_lexicon' -- Contains the mapping from word => word_id

      '_inverseLex' -- Contains the mapping from word_id => word

      '_digrams' -- Contains a mapping from digram => word_id

    Before going further, it is necessary to understand what a digram is,
    as it is a core component of the structure of this lexicon.  A digram
    is a two-letter sequence in a word.  For example, the word 'zope'
    would be converted into the digrams::

      ['$z', 'zo', 'op', 'pe', 'e$']

    where the '$' is a word marker.  It is used at the beginning and end
    of the words.  Those digrams are significant.
    """

    multi_wc = '*'
    single_wc = '?'
    eow = '$'


    def __init__(self):
        self.clear()


    def clear(self):
        self._lexicon = OIBTree()
        self._inverseLex = IOBTree()
        self._digrams = OOBTree()


    def createDigrams(self, word):
        """Returns a list with the set of digrams in the word."""

        digrams = list(word)
        digrams.append(self.eow)
        last = self.eow

        for i in range(len(digrams)):
            last, digrams[i] = digrams[i], last + digrams[i]

        return digrams


    def getWordIdList(self,words):
        """ return a list a wordIds for a list of words """
        return [ self.getWordId(word)   for word in words] 

    
    def getWordId(self, word):
        """Provided 'word', return the matching integer word id."""

        if self._lexicon.has_key(word):
            return self._lexicon[word]
        else:
            return self.assignWordId(word)

    set = getWordId                     # Kludge for old code


    def getWord(self, wid):
        return self._inverseLex.get(wid, None)


    def assignWordId(self, word):
        """Assigns a new word id to the provided word, and return it."""

        # Double check it's not in the lexicon already, and if it is, just
        # return it.
        if self._lexicon.has_key(word):
            return self._lexicon[word]


        inverse=self._inverseLex
        while not inverse.insert(wid, word):
            wid=randid()

        self._lexicon[word] = wid

        # Now take all the digrams and insert them into the digram map.

        for digram in self.createDigrams(word):
            set = self._digrams.get(digram, None)

            if set is None:
                self._digrams[digram] = set = IISet()
            set.insert(wid)

        return wid

    
    def get(self, pattern):
        """ Query the lexicon for words matching a pattern."""

        # single word pattern  produce a slicing problem below.
        # Because the splitter throws away single characters we can
        # return an empty tuple here.

        if len(pattern)==1: return ()

        wc_set = [self.multi_wc, self.single_wc]

        digrams = []
        globbing = 0
        for i in range(len(pattern)):
            if pattern[i] in wc_set:
                globbing = 1
                continue

            if i == 0:
                digrams.insert(i, (self.eow + pattern[i]) )
                digrams.append((pattern[i] + pattern[i+1]))
            else:
                try:
                    if pattern[i+1] not in wc_set:
                        digrams.append( pattern[i] + pattern[i+1] )

                except IndexError:
                    digrams.append( (pattern[i] + self.eow) )

        if not globbing:
            result =  self._lexicon.get(pattern, None)
            if result is None:
                return ()
            return (result, )
        
        ## now get all of the intsets that contain the result digrams
        result = None
        for digram in digrams:
            result=union(result, self._digrams.get(digram, None))

        if not result:
            return ()
        else:
            ## now we have narrowed the list of possible candidates
            ## down to those words which contain digrams.  However,
            ## some words may have been returned that match digrams,
            ## but do not match 'pattern'.  This is because some words
            ## may contain all matching digrams, but in the wrong
            ## order.

            expr = re.compile(self.createRegex(pattern))
            words = []
            hits = IISet()
            for x in result:
                if expr.match(self._inverseLex[x]):
                    hits.insert(x)
            return hits

                
    def __getitem__(self, word):
        """ """
        return self.get(word)


    def query_hook(self, q):
        """expand wildcards"""
        ListType = type([])
        i = len(q) - 1
        while i >= 0:
            e = q[i]
            if isinstance(e, ListType):
                self.query_hook(e)
            elif isinstance(e, Op):
                pass
            elif ( (self.multi_wc in e) or
                   (self.single_wc in e) ):
                wids = self.get(e)
                words = []
                for wid in wids:
                    if words:
                        words.append(Or)
                    words.append(wid)
                if not words:
                    # if words is empty, return something that will make
                    # textindex's __getitem__ return an empty result list
                    words.append('')
                q[i] = words
            i = i - 1

        return q


    def createRegex(self, pat):
        """Translate a PATTERN to a regular expression.

        There is no way to quote meta-characters.
        """

        # Remove characters that are meaningful in a regex
        transTable = string.maketrans("", "")
        result = string.translate(pat, transTable,
                                  r'()&|!@#$%^{}\<>.')
        
        # First, deal with multi-character globbing
        result = result.replace( '*', '.*')

        # Next, we need to deal with single-character globbing
        result = result.replace( '?', '.')

        return "%s$" % result 




=== Added File Zope/lib/python/Products/PluginIndexes/TextIndexNG/LexiconNG.py ===
##############################################################################
#
# Copyright (c) 2001 Zope Corporation and Contributors. All Rights Reserved.
# 
# This software is subject to the provisions of the Zope Public License,
# Version 2.0 (ZPL).  A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE
# 
##############################################################################

__doc__=""" Module breaks out Zope specific methods and behavior.  In
addition, provides the Lexicon class which defines a word to integer
mapping.
"""

from Persistence import Persistent
from Acquisition import Implicit

from BTrees.OIBTree import OIBTree
from BTrees.IOBTree import IOBTree
from BTrees.IIBTree import IISet, IITreeSet

from Products.PluginIndexes.TextIndex.randid import randid
from types import StringType


class LexiconNG(Persistent, Implicit):
    """Maps words to word ids and then some

    The Lexicon object is an attempt to abstract vocabularies out of
    Text indexes.  This abstraction is not totally cooked yet, this
    module still includes the parser for the 'Text Index Query
    Language' and a few other hacks.

    """

    def __init__(self): 
        self.clear()


    def clear(self):
        self._lexicon    = OIBTree()
        self._inverseLex = IOBTree()
        

    def getWordIdList(self,words):
        """ return a list a wordIds for a list of words """
        return [ self.getWordId(word)   for word in words] 
        

    def getWordId(self, word):
        """ return the word id of 'word' """

        wid=self._lexicon.get(word, None)
        if wid is None: 
            wid=self.assignWordId(word)
        return wid
        
    set = getWordId

    def getWord(self, wid):
        """ post-2.3.1b2 method, will not work with unconverted lexicons """
        return self._inverseLex.get(wid, None)
        

    def assignWordId(self, word):
        """Assigns a new word id to the provided word and returns it."""

        # First make sure it's not already in there

        if self._lexicon.has_key(word):
            return self._lexicon[word]


        wid = randid()
        while not self._inverseLex.insert(wid, word):
            wid=randid()

        if isinstance(word,StringType):        
            self._lexicon[intern(word)] = wid
        else:
            self._lexicon[word] = wid
            

        return wid


    def get(self, key, default=None):
        """Return the matched word against the key."""

        r=IISet()

        wid=self._lexicon.get(key, default)
        if wid is not None: r.insert(wid)

        return r

    def __getitem__(self, key):
        return self.get(key)


    def __len__(self):
        return len(self._lexicon)

    def query_hook(self, q):
        """ we don't want to modify the query cuz we're dumb """
        return q