[Zope-Checkins] CVS: Zope/lib/python/Products/PluginIndexes/TextIndexNG - Normalizer.py:1.1.2.1

Andreas Jung andreas@digicool.com
Sun, 13 Jan 2002 13:21:32 -0500


Update of /cvs-repository/Zope/lib/python/Products/PluginIndexes/TextIndexNG
In directory cvs.zope.org:/tmp/cvs-serv12409

Added Files:
      Tag: ajung-textindexng-branch
	Normalizer.py 
Log Message:
added


=== Added File Zope/lib/python/Products/PluginIndexes/TextIndexNG/Normalizer.py ===
##############################################################################
#
# Copyright (c) 2001 Zope Corporation and Contributors. All Rights Reserved.
# 
# This software is subject to the provisions of the Zope Public License,
# Version 2.0 (ZPL).  A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE
# 
##############################################################################

__doc__ =""" 
Word normalizer. A normalizer takes a word and translates its characters
according to a translation table. The functionality is similar to
string.translate() but also allows translating multi-character sequences.
A normalizer is typically used to translate accented characters to ASCII.
"""

from BTrees.OOBTree import OOBTree
from types import DictType, StringType
import re, os

# Directory containing this module; Normalizer.readTranslationTable() looks
# for translation tables in its 'normalizers' subdirectory first.
_basedir = os.path.dirname(__file__)


class Normalizer:
    """Word normalizer.

    Holds a translation table mapping tokens (one or more characters) to
    replacement strings and applies it to words.  The table is given either
    as a dictionary or as the name of a translation table file.

    Tokens are replaced longest first, so a multi-character token is handled
    before any shorter token.
    """

    def __init__(self, arg):
        """Build a normalizer.

        arg -- either a dictionary {token: replacement} or the name of a
               translation table file (see readTranslationTable()).

        Raises ValueError if arg is neither a dictionary nor a string.
        """
        self.clear()

        if isinstance(arg, DictType):
            self._trans.update(arg)
        elif isinstance(arg, StringType):
            self._trans.update(self.readTranslationTable(arg))
        else:
            raise ValueError(
                'expected a dictionary or a filename, got %r' % (arg,))

        # Always derive the replacement order from the table actually
        # loaded.  Without this a normalizer built from a dictionary had an
        # empty order list and normalize() silently did nothing.
        self._order = self._sortTokens(self._trans.keys())

    def __len__(self):
        """Return the number of entries in the translation table."""
        return len(self._trans)

    # Mapping-style accessors.  They delegate on every call (instead of
    # being bound once in the constructor) so they keep working on the
    # current table after clear() has replaced it.

    def keys(self, *args, **kw):
        """Return the tokens of the translation table."""
        return self._trans.keys(*args, **kw)

    def values(self, *args, **kw):
        """Return the replacement strings of the translation table."""
        return self._trans.values(*args, **kw)

    def items(self, *args, **kw):
        """Return the (token, replacement) pairs of the translation table."""
        return self._trans.items(*args, **kw)

    def has_key(self, key):
        """Return true if key is a token of the translation table."""
        return self._trans.has_key(key)

    def get(self, key, default=None):
        """Return the replacement for key, or default if key is unknown."""
        return self._trans.get(key, default)

    def clear(self):
        """Reset the normalizer to an empty translation table."""
        self._trans = OOBTree()
        self._order = []

    def normalize(self, word):
        """Return word with every token of the translation table replaced.

        Tokens are applied one after another in the stored order (longest
        first); each replacement works on the result of the previous one.

        This functionality *MUST* go into a C extension for performance
        reasons !!!
        """
        for token in self._order:
            word = word.replace(token, self._trans[token])

        return word

    __call__ = normalize

    def readTranslationTable(self, fname):
        """Read a translation table file and return it as a dictionary.

        fname is looked up in the 'normalizers' directory next to this
        module first; if it cannot be read from there, fname is opened as
        given.

        Every non-empty line holds whitespace separated fields: the first
        is the token, the second its replacement; further fields are
        ignored.  If a token occurs more than once the last line wins.

        As a side effect the replacement order of this normalizer is set
        from the tokens that were read.

        Raises IOError if the file cannot be read from either location and
        ValueError if a non-empty line has fewer than two fields.
        """
        try:
            lines = self._readLines(
                os.path.join(_basedir, 'normalizers', fname))
        except IOError:
            # Not a bundled table: treat fname as a path of its own.
            lines = self._readLines(fname)

        table = {}
        for line in lines:
            fields = line.split()
            if not fields:
                # Tolerate blank lines instead of failing with IndexError.
                continue
            if len(fields) < 2:
                raise ValueError(
                    'malformed line in translation table %r: %r'
                    % (fname, line))
            table[fields[0]] = fields[1]

        self._order = self._sortTokens(table.keys())

        return table

    def _readLines(self, path):
        """Return all lines of the file at path, closing the file again."""
        f = open(path)
        try:
            return f.readlines()
        finally:
            f.close()

    def _sortTokens(self, tokens):
        """Return tokens as a list ordered longest first.

        Tokens of equal length are ordered deterministically (descending
        string order).  Decorate-sort-undecorate is used so that no
        comparison function is needed.
        """
        decorated = [(len(token), token) for token in tokens]
        decorated.sort()
        decorated.reverse()
        return [token for (length, token) in decorated]