[Zope3-checkins] CVS: zopeproducts/lucenetextindex/server - INSTALL.txt:1.1 README.txt:1.1 indexserver.py:1.1 run:1.1 test_server.py:1.1

Stephan Richter srichter@cosmos.phy.tufts.edu
Tue, 29 Jul 2003 09:04:10 -0400


Update of /cvs-repository/zopeproducts/lucenetextindex/server
In directory cvs.zope.org:/tmp/cvs-serv15566/server

Added Files:
	INSTALL.txt README.txt indexserver.py run test_server.py 
Log Message:
I am proud to announce the first check-in of the Lucene Text Index for 
Zope 3. The code was fully sponsored by struktur AG (www.struktur.de).

In the next week I expect to get some changes from struktur, since they
have done some stress testing and I will maintain the code. This is a good
example of how to use the Index (especially the TextIndex) API and make
your own implementations.


=== Added File zopeproducts/lucenetextindex/server/INSTALL.txt ===
Installation
============

  - Install Jython 2.1; http://www.jython.org/download.html

  - Install Lucene 1.3 RC1; http://jakarta.apache.org/lucene/docs/index.html

  - Install XML-RPC Server; http://ws.apache.org/xmlrpc/download.html

  - Edit the 'run' script

    o JYTHON: This is the path to the Jython **executable**.

    o INDEX: Directory in which the index is stored.

    o PORT: The port under which the XML-RPC server will be available.

    o LUCENE: Path to the Lucene Library (JAR) file.

    o XMLRPC: Path to the XML-RPC server Library (JAR) file.

  - Execute './run'.

=== Added File zopeproducts/lucenetextindex/server/README.txt ===
Lucene XML-RPC Text Index Server
================================

  This is a Lucene XML-RPC server that can be used to index and query document
  contents. While not very flexible, this Jython application is a quick way
  for non-Java applications to use Lucene's facilities.
  

=== Added File zopeproducts/lucenetextindex/server/indexserver.py ===
##############################################################################
#
# Copyright (c) 2002 Zope Corporation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.0 (ZPL).  A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE.
#
##############################################################################
"""A simple Lucene Text Index Server.

This module contains a simple text index server that uses Lucene as
backend. The network communication protocol is simply XML-RPC.  

Note: This is a Jython application.

$Id: indexserver.py,v 1.1 2003/07/29 13:04:02 srichter Exp $
"""
import os
import sys
import time

from java.util import Vector

from org.apache.xmlrpc import WebServer
from org.apache.xmlrpc import XmlRpcHandler

from org.apache.lucene.analysis import StopAnalyzer, SimpleAnalyzer
from org.apache.lucene.document import Document, Field
from org.apache.lucene.index import IndexReader, IndexWriter, Term
from org.apache.lucene.search import IndexSearcher, Query, Hits, TermQuery
from org.apache.lucene.store import FSDirectory
from org.apache.lucene.queryParser import QueryParser

# Jython is not yet Python 2.2 compatible.
# Define integer stand-ins for the boolean names used by this module.
True = 1
False = 0

class TrivialLogger:
    """Minimal logger that prints every message to standard output."""

    def log(self, msg):
        """Print 'msg' on its own line."""
        print msg


class LuceneXMLRPCHandler(XmlRpcHandler):
    """Lucene XML-RPC command handler.

    This handler object expects a path (the location of the index) as
    constructor argument.

    Every public command is implemented as a method named 'xmlrpc_<name>';
    'execute' dispatches incoming requests to these methods. Each command
    opens the Lucene readers/searchers/writers it needs and closes them
    again before returning, even when an error occurs, so that no index
    handles are leaked between requests.
    """

    def __init__(self, indexPath):
        """Instantiate the handler object.

        'indexPath' is the directory holding the Lucene index. The directory
        is created when missing, and an empty index is created inside it when
        no 'segments' file is found there.
        """
        self.indexPath = indexPath
        self.log = TrivialLogger()
        self.analyzer = StopAnalyzer()

        # Make sure the path exists
        if not os.path.exists(self.indexPath):
            os.mkdir(self.indexPath)

        # A missing 'segments' file means that there is no index yet.
        if not os.path.exists(os.path.join(self.indexPath, 'segments')):
            self.log.log('Creating new index.')
            # The third argument (1) tells the writer to create the index.
            writer = IndexWriter(self.indexPath, self.analyzer, 1)
            writer.close()

    def execute(self, name, args):
        """See interface XmlRpcHandler."""
        # Simply redirect the request to the real method. Only methods
        # carrying the 'xmlrpc_' prefix can be reached this way.
        return getattr(self, 'xmlrpc_'+name)(*args)

    def xmlrpc_isIndexed(self, id):
        """Check whether a particular object is already indexed.

        Returns True if at least one document with the given id is found,
        False otherwise.
        """
        query = TermQuery(Term('id', id))

        searcher = IndexSearcher(self.indexPath)
        try:
            results = searcher.search(query)
            if results.length() > 0:
                return True
            return False
        finally:
            # Always release the searcher again.
            searcher.close()

    def xmlrpc_insertDocument(self, id, text):
        """Index a new document.

        This method expects two arguments, the id and the text. The id serves
        as reference for the remote system, so that it can associate the query
        result with objects/documents/files. The text is the actual content to
        be indexed.

        Returns True.
        """
        # Create a document and add two fields to it.
        doc = Document()
        doc.add(Field.Keyword('id', id))
        doc.add(Field.Text('text', text))

        # Write the document into the index. The third argument (0) opens the
        # existing index instead of creating a new one.
        writer = IndexWriter(self.indexPath, self.analyzer, 0)
        try:
            writer.addDocument(doc)
            writer.optimize()
        finally:
            # Close the writer even if adding the document failed.
            writer.close()
        self.log.log('Insert Document: %s' %id)
        return True

    def xmlrpc_updateDocument(self, id, text):
        """Update a document.

        Updating of a document consists of deleting the old entry (document)
        and creating a new one. The arguments are the same as for
        'insertDocument'.

        Returns the result of the insertion.
        """
        self.xmlrpc_deleteDocument(id)
        return self.xmlrpc_insertDocument(id, text)

    def xmlrpc_deleteDocument(self, id):
        """Unindex a document.

        Removes an indexed document using its external id. Returns True.
        """
        # We need to build up a query for the id.
        query = TermQuery(Term('id', id))

        ir = IndexReader.open(self.indexPath)
        try:
            searcher = IndexSearcher(self.indexPath)
            try:
                results = searcher.search(query)
                # This is for the case that accidentally a document was
                # indexed twice, which should never have happened. We really
                # expect the id to be unique!
                for hitid in range(results.length()):
                    ir.delete(results.id(hitid))
            finally:
                searcher.close()
        finally:
            # Close the reader even if the deletion failed.
            ir.close()
        self.log.log('Delete Document: %s' %id)
        return True

    def xmlrpc_query(self, query):
        """Query the index.

        Given a query string, this method returns a list of matching
        documents, each entry being of the form (id, score).
        """
        # XXX: query parser is not thread safe
        parsed = QueryParser.parse(query, 'text', self.analyzer)
        self.log.log('Query: ' + parsed.toString())

        searcher = IndexSearcher(self.indexPath)
        try:
            results = searcher.search(parsed)

            # Prepare the result for XML-RPC.
            # Note: I cannot use regular Python lists here, since the XML-RPC
            # list does not know how to encode them, so we need to use the
            # lousy Vectors.
            ids = Vector()
            for hitid in range(results.length()):
                entry = Vector()
                entry.add(results.doc(hitid).getField('id').stringValue())
                entry.add(results.score(hitid))
                ids.add(entry)
        finally:
            # The hits are only read while the searcher is still open.
            searcher.close()

        return ids

    def xmlrpc_getStatistics(self):
        """Return the index statistics in the form (docCount, termCount)."""
        ir = IndexReader.open(self.indexPath)
        try:
            # Determine amount of terms. Yeah, it's a bit inefficient, but
            # there seems to be no other API call.
            terms = 0
            termEnum = ir.terms()
            try:
                while termEnum.next():
                    terms += 1
            finally:
                termEnum.close()

            stats = Vector()
            stats.add(ir.numDocs())
            stats.add(terms)
        finally:
            # Close the reader even if counting failed.
            ir.close()
        self.log.log('Statistics: %s' %repr(list(stats)))
        return stats


if __name__ == '__main__':

    if len(sys.argv) < 3:
        print 'usage: searchserver.py port index_dir'
        sys.exit(1)

    web = WebServer(int(sys.argv[1]))
    web.addHandler('lucene', LuceneXMLRPCHandler(sys.argv[2]))
    print 'Starting Lucene XML-RPC Text Index Server on port %s' %sys.argv[1]
    web.start()


=== Added File zopeproducts/lucenetextindex/server/run ===
#!/bin/sh
# Start script for the Lucene XML-RPC Text Index Server.
# Adjust the variables below to match the local installation.

# Path to the Jython startup script
JYTHON=/usr/java/jython-2.1/jython

# Index Data directory
INDEX=/home/srichter/Zope3/Zope3-Lucene/indexdata

# XML-RPC Server Port
PORT=10080

# Path to the Lucene library
LUCENE=/home/srichter/Zope3/Zope3-Lucene/lucene-1.3-rc1/lucene-1.3-rc1.jar

# Path to the XML-RPC library
XMLRPC=/home/srichter/Zope3/Zope3-Lucene/xmlrpc-1.1/xmlrpc-1.1.jar

# Make both libraries available to Jython.
CLASSPATH="$LUCENE:$XMLRPC"
export CLASSPATH

# Start the server. The server module is looked up next to this script, so
# the script can be started from any working directory; all expansions are
# quoted so that paths containing spaces keep working.
exec "$JYTHON" "$(dirname "$0")/indexserver.py" "$PORT" "$INDEX"


=== Added File zopeproducts/lucenetextindex/server/test_server.py ===
##############################################################################
#
# Copyright (c) 2002 Zope Corporation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.0 (ZPL).  A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE.
#
##############################################################################
"""Test the Lucene Server

Quick and dirty testing. The server must be running for these tests to be
successful.

$Id: test_server.py,v 1.1 2003/07/29 13:04:02 srichter Exp $
"""
import xmlrpclib

server = xmlrpclib.Server('http://localhost:10080/RPC2/')
server.lucene.insertDocument('1', 'This is sample text.')
server.lucene.insertDocument('2', 'It is cool to have another sample text.')
print server.lucene.query('text')
server.lucene.deleteDocument('1')
print server.lucene.query('text')
print server.lucene.query('blah')