[Zope-Checkins] SVN: Zope/trunk/ More catalog work, merge in the non-queryplan and non-btree changes of queryplan and optimize more of date range index internals

Hanno Schlichting hannosch at hannosch.eu
Sun Jul 25 18:03:14 EDT 2010


Log message for revision 115092:
  More catalog work, merge in the non-queryplan and non-btree changes of queryplan and optimize more of date range index internals
  

Changed:
  U   Zope/trunk/doc/CHANGES.rst
  U   Zope/trunk/src/Products/PluginIndexes/DateIndex/DateIndex.py
  U   Zope/trunk/src/Products/PluginIndexes/DateRangeIndex/DateRangeIndex.py
  U   Zope/trunk/src/Products/PluginIndexes/common/UnIndex.py
  U   Zope/trunk/src/Products/PluginIndexes/interfaces.py
  U   Zope/trunk/src/Products/ZCatalog/Catalog.py

-=-
Modified: Zope/trunk/doc/CHANGES.rst
===================================================================
--- Zope/trunk/doc/CHANGES.rst	2010-07-25 18:09:26 UTC (rev 115091)
+++ Zope/trunk/doc/CHANGES.rst	2010-07-25 22:03:14 UTC (rev 115092)
@@ -56,6 +56,17 @@
 Features Added
 ++++++++++++++
 
+- Various optimizations to indexes _apply_index and the catalog's search
+  method inspired by experimental.catalogqueryplan.
+
+- Added a new ILimitedResultIndex to Products.PluginIndexes and made most
+  built-in indexes compatible with it. This allows indexes to consider the
+  already calculated result set inside their own calculations.
+
+- Changed the internals of the DateRangeIndex to always use IITreeSet and do
+  an inline migration from IISet. Some datum tend to have large number of
+  documents, for example when using default floor or ceiling dates.
+
 - Added a new reporting tab to `Products.ZCatalog` instances. You can use this
   to get an overview of slow catalog queries, as specified by a configurable
   threshold value. The reports are per running Zope process.

Modified: Zope/trunk/src/Products/PluginIndexes/DateIndex/DateIndex.py
===================================================================
--- Zope/trunk/src/Products/PluginIndexes/DateIndex/DateIndex.py	2010-07-25 18:09:26 UTC (rev 115091)
+++ Zope/trunk/src/Products/PluginIndexes/DateIndex/DateIndex.py	2010-07-25 22:03:14 UTC (rev 115092)
@@ -157,7 +157,7 @@
 
         return returnStatus
 
-    def _apply_index(self, request):
+    def _apply_index(self, request, resultset=None):
         """Apply the index to query parameters given in the argument
 
         Normalize the 'query' arguments into integer values at minute
@@ -176,7 +176,7 @@
         #experimental code for specifing the operator
         operator = record.get( 'operator', self.useOperator )
         if not operator in self.operators :
-            raise RuntimeError, "operator not valid: %s" % operator
+            raise RuntimeError("operator not valid: %s" % operator)
 
         # depending on the operator we use intersection or union
         if operator=="or":
@@ -223,6 +223,9 @@
                 if set is not None:
                     if isinstance(set, int):
                         set = IISet((set,))
+                    else:
+                        # set can't be bigger than resultset
+                        set = intersection(set, resultset)
                     r = set_func(r, set)
 
         if isinstance(r, int):

Modified: Zope/trunk/src/Products/PluginIndexes/DateRangeIndex/DateRangeIndex.py
===================================================================
--- Zope/trunk/src/Products/PluginIndexes/DateRangeIndex/DateRangeIndex.py	2010-07-25 18:09:26 UTC (rev 115091)
+++ Zope/trunk/src/Products/PluginIndexes/DateRangeIndex/DateRangeIndex.py	2010-07-25 22:03:14 UTC (rev 115092)
@@ -28,7 +28,6 @@
 from BTrees.IIBTree import IITreeSet
 from BTrees.IIBTree import intersection
 from BTrees.IIBTree import multiunion
-from BTrees.IIBTree import union
 from BTrees.IOBTree import IOBTree
 from BTrees.Length import Length
 from DateTime.DateTime import DateTime
@@ -242,7 +241,7 @@
 
         return tuple( result )
 
-    def _apply_index(self, request):
+    def _apply_index(self, request, resultset=None):
         """
             Apply the index to query parameters given in 'request', which
             should be a mapping object.
@@ -265,34 +264,17 @@
         #   Aggregate sets for each bucket separately, to avoid
         #   large-small union penalties.
         #
-        #until_only  = IISet()
-        #map( until_only.update, self._until_only.values( term ) )
-        # XXX use multi-union
         until_only = multiunion( self._until_only.values( term ) )
-
-        #since_only  = IISet()
-        #map( since_only.update, self._since_only.values( None, term ) )
-        # XXX use multi-union
         since_only = multiunion( self._since_only.values( None, term ) )
-
-        #until       = IISet()
-        #map( until.update, self._until.values( term ) )
-        # XXX use multi-union
         until = multiunion( self._until.values( term ) )
 
-        #since       = IISet()
-        #map( since.update, self._since.values( None, term ) )
-        # XXX use multi-union
-        since = multiunion( self._since.values( None, term ) )
+        # Total result is bound by resultset
+        until = intersection(resultset, until)
+        since = multiunion(self._since.values(None, term))
+        bounded = intersection(until, since)
 
-        bounded     = intersection( until, since )
-
         #   Merge from smallest to largest.
-        #result      = union( self._always, until_only )
-        result      = union( bounded, until_only )
-        result      = union( result, since_only )
-        #result      = union( result, bounded )
-        result      = union( result, self._always )
+        result = multiunion([bounded, until_only, since_only, self._always])
 
         return result, ( self._since_field, self._until_field )
 
@@ -314,8 +296,8 @@
             if set is None:
                 self._until_only[ until ] = documentId
             else:
-                if isinstance(set, int):
-                    set = self._until_only[ until ] = IISet((set, documentId))
+                if isinstance(set, (int, IISet)):
+                    set = self._until_only[until] = IITreeSet((set, documentId))
                 else:
                     set.insert( documentId )
         elif until is None:
@@ -324,8 +306,8 @@
             if set is None:
                 self._since_only[ since ] = documentId
             else:
-                if isinstance(set, int):
-                    set = self._since_only[ since ] = IISet((set, documentId))
+                if isinstance(set, (int, IISet)):
+                    set = self._since_only[since] = IITreeSet((set, documentId))
                 else:
                     set.insert( documentId )
 
@@ -335,8 +317,8 @@
             if set is None:
                 self._since[ since ] = documentId
             else:
-                if isinstance(set, int):
-                    set = self._since[ since ] = IISet((set, documentId))
+                if isinstance(set, (int, IISet)):
+                    set = self._since[since] = IITreeSet((set, documentId))
                 else:
                     set.insert( documentId )
 
@@ -344,8 +326,8 @@
             if set is None:
                 self._until[ until ] = documentId
             else:
-                if isinstance(set, int):
-                    set = self._until[ until ] = IISet((set, documentId))
+                if isinstance(set, (int, IISet)):
+                    set = self._until[until] = IITreeSet((set, documentId))
                 else:
                     set.insert( documentId )
 

Modified: Zope/trunk/src/Products/PluginIndexes/common/UnIndex.py
===================================================================
--- Zope/trunk/src/Products/PluginIndexes/common/UnIndex.py	2010-07-25 18:09:26 UTC (rev 115091)
+++ Zope/trunk/src/Products/PluginIndexes/common/UnIndex.py	2010-07-25 22:03:14 UTC (rev 115092)
@@ -21,7 +21,7 @@
 from BTrees.IIBTree import intersection
 from BTrees.IIBTree import IITreeSet
 from BTrees.IIBTree import IISet
-from BTrees.IIBTree import union
+from BTrees.IIBTree import multiunion
 from BTrees.IOBTree import IOBTree
 from BTrees.Length import Length
 from BTrees.OOBTree import OOBTree
@@ -31,7 +31,7 @@
 
 from Products.PluginIndexes.common import safe_callable
 from Products.PluginIndexes.common.util import parseIndexRequest
-from Products.PluginIndexes.interfaces import IPluggableIndex
+from Products.PluginIndexes.interfaces import ILimitedResultIndex
 from Products.PluginIndexes.interfaces import ISortIndex
 from Products.PluginIndexes.interfaces import IUniqueValueIndex
 
@@ -43,7 +43,7 @@
 
     """Simple forward and reverse index.
     """
-    implements(IPluggableIndex, IUniqueValueIndex, ISortIndex)
+    implements(ILimitedResultIndex, IUniqueValueIndex, ISortIndex)
 
     def __init__(
         self, id, ignore_ex=None, call_methods=None, extra=None, caller=None):
@@ -302,7 +302,7 @@
             LOG.debug('Attempt to unindex nonexistent document'
                       ' with id %s' % documentId,exc_info=True)
 
-    def _apply_index(self, request):
+    def _apply_index(self, request, resultset=None):
         """Apply the index to query parameters given in the request arg.
 
         The request argument should be a mapping object.
@@ -348,12 +348,8 @@
         # experimental code for specifing the operator
         operator = record.get('operator',self.useOperator)
         if not operator in self.operators :
-            raise RuntimeError,"operator not valid: %s" % escape(operator)
+            raise RuntimeError("operator not valid: %s" % escape(operator))
 
-        # depending on the operator we use intersection or union
-        if operator=="or":  set_func = union
-        else:               set_func = intersection
-
         # Range parameter
         range_parm = record.get('range',None)
         if range_parm:
@@ -375,24 +371,84 @@
             if 'max' in opr_args: hi = max(record.keys)
             else: hi = None
             if hi:
-                setlist = index.items(lo,hi)
+                setlist = index.values(lo,hi)
             else:
-                setlist = index.items(lo)
+                setlist = index.values(lo)
 
-            for k, set in setlist:
-                if isinstance(set, int):
-                    set = IISet((set,))
-                r = set_func(r, set)
+            # If we only use one key, intersect and return immediately
+            if len(setlist) == 1:
+                result = setlist[0]
+                if isinstance(result, int):
+                    result = IISet((result,))
+                return result, (self.id,)
+
+            if operator == 'or':
+                tmp = []
+                for s in setlist:
+                    if isinstance(s, int):
+                        s = IISet((s,))
+                    tmp.append(s)
+                r = multiunion(tmp)
+            else:
+                # For intersection, sort with smallest data set first
+                tmp = []
+                for s in setlist:
+                    if isinstance(s, int):
+                        s = IISet((s,))
+                    tmp.append(s)
+                if len(tmp) > 2:
+                    setlist = sorted(tmp, key=len)
+                else:
+                    setlist = tmp
+                r = resultset
+                for s in setlist:
+                    # the result is bound by the resultset
+                    r = intersection(r, s)
+
         else: # not a range search
-            for key in record.keys:
-                set=index.get(key, None)
-                if set is None:
-                    set = IISet(())
-                elif isinstance(set, int):
-                    set = IISet((set,))
-                r = set_func(r, set)
+            # Filter duplicates
+            setlist = []
+            for k in record.keys:
+                s = index.get(k, None)
+                # If None, try to bail early
+                if s is None:
+                    if operator == 'or':
+                        # If union, we can't possibly get a bigger result
+                        continue
+                    # If intersection, we can't possibly get a smaller result
+                    return IISet(), (self.id,)
+                elif isinstance(s, int):
+                    s = IISet((s,))
+                setlist.append(s)
 
-        if isinstance(r, int):  r=IISet((r,))
+            # If we only use one key return immediately
+            if len(setlist) == 1:
+                result = setlist[0]
+                if isinstance(result, int):
+                    result = IISet((result,))
+                return result, (self.id,)
+
+            if operator == 'or':
+                # If we already get a small result set passed in, intersecting
+                # the various indexes with it and doing the union later is
+                # faster than creating a multiunion first.
+                if resultset is not None and len(resultset) < 200:
+                    smalllist = []
+                    for s in setlist:
+                        smalllist.append(intersection(resultset, s))
+                    r = multiunion(smalllist)
+                else:
+                    r = multiunion(setlist)
+            else:
+                # For intersection, sort with smallest data set first
+                if len(setlist) > 2:
+                    setlist = sorted(setlist, key=len)
+                r = resultset
+                for s in setlist:
+                    r = intersection(r, s)
+
+        if isinstance(r, int):
+            r = IISet((r, ))
         if r is None:
             return IISet(), (self.id,)
         else:

Modified: Zope/trunk/src/Products/PluginIndexes/interfaces.py
===================================================================
--- Zope/trunk/src/Products/PluginIndexes/interfaces.py	2010-07-25 18:09:26 UTC (rev 115091)
+++ Zope/trunk/src/Products/PluginIndexes/interfaces.py	2010-07-25 22:03:14 UTC (rev 115092)
@@ -85,6 +85,15 @@
         """Empty the index"""
 
 
+class ILimitedResultIndex(IPluggableIndex):
+
+    def _apply_index(request, resultset=None):
+        """Same as IPluggableIndex' _apply_index method. The additional
+        resultset argument contains the resultset, as already calculated by
+        ZCatalog's search method.
+        """
+
+
 class IUniqueValueIndex(IPluggableIndex):
     """An index which can return lists of unique values contained in it"""
 

Modified: Zope/trunk/src/Products/ZCatalog/Catalog.py
===================================================================
--- Zope/trunk/src/Products/ZCatalog/Catalog.py	2010-07-25 18:09:26 UTC (rev 115091)
+++ Zope/trunk/src/Products/ZCatalog/Catalog.py	2010-07-25 22:03:14 UTC (rev 115092)
@@ -22,6 +22,7 @@
 import ExtensionClass
 from Missing import MV
 from Persistence import Persistent
+from Products.PluginIndexes.interfaces import ILimitedResultIndex
 
 import BTrees.Length
 from BTrees.IIBTree import intersection, weightedIntersection, IISet
@@ -475,6 +476,10 @@
                     query[iid] = value
         return query
 
+    def _sorted_search_indexes(self, query):
+        # Simple implementation doing no ordering.
+        return self.indexes.keys()
+
     def search(self, query, sort_index=None, reverse=0, limit=None, merge=1):
         """Iterate through the indexes, applying the query to each one. If
         merge is true then return a lazy result set (sorted if appropriate)
@@ -497,25 +502,44 @@
 
         # Canonicalize the request into a sensible query before passing it on
         query = self.make_query(query)
+        query_keys = query.keys()
 
         cr = self.getCatalogReport(query)
         cr.start()
 
-        for i in self.indexes.keys():
+        for i in self._sorted_search_indexes(query):
+            if i not in query_keys:
+                # Do not ask indexes to restrict the result, which aren't
+                # part of the query
+                continue
+
             index = self.getIndex(i)
             _apply_index = getattr(index, "_apply_index", None)
             if _apply_index is None:
                 continue
 
+            limit_result = False
+            if ILimitedResultIndex.providedBy(index):
+                limit_result = True
+
             cr.split(i)
-            r = _apply_index(query)
+            if limit_result:
+                r = _apply_index(query, rs)
+            else:
+                r = _apply_index(query)
             cr.split(i)
 
             if r is not None:
                 r, u = r
+                # Short circuit if empty result
+                # BBB: We can remove the "r is not None" check in Zope 2.14
+                # once we don't need to support the "return everything" case
+                # anymore
+                if r is not None and not r:
+                    return LazyCat([])
                 w, rs = weightedIntersection(rs, r)
                 if not rs:
-                   break
+                    break
 
         cr.stop()
 



More information about the Zope-Checkins mailing list