filtering of lucene searches by distributions basically implemented, needs to be...

[cdmlib.git] / cdmlib-services / src / main / java / eu / etaxonomy / cdm / api / service / search / LuceneSearch.java
diff --git a/cdmlib-services/src/main/java/eu/etaxonomy/cdm/api/service/search/LuceneSearch.java b/cdmlib-services/src/main/java/eu/etaxonomy/cdm/api/service/search/LuceneSearch.java

index 6a93c0086bbee28e22cbb05b5973c2ca087cf755..a083460972d7578e509fd3b46d23bed527fffd92 100644 (file)
--- a/cdmlib-services/src/main/java/eu/etaxonomy/cdm/api/service/search/LuceneSearch.java
+++ b/cdmlib-services/src/main/java/eu/etaxonomy/cdm/api/service/search/LuceneSearch.java
@@ -10,6 +10,7 @@
  package eu.etaxonomy.cdm.api.service.search;
  
  import java.io.IOException;
+import java.util.Collection;
  
  import org.apache.log4j.Logger;
  import org.apache.lucene.analysis.Analyzer;
@@ -19,25 +20,31 @@ import org.apache.lucene.queryParser.ParseException;
  import org.apache.lucene.queryParser.QueryParser;
  import org.apache.lucene.search.BooleanClause;
  import org.apache.lucene.search.BooleanQuery;
-import org.apache.lucene.search.Hits;
+import org.apache.lucene.search.Filter;
  import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.MultiCollector;
  import org.apache.lucene.search.Query;
-import org.apache.lucene.search.ScoreDoc;
-import org.apache.lucene.search.Searcher;
  import org.apache.lucene.search.Sort;
  import org.apache.lucene.search.SortField;
  import org.apache.lucene.search.TermQuery;
  import org.apache.lucene.search.TopDocs;
+import org.apache.lucene.search.grouping.GroupDocs;
+import org.apache.lucene.search.grouping.SearchGroup;
+import org.apache.lucene.search.grouping.TermAllGroupsCollector;
+import org.apache.lucene.search.grouping.TermFirstPassGroupingCollector;
+import org.apache.lucene.search.grouping.TermSecondPassGroupingCollector;
+import org.apache.lucene.search.grouping.TopGroups;
  import org.hibernate.Session;
+import org.hibernate.search.ProjectionConstants;
  import org.hibernate.search.Search;
  import org.hibernate.search.SearchFactory;
-import org.hibernate.search.engine.DocumentBuilder;
-import org.hibernate.search.reader.ReaderProvider;
-import org.hibernate.search.store.DirectoryProvider;
  
+import eu.etaxonomy.cdm.config.Configuration;
  import eu.etaxonomy.cdm.model.common.CdmBase;
  import eu.etaxonomy.cdm.model.description.DescriptionElementBase;
  import eu.etaxonomy.cdm.model.description.TextData;
+import eu.etaxonomy.cdm.model.name.NonViralName;
+import eu.etaxonomy.cdm.model.name.TaxonNameBase;
  import eu.etaxonomy.cdm.model.taxon.Taxon;
  import eu.etaxonomy.cdm.model.taxon.TaxonBase;
  
@@ -49,16 +56,22 @@ import eu.etaxonomy.cdm.model.taxon.TaxonBase;
   */
  public class LuceneSearch {
  
+    protected String groupByField = "id";
+
+    public final static String ID_FIELD = "id";
+
      public static final Logger logger = Logger.getLogger(LuceneSearch.class);
  
      protected Session session;
  
-    protected Searcher searcher;
+    protected IndexSearcher searcher;
  
-    private SortField[] sortFields;
+    protected SortField[] sortFields;
  
      private Class<? extends CdmBase> directorySelectClass;
  
+    private Filter filter = null;
+
      protected Class<? extends CdmBase> getDirectorySelectClass() {
          return pushAbstractBaseTypeDown(directorySelectClass);
      }
@@ -66,13 +79,27 @@ public class LuceneSearch {
      /**
       * classFilter
       */
-    private Class<? extends CdmBase> clazz;
+    protected Class<? extends CdmBase> clazz;
  
  
      public Class<? extends CdmBase> getClazz() {
          return clazz;
      }
  
+    /**
+     * Returns the Lucene filter which is handed to the searcher together with
+     * the query when a search is executed.
+     *
+     * @return the filter, <code>null</code> when no filter has been set
+     */
+    public Filter getFilter() {
+        return filter;
+    }
+
+    /**
+     * Sets the Lucene filter which is handed to the searcher together with
+     * the query when a search is executed.
+     *
+     * @param filter the filter to set, may be <code>null</code>
+     */
+    public void setFilter(Filter filter) {
+        this.filter = filter;
+    }
+
      /**
       * Sets the Class to use as filter criterion, in case the supplied Class equals the
       * <code>directorySelectClass</code> the Class is set to <code>null</code>
@@ -103,6 +130,16 @@ public class LuceneSearch {
  
      protected String[] highlightFields = new String[0];
  
+    /**
+     * Number of documents requested per group from the second pass grouping
+     * collector in {@link #executeSearch(Integer, Integer)}.
+     */
+    private int maxDocsPerGroup = 10;
+
+
+    /**
+     * @return the maximum number of documents collected per group, 10 unless changed
+     */
+    public int getMaxDocsPerGroup() {
+        return maxDocsPerGroup;
+    }
+
+    /**
+     * @param maxDocsPerGroup the maximum number of documents to collect per group
+     */
+    public void setMaxDocsPerGroup(int maxDocsPerGroup) {
+        this.maxDocsPerGroup = maxDocsPerGroup;
+    }
  
      /**
       * @param session
@@ -112,13 +149,20 @@ public class LuceneSearch {
           this.directorySelectClass = directorySelectClass;
      }
  
+    /**
+     * @param session the hibernate session from which the full text session is obtained
+     * @param groupByField name of the index field by which the search results are grouped,
+     *  replaces the default <code>"id"</code>
+     * @param directorySelectClass the type which selects the index to open the reader for
+     */
+    public LuceneSearch(Session session, String groupByField, Class<? extends CdmBase> directorySelectClass) {
+         this.session = session;
+         this.directorySelectClass = directorySelectClass;
+         this.groupByField = groupByField;
+    }
+
      /**
       * TODO the abstract base class DescriptionElementBase can not be used, so
       * we are using an arbitraty subclass to find the DirectoryProvider, future
       * versions of hibernate search my allow using abstract base classes see
-     * http
-     * ://stackoverflow.com/questions/492184/how-do-you-find-all-subclasses-of
-     * -a-given-class-in-java
+     * <a href="http://stackoverflow.com/questions/492184/how-do-you-find-all-subclasses-of-a-given-class-in-java">how do you find all subclasses of a given class in java</a>
       *
       * @param type must not be null
       * @return
@@ -130,6 +174,9 @@ public class LuceneSearch {
          if (type.equals(TaxonBase.class)) {
              type = Taxon.class;
          }
+        if (type.equals(TaxonNameBase.class)) {
+            type = NonViralName.class;
+        }
          return type;
      }
  
@@ -140,9 +187,10 @@ public class LuceneSearch {
      /**
       * @return
       */
-    public Searcher getSearcher() {
+    public IndexSearcher getSearcher() {
          if(searcher == null){
              searcher = new IndexSearcher(getIndexReader());
+            searcher.setDefaultFieldSortScoring(true, true);
          }
          return searcher;
      }
@@ -152,12 +200,16 @@ public class LuceneSearch {
       */
      public IndexReader getIndexReader() {
          SearchFactory searchFactory = Search.getFullTextSession(session).getSearchFactory();
+        IndexReader reader = searchFactory.getIndexReaderAccessor().open(getDirectorySelectClass());
+        return reader;
+    }
  
-        DirectoryProvider[] directoryProviders = searchFactory.getDirectoryProviders(getDirectorySelectClass());
-        logger.info(directoryProviders[0].getDirectory().toString());
-
-        ReaderProvider readerProvider = searchFactory.getReaderProvider();
-        IndexReader reader = readerProvider.openReader(directoryProviders[0]);
+    /**
+     * @return
+     */
+    public IndexReader getIndexReaderFor(Class<? extends CdmBase> clazz) {
+        SearchFactory searchFactory = Search.getFullTextSession(session).getSearchFactory();
+        IndexReader reader = searchFactory.getIndexReaderAccessor().open(pushAbstractBaseTypeDown(clazz));
          return reader;
      }
  
@@ -166,7 +218,7 @@ public class LuceneSearch {
       */
      public QueryParser getQueryParser() {
          Analyzer analyzer = getAnalyzer();
-        QueryParser parser = new QueryParser("titleCache", analyzer);
+        QueryParser parser = new QueryParser(Configuration.luceneVersion,  "titleCache", analyzer);
          return parser;
      }
  
@@ -188,7 +240,7 @@ public class LuceneSearch {
       * @throws ParseException
       * @throws IOException
       */
-    public TopDocs executeSearch(String luceneQueryString, Integer pageSize, Integer pageNumber) throws ParseException, IOException {
+    public TopGroupsWithMaxScore executeSearch(String luceneQueryString, Integer pageSize, Integer pageNumber) throws ParseException, IOException {
  
          Query luceneQuery = parse(luceneQueryString);
          this.query = luceneQuery;
@@ -208,15 +260,24 @@ public class LuceneSearch {
      }
  
      /**
-     * @param luceneQuery
-     * @param clazz the type as additional filter criterion
+     * @param maxNoOfHits
+     * @return
+     * @throws IOException
+     */
+    public TopDocs executeSearch(int maxNoOfHits) throws IOException {
+        // plain search without grouping, in contrast to executeSearch(Integer, Integer)
+        Query fullQuery = expandQuery();
+        logger.info("lucene query string to be parsed: " + fullQuery.toString());
+        // the filter stays null unless it has been set via setFilter(Filter)
+        return getSearcher().search(fullQuery, filter, maxNoOfHits);
+
+    }
+    /**
       * @param pageSize if the page size is null or in an invalid range it will be set to MAX_HITS_ALLOWED
       * @param pageNumber a 0-based index of the page to return, will default to 0 if null or negative.
       * @return
       * @throws ParseException
       * @throws IOException
       */
-    public TopDocs executeSearch(Integer pageSize, Integer pageNumber) throws ParseException, IOException {
+    public TopGroupsWithMaxScore executeSearch(Integer pageSize, Integer pageNumber) throws ParseException, IOException {
  
  
          if(pageNumber == null || pageNumber < 0){
@@ -228,44 +289,59 @@ public class LuceneSearch {
          }
  
          Query fullQuery = expandQuery();
-
          logger.info("final query: " + fullQuery.toString());
  
-        int start = pageNumber * pageSize;
+        int offset = pageNumber * pageSize;
          int limit = (pageNumber + 1) * pageSize - 1 ;
+        logger.debug("start: " + offset + "; limit:" + limit);
  
-        logger.debug("start: " + start + "; limit:" + limit);
-
-        TopDocs topDocs;
+        // sorting
+        Sort groupSort = null;
+        Sort withinGroupSort = Sort.RELEVANCE;
          if(sortFields != null && sortFields.length > 0){
-            Sort sort = new Sort(sortFields);
-            topDocs = getSearcher().search(fullQuery, null, limit, sort);
+            if(sortFields[0] != SortField.FIELD_SCORE){
+                throw new RuntimeException("Fist sort field must be SortField.FIELD_SCORE");
+            }
+            groupSort = new Sort(sortFields);
          } else {
-            topDocs = getSearcher().search(fullQuery, null, limit);
+            groupSort = Sort.RELEVANCE; // == SortField.FIELD_SCORE !!
          }
  
+        // perform the search (needs two passes for grouping)
+        if(logger.isDebugEnabled()){
+            logger.debug("Grouping: sortFields=" + sortFields + ", groupByField=" + groupByField +
+                    ", groupSort=" + groupSort + ", withinGroupSort=" + withinGroupSort + ", limit=" + limit + ", maxDocsPerGroup="+ maxDocsPerGroup);
+        }
+        // - first pass
+        TermFirstPassGroupingCollector firstPassCollector = new TermFirstPassGroupingCollector(groupByField, withinGroupSort, limit);
  
-        //TODO when switched to Lucene 3.x which is included in hibernate 4.x
-        //     use TopDocCollector.topDocs(int start, int howMany);
-        //     since this method might be more memory save than our own implementation
-        //
-        //     ALSO READ http://dev.e-taxonomy.eu/trac/ticket/3118 !!!
-        //
-//        TopDocs topDocs = hitCollector.topDocs();
-        ScoreDoc[] scoreDocs = topDocs.scoreDocs;
-
-        int docsAvailableInPage = Math.min(scoreDocs.length - start, pageSize);
-        logger.debug("docsAvailableInPage:" + docsAvailableInPage);
+        getSearcher().search(fullQuery, filter , firstPassCollector);
+        Collection<SearchGroup<String>> topGroups = firstPassCollector.getTopGroups(0, true); // no offset here since we need the first item for the max score
  
-        ScoreDoc[] pagedDocs = new ScoreDoc[docsAvailableInPage];
-        for(int i = 0; i < docsAvailableInPage; i++){
-            pagedDocs[i] = scoreDocs[start + i];
+        if (topGroups == null) {
+              return null;
+        }
+        // - second pass
+        boolean getScores = true;
+        boolean getMaxScores = true;
+        boolean fillFields = true;
+        TermAllGroupsCollector allGroupsCollector = new TermAllGroupsCollector(groupByField);
+        TermSecondPassGroupingCollector secondPassCollector = new TermSecondPassGroupingCollector(
+                groupByField, topGroups, groupSort, withinGroupSort, maxDocsPerGroup , getScores, getMaxScores, fillFields
+                );
+        getSearcher().search(fullQuery, filter, MultiCollector.wrap(secondPassCollector, allGroupsCollector));
+
+        TopGroups<String> groupsResult = secondPassCollector.getTopGroups(0); // no offset here since we need the first item for the max score
+
+        // get max score from very first result
+        float maxScore = groupsResult.groups[0].maxScore;
+        if(logger.isDebugEnabled()){
+            logger.debug("TopGroups: maxScore=" + maxScore + ", offset=" + offset +
+                    ", totalGroupCount=" + allGroupsCollector.getGroupCount() + ", totalGroupedHitCount=" + groupsResult.totalGroupedHitCount);
          }
-        TopDocs pagedTopDocs = new TopDocs(topDocs.totalHits, pagedDocs, topDocs.getMaxScore());
-        //
-        /////////////////////////////////////////////
+        TopGroupsWithMaxScore topGroupsWithMaxScore = new TopGroupsWithMaxScore(groupsResult, offset, allGroupsCollector.getGroupCount(), maxScore);
  
-        return pagedTopDocs;
+        return topGroupsWithMaxScore;
      }
  
      /**
@@ -277,7 +353,7 @@ public class LuceneSearch {
              BooleanQuery filteredQuery = new BooleanQuery();
              BooleanQuery classFilter = new BooleanQuery();
  
-            Term t = new Term(DocumentBuilder.CLASS_FIELDNAME, clazz.getName());
+            Term t = new Term(ProjectionConstants.OBJECT_CLASS, clazz.getName());
              TermQuery termQuery = new TermQuery(t);
  
              classFilter.setBoost(0);
@@ -316,11 +392,43 @@ public class LuceneSearch {
  
      public void setHighlightFields(String[] textFieldNamesAsArray) {
          this.highlightFields = textFieldNamesAsArray;
-
      }
  
      public String[] getHighlightFields() {
          return this.highlightFields;
      }
  
+    /**
+     * Pairs the page of groups that was requested with the maximum score of the
+     * complete result. May become obsolete with lucene 4.x when the TopGroups
+     * has a field for maxScore.
+     *
+     * @author a.kohlbecker
+     * @date Oct 4, 2012
+     *
+     */
+    public class TopGroupsWithMaxScore{
+        /** the groups remaining after the offset has been applied */
+        public TopGroups<String> topGroups;
+        /** the max score as passed to the constructor */
+        public float maxScore = Float.NaN;
+
+        /**
+         * @param topGroups the groups found by the search, starting with the very first group
+         * @param offset number of leading groups to omit; if it is equal to or greater
+         *  than the number of groups the resulting page is empty
+         * @param totalGroupCount the overall number of groups, stored in the resulting TopGroups
+         * @param maxScore the maximum score of the search result
+         */
+        TopGroupsWithMaxScore(TopGroups<String> topGroups, int offset, int totalGroupCount, float maxScore){
+            this.maxScore = maxScore;
+            TopGroups<String> newTopGroups = topGroups;
+            if(offset > 0){
+                // a page beyond the last group must yield an empty page; without the
+                // clamp the array size below would become negative
+                int skip = Math.min(offset, topGroups.groups.length);
+                @SuppressWarnings("unchecked") // arrays of a generic type can only be created raw
+                GroupDocs<String>[] newGroupDocs = new GroupDocs[topGroups.groups.length - skip];
+                System.arraycopy(topGroups.groups, skip, newGroupDocs, 0, newGroupDocs.length);
+                newTopGroups = new TopGroups<String>(
+                            topGroups.groupSort,
+                            topGroups.withinGroupSort,
+                            topGroups.totalHitCount,
+                            topGroups.totalGroupedHitCount,
+                            newGroupDocs);
+            }
+            this.topGroups = new TopGroups<String>(newTopGroups, totalGroupCount);
+        }
+
+    }
+
  }