X-Git-Url: https://dev.e-taxonomy.eu/gitweb/cdmlib.git/blobdiff_plain/fd93c8b355df3320be4afacdb9460f370d36e149..f6b282fc9660b250f81879ec6b255517f13ec229:/cdmlib-services/src/main/java/eu/etaxonomy/cdm/api/service/search/LuceneSearch.java

diff --git a/cdmlib-services/src/main/java/eu/etaxonomy/cdm/api/service/search/LuceneSearch.java b/cdmlib-services/src/main/java/eu/etaxonomy/cdm/api/service/search/LuceneSearch.java
index 6a93c0086b..57d338a00b 100644
--- a/cdmlib-services/src/main/java/eu/etaxonomy/cdm/api/service/search/LuceneSearch.java
+++ b/cdmlib-services/src/main/java/eu/etaxonomy/cdm/api/service/search/LuceneSearch.java
@@ -10,34 +10,35 @@
 package eu.etaxonomy.cdm.api.service.search;
 
 import java.io.IOException;
+import java.util.Arrays;
+import java.util.Collection;
 
 import org.apache.log4j.Logger;
 import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.index.Term;
-import org.apache.lucene.queryParser.ParseException;
-import org.apache.lucene.queryParser.QueryParser;
-import org.apache.lucene.search.BooleanClause;
+import org.apache.lucene.queryparser.classic.ParseException;
+import org.apache.lucene.search.BooleanClause.Occur;
 import org.apache.lucene.search.BooleanQuery;
-import org.apache.lucene.search.Hits;
+import org.apache.lucene.search.BooleanQuery.Builder;
+import org.apache.lucene.search.Filter;
 import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.MultiCollector;
 import org.apache.lucene.search.Query;
-import org.apache.lucene.search.ScoreDoc;
-import org.apache.lucene.search.Searcher;
 import org.apache.lucene.search.Sort;
 import org.apache.lucene.search.SortField;
-import org.apache.lucene.search.TermQuery;
 import org.apache.lucene.search.TopDocs;
-import org.hibernate.Session;
-import org.hibernate.search.Search;
-import org.hibernate.search.SearchFactory;
-import org.hibernate.search.engine.DocumentBuilder;
-import org.hibernate.search.reader.ReaderProvider;
-import org.hibernate.search.store.DirectoryProvider;
+import org.apache.lucene.search.grouping.GroupDocs;
+import org.apache.lucene.search.grouping.SearchGroup;
+import org.apache.lucene.search.grouping.TopGroups;
+import org.apache.lucene.search.grouping.term.TermAllGroupsCollector;
+import org.apache.lucene.search.grouping.term.TermFirstPassGroupingCollector;
+import org.apache.lucene.search.grouping.term.TermSecondPassGroupingCollector;
+import org.apache.lucene.util.BytesRef;
 
 import eu.etaxonomy.cdm.model.common.CdmBase;
 import eu.etaxonomy.cdm.model.description.DescriptionElementBase;
 import eu.etaxonomy.cdm.model.description.TextData;
+import eu.etaxonomy.cdm.model.name.NonViralName;
+import eu.etaxonomy.cdm.model.name.TaxonNameBase;
 import eu.etaxonomy.cdm.model.taxon.Taxon;
 import eu.etaxonomy.cdm.model.taxon.TaxonBase;
 
@@ -49,16 +50,22 @@ import eu.etaxonomy.cdm.model.taxon.TaxonBase;
  */
 public class LuceneSearch {
 
+    protected String groupByField = "id";
+
+    public final static String ID_FIELD = "id";
+
     public static final Logger logger = Logger.getLogger(LuceneSearch.class);
 
-    protected Session session;
+    protected ILuceneIndexToolProvider toolProvider;
 
-    protected Searcher searcher;
+    protected IndexSearcher searcher;
 
-    private SortField[] sortFields;
+    protected SortField[] sortFields;
 
     private Class<? extends CdmBase> directorySelectClass;
 
+    private Filter filter = null;
+
     protected Class<? extends CdmBase> getDirectorySelectClass() {
         return pushAbstractBaseTypeDown(directorySelectClass);
     }
@@ -66,11 +73,25 @@ public class LuceneSearch {
     /**
      * classFilter
      */
-    private Class<? extends CdmBase> clazz;
+    protected Class<? extends CdmBase> cdmTypeRestriction;
 
 
-    public Class<? extends CdmBase> getClazz() {
-        return clazz;
+    public Class<? extends CdmBase> getCdmTypRestriction() {
+        return cdmTypeRestriction;
+    }
+
+    /**
+     * @return the filter
+     */
+    public Filter getFilter() {
+        return filter;
+    }
+
+    /**
+     * @param filter the filter to set
+     */
+    public void setFilter(Filter filter) {
+        this.filter = filter;
     }
 
     /**
@@ -78,7 +99,7 @@ public class LuceneSearch {
      * <code>directorySelectClass</code> the Class is set to <code>null</code>
      * @param clazz
      */
-    public void setClazz(Class<? extends CdmBase> clazz) {
+    public void setCdmTypRestriction(Class<? extends CdmBase> clazz) {
 
         /*
          * NOTE:
@@ -88,7 +109,7 @@ public class LuceneSearch {
         if(clazz != null && clazz.equals(directorySelectClass)){
             clazz = null;
         }
-        this.clazz = clazz;
+        this.cdmTypeRestriction = clazz;
     }
 
     /**
@@ -99,38 +120,59 @@ public class LuceneSearch {
      */
     public final int MAX_HITS_ALLOWED = 10000;
 
-    protected Query query;
+    protected BooleanQuery query;
 
     protected String[] highlightFields = new String[0];
 
+    private int maxDocsPerGroup = 10;
+
+
+    public int getMaxDocsPerGroup() {
+        return maxDocsPerGroup;
+    }
+
+    public void setMaxDocsPerGroup(int maxDocsPerGroup) {
+        this.maxDocsPerGroup = maxDocsPerGroup;
+    }
 
     /**
      * @param session
      */
-    public LuceneSearch(Session session, Class<? extends CdmBase> directorySelectClass) {
-         this.session = session;
+    public LuceneSearch(ILuceneIndexToolProvider toolProvider, Class<? extends CdmBase> directorySelectClass) {
+         this.toolProvider = toolProvider;
          this.directorySelectClass = directorySelectClass;
     }
 
+    /**
+     * @param toolProvider supplies the index reader, analyzer and query parser used by this search
+     */
+    public LuceneSearch(ILuceneIndexToolProvider toolProvider, String groupByField, Class<? extends CdmBase> directorySelectClass) {
+        this.toolProvider = toolProvider;
+        this.directorySelectClass = directorySelectClass;
+        this.groupByField = groupByField;
+    }
+
     /**
      * TODO the abstract base class DescriptionElementBase can not be used, so
-     * we are using an arbitraty subclass to find the DirectoryProvider, future
+     * we are using an arbitrary subclass to find the DirectoryProvider, future
      * versions of hibernate search my allow using abstract base classes see
-     * http
-     * ://stackoverflow.com/questions/492184/how-do-you-find-all-subclasses-of
-     * -a-given-class-in-java
+     * <a href="http://stackoverflow.com/questions/492184/how-do-you-find-all-subclasses-of-a-given-class-in-java">this Stack Overflow question</a>
      *
      * @param type must not be null
      * @return
      */
-    protected Class<? extends CdmBase> pushAbstractBaseTypeDown(Class<? extends CdmBase> type) {
+    private Class<? extends CdmBase> pushAbstractBaseTypeDown(Class<? extends CdmBase> type) {
+        Class<? extends CdmBase> returnType = type;
         if (type.equals(DescriptionElementBase.class)) {
-            type = TextData.class;
+            returnType = TextData.class;
         }
         if (type.equals(TaxonBase.class)) {
-            type = Taxon.class;
+            returnType = Taxon.class;
         }
-        return type;
+        if (type.equals(TaxonNameBase.class)) {
+            returnType = NonViralName.class;
+        }
+        return returnType;
     }
 
     protected LuceneSearch() {
@@ -140,59 +182,38 @@ public class LuceneSearch {
     /**
      * @return
      */
-    public Searcher getSearcher() {
+    public IndexSearcher getSearcher() {
         if(searcher == null){
-            searcher = new IndexSearcher(getIndexReader());
+            searcher = new IndexSearcher(toolProvider.getIndexReaderFor(directorySelectClass));
+//            searcher.setDefaultFieldSortScoring(true, true);
         }
         return searcher;
     }
 
     /**
-     * @return
-     */
-    public IndexReader getIndexReader() {
-        SearchFactory searchFactory = Search.getFullTextSession(session).getSearchFactory();
-
-        DirectoryProvider[] directoryProviders = searchFactory.getDirectoryProviders(getDirectorySelectClass());
-        logger.info(directoryProviders[0].getDirectory().toString());
-
-        ReaderProvider readerProvider = searchFactory.getReaderProvider();
-        IndexReader reader = readerProvider.openReader(directoryProviders[0]);
-        return reader;
-    }
-
-    /**
-     * @return
-     */
-    public QueryParser getQueryParser() {
-        Analyzer analyzer = getAnalyzer();
-        QueryParser parser = new QueryParser("titleCache", analyzer);
-        return parser;
-    }
-
-    /**
-     * @return
+     * Convenience method which delegates the call to the available
+     * {@link ILuceneIndexToolProvider#getAnalyzerFor(Class)} method.
+     *
+     * @return the Analyzer suitable for the <code>directorySelectClass</code>
+     * of the LuceneSearch
      */
     public Analyzer getAnalyzer() {
-        SearchFactory searchFactory = Search.getFullTextSession(session).getSearchFactory();
-        Analyzer analyzer = searchFactory.getAnalyzer(getDirectorySelectClass());
-        return analyzer;
+        return toolProvider.getAnalyzerFor(directorySelectClass);
     }
 
     /**
      * @param luceneQueryString
-     * @param clazz the type as additional filter criterion
+     *            the Lucene query string to parse; a set <code>cdmTypeRestriction</code> is applied as additional filter criterion
      * @param pageSize if the page size is null or in an invalid range it will be set to MAX_HITS_ALLOWED
      * @param pageNumber a 0-based index of the page to return, will default to 0 if null or negative.
      * @return
      * @throws ParseException
      * @throws IOException
      */
-    public TopDocs executeSearch(String luceneQueryString, Integer pageSize, Integer pageNumber) throws ParseException, IOException {
+    public TopGroupsWithMaxScore executeSearch(String luceneQueryString, Integer pageSize, Integer pageNumber) throws ParseException, IOException {
 
         Query luceneQuery = parse(luceneQueryString);
-        this.query = luceneQuery;
-
+        setQuery(luceneQuery);
         return executeSearch(pageSize, pageNumber);
     }
 
@@ -203,20 +224,29 @@ public class LuceneSearch {
      */
     public Query parse(String luceneQueryString) throws ParseException {
         logger.debug("luceneQueryString to be parsed: " + luceneQueryString);
-        Query luceneQuery = getQueryParser().parse(luceneQueryString);
+        Query luceneQuery = toolProvider.getQueryParserFor(directorySelectClass).parse(luceneQueryString);
         return luceneQuery;
     }
 
     /**
-     * @param luceneQuery
-     * @param clazz the type as additional filter criterion
+     * @param maxNoOfHits
+     * @return
+     * @throws IOException
+     */
+    public TopDocs executeSearch(int maxNoOfHits) throws IOException {
+        Query fullQuery = expandQuery();
+        logger.info("lucene query string to be parsed: " + fullQuery.toString());
+        return getSearcher().search(fullQuery, filter, maxNoOfHits);
+
+    }
+    /**
      * @param pageSize if the page size is null or in an invalid range it will be set to MAX_HITS_ALLOWED
      * @param pageNumber a 0-based index of the page to return, will default to 0 if null or negative.
      * @return
      * @throws ParseException
      * @throws IOException
      */
-    public TopDocs executeSearch(Integer pageSize, Integer pageNumber) throws ParseException, IOException {
+    public TopGroupsWithMaxScore executeSearch(Integer pageSize, Integer pageNumber) throws ParseException, IOException {
 
 
         if(pageNumber == null || pageNumber < 0){
@@ -228,65 +258,75 @@ public class LuceneSearch {
         }
 
         Query fullQuery = expandQuery();
-
         logger.info("final query: " + fullQuery.toString());
 
-        int start = pageNumber * pageSize;
+        int offset = pageNumber * pageSize;
         int limit = (pageNumber + 1) * pageSize - 1 ;
+        logger.debug("start: " + offset + "; limit:" + limit);
 
-        logger.debug("start: " + start + "; limit:" + limit);
-
-        TopDocs topDocs;
+        // sorting
+        Sort groupSort = null;
+        Sort withinGroupSort = Sort.RELEVANCE;
         if(sortFields != null && sortFields.length > 0){
-            Sort sort = new Sort(sortFields);
-            topDocs = getSearcher().search(fullQuery, null, limit, sort);
+            groupSort = new Sort(sortFields);
         } else {
-            topDocs = getSearcher().search(fullQuery, null, limit);
+            groupSort = Sort.RELEVANCE; // == SortField.FIELD_SCORE !!
         }
 
+        // perform the search (needs two passes for grouping)
+        if(logger.isDebugEnabled()){
+            logger.debug("Grouping: sortFields=" + Arrays.toString(sortFields) + ", groupByField=" + groupByField +
+                    ", groupSort=" + groupSort + ", withinGroupSort=" + withinGroupSort + ", limit=" + limit + ", maxDocsPerGroup="+ maxDocsPerGroup);
+        }
+        // - first pass
+        TermFirstPassGroupingCollector firstPassCollector = new TermFirstPassGroupingCollector(
+                groupByField, groupSort, limit);
 
-        //TODO when switched to Lucene 3.x which is included in hibernate 4.x
-        //     use TopDocCollector.topDocs(int start, int howMany);
-        //     since this method might be more memory save than our own implementation
-        //
-        //     ALSO READ http://dev.e-taxonomy.eu/trac/ticket/3118 !!!
-        //
-//        TopDocs topDocs = hitCollector.topDocs();
-        ScoreDoc[] scoreDocs = topDocs.scoreDocs;
-
-        int docsAvailableInPage = Math.min(scoreDocs.length - start, pageSize);
-        logger.debug("docsAvailableInPage:" + docsAvailableInPage);
+        getSearcher().search(fullQuery, filter , firstPassCollector);
+        Collection<SearchGroup<BytesRef>> topGroups = firstPassCollector.getTopGroups(0, true); // no offset here since we need the first item for the max score
 
-        ScoreDoc[] pagedDocs = new ScoreDoc[docsAvailableInPage];
-        for(int i = 0; i < docsAvailableInPage; i++){
-            pagedDocs[i] = scoreDocs[start + i];
+        if (topGroups == null) {
+              return null;
+        }
+        // - flags for second pass
+        boolean getScores = false;
+        boolean getMaxScores = true;
+        if(groupSort.getSort()[0] != SortField.FIELD_SCORE){
+            getMaxScores = false;
+            // see inner class TopGroupsWithMaxScore
+            logger.error("Fist sort field must be SortField.FIELD_SCORE otherwise the max score value will not be correct! MaxScore calculation will be skipped");
+        }
+        boolean fillFields = true;
+        TermAllGroupsCollector allGroupsCollector = new TermAllGroupsCollector(groupByField);
+        TermSecondPassGroupingCollector secondPassCollector = new TermSecondPassGroupingCollector(
+                groupByField, topGroups, groupSort, withinGroupSort, maxDocsPerGroup , getScores,
+                getMaxScores, fillFields
+                );
+        getSearcher().search(fullQuery, filter, MultiCollector.wrap(secondPassCollector, allGroupsCollector));
+
+        TopGroups<BytesRef> groupsResult = secondPassCollector.getTopGroups(0); // no offset here since we need the first item for the max score
+
+        // get max score from very first result
+        float maxScore = groupsResult.groups[0].maxScore;
+        if(logger.isDebugEnabled()){
+            logger.debug("TopGroups: maxScore=" + maxScore + ", offset=" + offset +
+                    ", totalGroupCount=" + allGroupsCollector.getGroupCount() +
+                    ", totalGroupedHitCount=" + groupsResult.totalGroupedHitCount);
         }
-        TopDocs pagedTopDocs = new TopDocs(topDocs.totalHits, pagedDocs, topDocs.getMaxScore());
-        //
-        /////////////////////////////////////////////
+        TopGroupsWithMaxScore topGroupsWithMaxScore = new TopGroupsWithMaxScore(groupsResult,
+                offset, allGroupsCollector.getGroupCount(), maxScore);
 
-        return pagedTopDocs;
+        return topGroupsWithMaxScore;
     }
 
     /**
-     * @param clazz
+     * Expands the query by adding a type restriction if the
+     * <code>cdmTypeRestriction</code> is not <code>null</code>.
      */
     protected Query expandQuery() {
-        Query fullQuery;
-        if(clazz != null){
-            BooleanQuery filteredQuery = new BooleanQuery();
-            BooleanQuery classFilter = new BooleanQuery();
-
-            Term t = new Term(DocumentBuilder.CLASS_FIELDNAME, clazz.getName());
-            TermQuery termQuery = new TermQuery(t);
-
-            classFilter.setBoost(0);
-            classFilter.add(termQuery, BooleanClause.Occur.SHOULD);
-
-            filteredQuery.add(this.query, BooleanClause.Occur.MUST);
-            filteredQuery.add(classFilter, BooleanClause.Occur.MUST);
-
-            fullQuery = filteredQuery;
+        BooleanQuery fullQuery;
+        if(cdmTypeRestriction != null){
+            fullQuery = QueryFactory.addTypeRestriction(query, cdmTypeRestriction);
         } else {
             fullQuery = this.query;
         }
@@ -294,10 +334,15 @@ public class LuceneSearch {
     }
 
     public void setQuery(Query query) {
-        this.query = query;
+        if( query instanceof BooleanQuery) {
+            this.query = (BooleanQuery)query;
+        } else {
+            Builder builder = new Builder();
+            this.query = builder.add(query, Occur.MUST).build();
+        }
     }
 
-    public Query getQuery() {
+    public BooleanQuery getQuery() {
         return query;
     }
 
@@ -316,11 +361,44 @@ public class LuceneSearch {
 
     public void setHighlightFields(String[] textFieldNamesAsArray) {
         this.highlightFields = textFieldNamesAsArray;
-
     }
 
     public String[] getHighlightFields() {
         return this.highlightFields;
     }
 
+    /**
+     * may become obsolete with lucene 4.x when the TopGroups has a field for maxScore.
+     *
+     * @author a.kohlbecker
+     * @date Oct 4, 2012
+     *
+     */
+    public class TopGroupsWithMaxScore{
+        public TopGroups<BytesRef> topGroups;
+        public float maxScore = Float.NaN;
+
+        TopGroupsWithMaxScore(TopGroups<BytesRef> topGroups, int offset, int totalGroupCount, float maxScore){
+            this.maxScore = maxScore;
+            TopGroups<BytesRef> newTopGroups;
+            if(offset > 0){
+                GroupDocs<BytesRef>[] newGroupDocs = new GroupDocs[topGroups.groups.length - offset];
+                for(int i = offset; i < topGroups.groups.length; i++){
+                    newGroupDocs[i - offset] = topGroups.groups[i];
+                }
+                newTopGroups = new TopGroups<BytesRef>(
+                            topGroups.groupSort,
+                            topGroups.withinGroupSort,
+                            topGroups.totalHitCount,
+                            topGroups.totalGroupedHitCount,
+                            newGroupDocs,
+                            maxScore);
+            } else {
+                newTopGroups = topGroups;
+            }
+            this.topGroups = new TopGroups<BytesRef>(newTopGroups, totalGroupCount);
+        }
+
+    }
+
 }