cdmlib-services/src/main/java/eu/etaxonomy/cdm/api/service/search/LuceneSearch.java

   1 /**
   2 * Copyright (C) 2011 EDIT
   3 * European Distributed Institute of Taxonomy
   4 * http://www.e-taxonomy.eu
   5 *
   6 * The contents of this file are subject to the Mozilla Public License Version 1.1
   7 * See LICENSE.TXT at the top of this package for the full license terms.
   8 */
   9 package eu.etaxonomy.cdm.api.service.search;
  10
  11 import java.io.IOException;
  12 import java.util.Arrays;
  13 import java.util.Collection;
  14
  15 import org.apache.logging.log4j.LogManager;
  16 import org.apache.logging.log4j.Logger;
  17 import org.apache.lucene.analysis.Analyzer;
  18 import org.apache.lucene.queryparser.classic.ParseException;
  19 import org.apache.lucene.search.BooleanClause.Occur;
  20 import org.apache.lucene.search.BooleanQuery;
  21 import org.apache.lucene.search.BooleanQuery.Builder;
  22 import org.apache.lucene.search.IndexSearcher;
  23 import org.apache.lucene.search.MultiCollector;
  24 import org.apache.lucene.search.Query;
  25 import org.apache.lucene.search.Sort;
  26 import org.apache.lucene.search.SortField;
  27 import org.apache.lucene.search.TopDocs;
  28 import org.apache.lucene.search.grouping.GroupDocs;
  29 import org.apache.lucene.search.grouping.SearchGroup;
  30 import org.apache.lucene.search.grouping.TopGroups;
  31 import org.apache.lucene.search.grouping.term.TermAllGroupsCollector;
  32 import org.apache.lucene.search.grouping.term.TermFirstPassGroupingCollector;
  33 import org.apache.lucene.search.grouping.term.TermSecondPassGroupingCollector;
  34 import org.apache.lucene.util.BytesRef;
  35
  36 import eu.etaxonomy.cdm.model.common.CdmBase;
  37 import eu.etaxonomy.cdm.model.description.DescriptionElementBase;
  38 import eu.etaxonomy.cdm.model.description.TextData;
  39 import eu.etaxonomy.cdm.model.taxon.Taxon;
  40 import eu.etaxonomy.cdm.model.taxon.TaxonBase;
  41
  42 /**
  43  * @author Andreas Kohlbecker
  44  * @since Dec 21, 2011
  45  */
  46 public class LuceneSearch {
  47
  48     private static final Logger logger = LogManager.getLogger();
  49
  50     public final static String ID_FIELD = "id";
  51
  52     /**
  53      * The MAX_HITS_ALLOWED value must be one less than Integer.MAX_VALUE
  54      * otherwise PriorityQueue will produce an exception since it
  55      * will always add 1 to the maxhits so Integer.MAX_VALUE
  56      * would become Integer.MIN_VALUE
  57      */
  58     public final int MAX_HITS_ALLOWED = 10000;
  59
  60
  61     protected String groupByField = "id";
  62
  63     protected ILuceneIndexToolProvider toolProvider;
  64
  65     protected IndexSearcher searcher;
  66
  67     protected SortField[] sortFields;
  68
  69     private Class<? extends CdmBase> directorySelectClass;
  70
  71     private BooleanQuery filter = null;
  72
  73     //class filter
  74     protected Class<? extends CdmBase> cdmTypeRestriction;
  75
  76     protected BooleanQuery query;
  77
  78     protected String[] highlightFields = new String[0];
  79
  80     private int maxDocsPerGroup = 10;
  81
  82     protected Class<? extends CdmBase> getDirectorySelectClass() {
  83         return pushAbstractBaseTypeDown(directorySelectClass);
  84     }
  85
  86     public Class<? extends CdmBase> getCdmTypRestriction() {
  87         return cdmTypeRestriction;
  88     }
  89
  90     //filter
  91     public BooleanQuery getFilter() {
  92         return filter;
  93     }
  94     public void setFilter(BooleanQuery filter) {
  95         this.filter = filter;
  96     }
  97
  98     /**
  99      * Sets the Class to use as filter criterion, in case the supplied Class equals the
 100      * <code>directorySelectClass</code> the Class is set to <code>null</code>
 101      * @param clazz
 102      */
 103     public void setCdmTypRestriction(Class<? extends CdmBase> clazz) {
 104
 105         /*
 106          * NOTE:
 107          * we must not use the getter of directorySelectClass
 108          * since we need the abstract base classes here!!!!
 109          */
 110         if(clazz != null && clazz.equals(directorySelectClass)){
 111             clazz = null;
 112         }
 113         this.cdmTypeRestriction = clazz;
 114     }
 115
 116     public int getMaxDocsPerGroup() {
 117         return maxDocsPerGroup;
 118     }
 119     public void setMaxDocsPerGroup(int maxDocsPerGroup) {
 120         this.maxDocsPerGroup = maxDocsPerGroup;
 121     }
 122
 123     public LuceneSearch(ILuceneIndexToolProvider toolProvider, Class<? extends CdmBase> directorySelectClass) {
 124          this.toolProvider = toolProvider;
 125          this.directorySelectClass = directorySelectClass;
 126     }
 127     public LuceneSearch(ILuceneIndexToolProvider toolProvider, String groupByField, Class<? extends CdmBase> directorySelectClass) {
 128         this.toolProvider = toolProvider;
 129         this.directorySelectClass = directorySelectClass;
 130         this.groupByField = groupByField;
 131     }
 132
 133     /**
 134      * TODO the abstract base class DescriptionElementBase can not be used, so
 135      * we are using an arbitrary subclass to find the DirectoryProvider, future
 136      * versions of hibernate search my allow using abstract base classes see
 137      * {@link http://stackoverflow.com/questions/492184/how-do-you-find-all-subclasses-of-a-given-class-in-java}
 138      *
 139      * @param type must not be null
 140      * @return
 141      */
 142     private Class<? extends CdmBase> pushAbstractBaseTypeDown(Class<? extends CdmBase> type) {
 143         Class<? extends CdmBase> returnType = type;
 144         if (type.equals(DescriptionElementBase.class)) {
 145             returnType = TextData.class;
 146         }
 147         if (type.equals(TaxonBase.class)) {
 148             returnType = Taxon.class;
 149         }
 150 //        if (type.equals(TaxonName.class)) {
 151 //            returnType = NonViralName.class;
 152 //        }
 153         return returnType;
 154     }
 155
 156     protected LuceneSearch() {
 157
 158     }
 159
 160     public IndexSearcher getSearcher() {
 161         if(searcher == null){
 162             searcher = new IndexSearcher(toolProvider.getIndexReaderFor(directorySelectClass));
 163 //            searcher.setDefaultFieldSortScoring(true, true);
 164         }
 165         return searcher;
 166     }
 167
 168     /**
 169      * Convenience method which delegated the call to the available
 170      * {@link ILuceneIndexToolProvider#getAnalyzerFor(Class)} method.
 171      *
 172      * @return the Analyzer suitable for the <code>directorySelectClass</code>
 173      * of the LuceneSearch
 174      */
 175     public Analyzer getAnalyzer() {
 176         return toolProvider.getAnalyzerFor(directorySelectClass);
 177     }
 178
 179     /**
 180      * @param luceneQueryString
 181      * @param cdmTypeRestriction the type as additional filter criterion
 182      * @param pageSize if the page size is null or in an invalid range it will be set to MAX_HITS_ALLOWED
 183      * @param pageNumber a 0-based index of the page to return, will default to 0 if null or negative.
 184      * @return
 185      * @throws ParseException
 186      * @throws IOException
 187      */
 188     public TopGroups<BytesRef> executeSearch(String luceneQueryString, Integer pageSize, Integer pageNumber) throws ParseException, IOException {
 189
 190         Query luceneQuery = parse(luceneQueryString);
 191         setQuery(luceneQuery);
 192         return executeSearch(pageSize, pageNumber);
 193     }
 194
 195     /**
 196      * @param luceneQueryString
 197      * @return
 198      * @throws ParseException
 199      */
 200     public Query parse(String luceneQueryString) throws ParseException {
 201         logger.debug("luceneQueryString to be parsed: " + luceneQueryString);
 202         Query luceneQuery = toolProvider.getQueryParserFor(directorySelectClass, false).parse(luceneQueryString);
 203         return luceneQuery;
 204     }
 205
 206     /**
 207      * @param maxNoOfHits
 208      * @return
 209      * @throws IOException
 210      */
 211     public TopDocs executeSearch(int maxNoOfHits) throws IOException {
 212         BooleanQuery fullQuery = expandQuery();
 213         logger.info("lucene query string to be parsed: " + fullQuery.toString());
 214         return getSearcher().search(fullQuery, maxNoOfHits, Sort.RELEVANCE, true, true);
 215
 216     }
 217     /**
 218      * @param pageSize if the page size is null or in an invalid range it will be set to MAX_HITS_ALLOWED
 219      * @param pageNumber a 0-based index of the page to return, will default to 0 if null or negative.
 220      * @return
 221      * @throws ParseException
 222      * @throws IOException
 223      */
 224     public TopGroups<BytesRef> executeSearch(Integer pageSize, Integer pageNumber) throws ParseException, IOException {
 225
 226         if(pageNumber == null || pageNumber < 0){
 227             pageNumber = 0;
 228         }
 229         if(pageSize == null || pageSize <= 0 || pageSize > MAX_HITS_ALLOWED){
 230             pageSize = MAX_HITS_ALLOWED;
 231             logger.info("limiting pageSize to MAX_HITS_ALLOWED = " + MAX_HITS_ALLOWED + " items");
 232         }
 233
 234         BooleanQuery fullQuery = expandQuery();
 235         logger.info("final query: " + fullQuery.toString());
 236
 237         int offset = pageNumber * pageSize;
 238         int limit = (pageNumber + 1) * pageSize;
 239         logger.debug("start: " + offset + "; limit:" + limit);
 240
 241         // sorting
 242         Sort groupSort = null;
 243         Sort withinGroupSort = Sort.RELEVANCE;
 244         if(sortFields != null && sortFields.length > 0){
 245             groupSort = new Sort(sortFields);
 246         } else {
 247             groupSort = Sort.RELEVANCE; // == SortField.FIELD_SCORE !!
 248         }
 249
 250         // perform the search (needs two passes for grouping)
 251         if(logger.isDebugEnabled()){
 252             logger.debug("Grouping: sortFields=" + Arrays.toString(sortFields) + ", groupByField=" + groupByField +
 253                     ", groupSort=" + groupSort + ", withinGroupSort=" + withinGroupSort + ", limit=" + limit + ", maxDocsPerGroup="+ maxDocsPerGroup);
 254         }
 255         // - first pass
 256         TermFirstPassGroupingCollector firstPassCollector = new TermFirstPassGroupingCollector(groupByField, groupSort, limit);
 257
 258         getSearcher().search(fullQuery, firstPassCollector);
 259         Collection<SearchGroup<BytesRef>> topGroups = firstPassCollector.getTopGroups(0, true); // no offset here since we need the first item for the max score
 260
 261         if (topGroups == null) {
 262               return null;
 263         }
 264         // - flags for second pass
 265         boolean getScores = false;
 266         boolean getMaxScores = true;
 267         if(groupSort.getSort()[0] != SortField.FIELD_SCORE){
 268             getMaxScores = false;
 269             // see inner class TopGroupsWithMaxScore
 270             logger.warn("Fist sort field must be SortField.FIELD_SCORE otherwise the max score value will not be correct! MaxScore calculation will be skipped");
 271         }
 272         boolean fillFields = true;
 273         TermAllGroupsCollector allGroupsCollector = new TermAllGroupsCollector(groupByField);
 274         TermSecondPassGroupingCollector secondPassCollector = new TermSecondPassGroupingCollector(
 275                 groupByField, topGroups, groupSort, withinGroupSort, maxDocsPerGroup , getScores,
 276                 getMaxScores, fillFields
 277                 );
 278         getSearcher().search(fullQuery, MultiCollector.wrap(secondPassCollector, allGroupsCollector));
 279
 280         TopGroups<BytesRef> groupsResult = secondPassCollector.getTopGroups(0); // no offset here since we need the first item for the max score
 281
 282         // --- set the max score for the group results
 283
 284         // get max score from very first result
 285         float maxScore = groupsResult.groups[0].maxScore;
 286
 287         if(logger.isDebugEnabled()){
 288             logger.debug("TopGroups: maxScore=" + maxScore + ", offset=" + offset +
 289                     ", totalGroupCount=" + allGroupsCollector.getGroupCount() +
 290                     ", totalGroupedHitCount=" + groupsResult.totalGroupedHitCount);
 291         }
 292
 293         TopGroups<BytesRef> newTopGroups;
 294         if(offset > 0){
 295             GroupDocs<BytesRef>[] newGroupDocs = new GroupDocs[groupsResult.groups.length - offset];
 296             for(int i = offset; i < groupsResult.groups.length; i++){
 297                 newGroupDocs[i - offset] = groupsResult.groups[i];
 298             }
 299             newTopGroups = new TopGroups<BytesRef>(
 300                     groupsResult.groupSort,
 301                     groupsResult.withinGroupSort,
 302                     groupsResult.totalHitCount,
 303                     groupsResult.totalGroupedHitCount,
 304                         newGroupDocs,
 305                         maxScore);
 306         } else {
 307             newTopGroups = groupsResult;
 308         }
 309         TopGroups<BytesRef> topGroupsWithMaxScore = new TopGroups<>(newTopGroups, allGroupsCollector.getGroupCount());
 310         // --- done with max score for the group results
 311
 312         return topGroupsWithMaxScore;
 313     }
 314
 315     /**
 316      * expands the query by adding a type restriction if the
 317      * <code>cdmTypeRestriction</code> is not <code>NULL</code>
 318      * and adds the <code>filter</code> as Boolean query
 319      * clause with {@link Occur#FILTER}
 320      */
 321     protected BooleanQuery expandQuery() {
 322         BooleanQuery fullQuery = null;
 323         Builder fullQueryBuilder = null;
 324
 325         if(cdmTypeRestriction != null){
 326             fullQueryBuilder = QueryFactory.addTypeRestriction(query, cdmTypeRestriction);
 327         }
 328
 329         if(filter != null) {
 330             if(fullQueryBuilder == null) {
 331                 fullQueryBuilder = new Builder();
 332                 fullQueryBuilder.add(this.query, Occur.MUST);
 333             }
 334             fullQueryBuilder.add(filter, Occur.FILTER);
 335         }
 336
 337         if(fullQueryBuilder != null) {
 338             fullQuery = fullQueryBuilder.build();
 339         } else {
 340             fullQuery = this.query;
 341         }
 342
 343         logger.debug("expandedQuery: " + fullQuery.toString());
 344         return fullQuery;
 345     }
 346
 347     public void setQuery(Query query) {
 348         if( query instanceof BooleanQuery) {
 349             this.query = (BooleanQuery)query;
 350         } else {
 351             Builder builder = new Builder();
 352             this.query = builder.add(query, Occur.MUST).build();
 353         }
 354     }
 355
 356     public BooleanQuery getQuery() {
 357         return query;
 358     }
 359
 360     public BooleanQuery getExpandedQuery() {
 361         expandQuery();
 362         return query;
 363     }
 364
 365     public SortField[] getSortFields() {
 366         return sortFields;
 367     }
 368
 369     public void setSortFields(SortField[] sortFields) {
 370         this.sortFields = sortFields;
 371     }
 372
 373     public void setHighlightFields(String[] textFieldNamesAsArray) {
 374         this.highlightFields = textFieldNamesAsArray;
 375     }
 376
 377     public String[] getHighlightFields() {
 378         return this.highlightFields;
 379     }
 380
 381 }