cdmlib-services/src/main/java/eu/etaxonomy/cdm/api/service/search/LuceneSearch.java

   1 // $Id$
   2 /**
   3 * Copyright (C) 2011 EDIT
   4 * European Distributed Institute of Taxonomy
   5 * http://www.e-taxonomy.eu
   6 *
   7 * The contents of this file are subject to the Mozilla Public License Version 1.1
   8 * See LICENSE.TXT at the top of this package for the full license terms.
   9 */
  10 package eu.etaxonomy.cdm.api.service.search;
  11
  12 import java.io.IOException;
  13 import java.util.Collection;
  14
  15 import org.apache.log4j.Logger;
  16 import org.apache.lucene.analysis.Analyzer;
  17 import org.apache.lucene.index.IndexReader;
  18 import org.apache.lucene.index.Term;
  19 import org.apache.lucene.queryParser.ParseException;
  20 import org.apache.lucene.queryParser.QueryParser;
  21 import org.apache.lucene.search.BooleanClause;
  22 import org.apache.lucene.search.BooleanQuery;
  23 import org.apache.lucene.search.Filter;
  24 import org.apache.lucene.search.IndexSearcher;
  25 import org.apache.lucene.search.MultiCollector;
  26 import org.apache.lucene.search.Query;
  27 import org.apache.lucene.search.Sort;
  28 import org.apache.lucene.search.SortField;
  29 import org.apache.lucene.search.TermQuery;
  30 import org.apache.lucene.search.TopDocs;
  31 import org.apache.lucene.search.grouping.GroupDocs;
  32 import org.apache.lucene.search.grouping.SearchGroup;
  33 import org.apache.lucene.search.grouping.TermAllGroupsCollector;
  34 import org.apache.lucene.search.grouping.TermFirstPassGroupingCollector;
  35 import org.apache.lucene.search.grouping.TermSecondPassGroupingCollector;
  36 import org.apache.lucene.search.grouping.TopGroups;
  37 import org.hibernate.Session;
  38 import org.hibernate.search.ProjectionConstants;
  39 import org.hibernate.search.Search;
  40 import org.hibernate.search.SearchFactory;
  41
  42 import eu.etaxonomy.cdm.config.Configuration;
  43 import eu.etaxonomy.cdm.model.common.CdmBase;
  44 import eu.etaxonomy.cdm.model.description.DescriptionElementBase;
  45 import eu.etaxonomy.cdm.model.description.TextData;
  46 import eu.etaxonomy.cdm.model.name.NonViralName;
  47 import eu.etaxonomy.cdm.model.name.TaxonNameBase;
  48 import eu.etaxonomy.cdm.model.taxon.Taxon;
  49 import eu.etaxonomy.cdm.model.taxon.TaxonBase;
  50
  51 /**
  52  *
  53  * @author Andreas Kohlbecker
  54  * @date Dec 21, 2011
  55  *
  56  */
  57 public class LuceneSearch {
  58
  59     protected String groupByField = "id";
  60
  61     public final static String ID_FIELD = "id";
  62
  63     public static final Logger logger = Logger.getLogger(LuceneSearch.class);
  64
  65     protected Session session;
  66
  67     protected IndexSearcher searcher;
  68
  69     protected SortField[] sortFields;
  70
  71     private Class<? extends CdmBase> directorySelectClass;
  72
  73     private Filter filter = null;
  74
  75     protected Class<? extends CdmBase> getDirectorySelectClass() {
  76         return pushAbstractBaseTypeDown(directorySelectClass);
  77     }
  78
  79     /**
  80      * classFilter
  81      */
  82     protected Class<? extends CdmBase> clazz;
  83
  84
  85     public Class<? extends CdmBase> getClazz() {
  86         return clazz;
  87     }
  88
  89     /**
  90      * @return the filter
  91      */
  92     public Filter getFilter() {
  93         return filter;
  94     }
  95
  96     /**
  97      * @param filter the filter to set
  98      */
  99     public void setFilter(Filter filter) {
 100         this.filter = filter;
 101     }
 102
 103     /**
 104      * Sets the Class to use as filter criterion, in case the supplied Class equals the
 105      * <code>directorySelectClass</code> the Class is set to <code>null</code>
 106      * @param clazz
 107      */
 108     public void setClazz(Class<? extends CdmBase> clazz) {
 109
 110         /*
 111          * NOTE:
 112          * we must not use the getter of directorySelectClass
 113          * since we need the abstract base classes here!!!!
 114          */
 115         if(clazz != null && clazz.equals(directorySelectClass)){
 116             clazz = null;
 117         }
 118         this.clazz = clazz;
 119     }
 120
 121     /**
 122      * The MAX_HITS_ALLOWED value must be one less than Integer.MAX_VALUE
 123      * otherwise PriorityQueue will produce an exception since it
 124      * will always add 1 to the maxhits so Integer.MAX_VALUE
 125      * would become Integer.MIN_VALUE
 126      */
 127     public final int MAX_HITS_ALLOWED = 10000;
 128
 129     protected Query query;
 130
 131     protected String[] highlightFields = new String[0];
 132
 133     private int maxDocsPerGroup = 10;
 134
 135
 136     public int getMaxDocsPerGroup() {
 137         return maxDocsPerGroup;
 138     }
 139
 140     public void setMaxDocsPerGroup(int maxDocsPerGroup) {
 141         this.maxDocsPerGroup = maxDocsPerGroup;
 142     }
 143
 144     /**
 145      * @param session
 146      */
 147     public LuceneSearch(Session session, Class<? extends CdmBase> directorySelectClass) {
 148          this.session = session;
 149          this.directorySelectClass = directorySelectClass;
 150     }
 151
 152     /**
 153      * @param session
 154      */
 155     public LuceneSearch(Session session, String groupByField, Class<? extends CdmBase> directorySelectClass) {
 156          this.session = session;
 157          this.directorySelectClass = directorySelectClass;
 158          this.groupByField = groupByField;
 159     }
 160
 161     /**
 162      * TODO the abstract base class DescriptionElementBase can not be used, so
 163      * we are using an arbitraty subclass to find the DirectoryProvider, future
 164      * versions of hibernate search my allow using abstract base classes see
 165      * {@link http://stackoverflow.com/questions/492184/how-do-you-find-all-subclasses-of-a-given-class-in-java}
 166      *
 167      * @param type must not be null
 168      * @return
 169      */
 170     protected Class<? extends CdmBase> pushAbstractBaseTypeDown(Class<? extends CdmBase> type) {
 171         if (type.equals(DescriptionElementBase.class)) {
 172             type = TextData.class;
 173         }
 174         if (type.equals(TaxonBase.class)) {
 175             type = Taxon.class;
 176         }
 177         if (type.equals(TaxonNameBase.class)) {
 178             type = NonViralName.class;
 179         }
 180         return type;
 181     }
 182
 183     protected LuceneSearch() {
 184
 185     }
 186
 187     /**
 188      * @return
 189      */
 190     public IndexSearcher getSearcher() {
 191         if(searcher == null){
 192             searcher = new IndexSearcher(getIndexReader());
 193             searcher.setDefaultFieldSortScoring(true, true);
 194         }
 195         return searcher;
 196     }
 197
 198     /**
 199      * @return
 200      */
 201     public IndexReader getIndexReader() {
 202         SearchFactory searchFactory = Search.getFullTextSession(session).getSearchFactory();
 203         IndexReader reader = searchFactory.getIndexReaderAccessor().open(getDirectorySelectClass());
 204         return reader;
 205     }
 206
 207     /**
 208      * @return
 209      */
 210     public IndexReader getIndexReaderFor(Class<? extends CdmBase> clazz) {
 211         SearchFactory searchFactory = Search.getFullTextSession(session).getSearchFactory();
 212         IndexReader reader = searchFactory.getIndexReaderAccessor().open(pushAbstractBaseTypeDown(clazz));
 213         return reader;
 214     }
 215
 216     /**
 217      * @return
 218      */
 219     public QueryParser getQueryParser() {
 220         Analyzer analyzer = getAnalyzer();
 221         QueryParser parser = new QueryParser(Configuration.luceneVersion,  "titleCache", analyzer);
 222         return parser;
 223     }
 224
 225     /**
 226      * @return
 227      */
 228     public Analyzer getAnalyzer() {
 229         SearchFactory searchFactory = Search.getFullTextSession(session).getSearchFactory();
 230         Analyzer analyzer = searchFactory.getAnalyzer(getDirectorySelectClass());
 231         return analyzer;
 232     }
 233
 234     /**
 235      * @param luceneQueryString
 236      * @param clazz the type as additional filter criterion
 237      * @param pageSize if the page size is null or in an invalid range it will be set to MAX_HITS_ALLOWED
 238      * @param pageNumber a 0-based index of the page to return, will default to 0 if null or negative.
 239      * @return
 240      * @throws ParseException
 241      * @throws IOException
 242      */
 243     public TopGroupsWithMaxScore executeSearch(String luceneQueryString, Integer pageSize, Integer pageNumber) throws ParseException, IOException {
 244
 245         Query luceneQuery = parse(luceneQueryString);
 246         this.query = luceneQuery;
 247
 248         return executeSearch(pageSize, pageNumber);
 249     }
 250
 251     /**
 252      * @param luceneQueryString
 253      * @return
 254      * @throws ParseException
 255      */
 256     public Query parse(String luceneQueryString) throws ParseException {
 257         logger.debug("luceneQueryString to be parsed: " + luceneQueryString);
 258         Query luceneQuery = getQueryParser().parse(luceneQueryString);
 259         return luceneQuery;
 260     }
 261
 262     /**
 263      * @param maxNoOfHits
 264      * @return
 265      * @throws IOException
 266      */
 267     public TopDocs executeSearch(int maxNoOfHits) throws IOException {
 268         Query fullQuery = expandQuery();
 269         logger.info("lucene query string to be parsed: " + fullQuery.toString());
 270         return getSearcher().search(fullQuery, filter, maxNoOfHits);
 271
 272     }
 273     /**
 274      * @param pageSize if the page size is null or in an invalid range it will be set to MAX_HITS_ALLOWED
 275      * @param pageNumber a 0-based index of the page to return, will default to 0 if null or negative.
 276      * @return
 277      * @throws ParseException
 278      * @throws IOException
 279      */
 280     public TopGroupsWithMaxScore executeSearch(Integer pageSize, Integer pageNumber) throws ParseException, IOException {
 281
 282
 283         if(pageNumber == null || pageNumber < 0){
 284             pageNumber = 0;
 285         }
 286         if(pageSize == null || pageSize <= 0 || pageSize > MAX_HITS_ALLOWED){
 287             pageSize = MAX_HITS_ALLOWED;
 288             logger.info("limiting pageSize to MAX_HITS_ALLOWED = " + MAX_HITS_ALLOWED + " items");
 289         }
 290
 291         Query fullQuery = expandQuery();
 292         logger.info("final query: " + fullQuery.toString());
 293
 294         int offset = pageNumber * pageSize;
 295         int limit = (pageNumber + 1) * pageSize - 1 ;
 296         logger.debug("start: " + offset + "; limit:" + limit);
 297
 298         // sorting
 299         Sort groupSort = null;
 300         Sort withinGroupSort = Sort.RELEVANCE;
 301         if(sortFields != null && sortFields.length > 0){
 302             if(sortFields[0] != SortField.FIELD_SCORE){
 303                 throw new RuntimeException("Fist sort field must be SortField.FIELD_SCORE");
 304             }
 305             groupSort = new Sort(sortFields);
 306         } else {
 307             groupSort = Sort.RELEVANCE; // == SortField.FIELD_SCORE !!
 308         }
 309
 310         // perform the search (needs two passes for grouping)
 311         if(logger.isDebugEnabled()){
 312             logger.debug("Grouping: sortFields=" + sortFields + ", groupByField=" + groupByField +
 313                     ", groupSort=" + groupSort + ", withinGroupSort=" + withinGroupSort + ", limit=" + limit + ", maxDocsPerGroup="+ maxDocsPerGroup);
 314         }
 315         // - first pass
 316         TermFirstPassGroupingCollector firstPassCollector = new TermFirstPassGroupingCollector(groupByField, withinGroupSort, limit);
 317
 318         getSearcher().search(fullQuery, filter , firstPassCollector);
 319         Collection<SearchGroup<String>> topGroups = firstPassCollector.getTopGroups(0, true); // no offset here since we need the first item for the max score
 320
 321         if (topGroups == null) {
 322               return null;
 323         }
 324         // - second pass
 325         boolean getScores = true;
 326         boolean getMaxScores = true;
 327         boolean fillFields = true;
 328         TermAllGroupsCollector allGroupsCollector = new TermAllGroupsCollector(groupByField);
 329         TermSecondPassGroupingCollector secondPassCollector = new TermSecondPassGroupingCollector(
 330                 groupByField, topGroups, groupSort, withinGroupSort, maxDocsPerGroup , getScores, getMaxScores, fillFields
 331                 );
 332         getSearcher().search(fullQuery, filter, MultiCollector.wrap(secondPassCollector, allGroupsCollector));
 333
 334         TopGroups<String> groupsResult = secondPassCollector.getTopGroups(0); // no offset here since we need the first item for the max score
 335
 336         // get max score from very first result
 337         float maxScore = groupsResult.groups[0].maxScore;
 338         if(logger.isDebugEnabled()){
 339             logger.debug("TopGroups: maxScore=" + maxScore + ", offset=" + offset +
 340                     ", totalGroupCount=" + allGroupsCollector.getGroupCount() + ", totalGroupedHitCount=" + groupsResult.totalGroupedHitCount);
 341         }
 342         TopGroupsWithMaxScore topGroupsWithMaxScore = new TopGroupsWithMaxScore(groupsResult, offset, allGroupsCollector.getGroupCount(), maxScore);
 343
 344         return topGroupsWithMaxScore;
 345     }
 346
 347     /**
 348      * @param clazz
 349      */
 350     protected Query expandQuery() {
 351         Query fullQuery;
 352         if(clazz != null){
 353             BooleanQuery filteredQuery = new BooleanQuery();
 354             BooleanQuery classFilter = new BooleanQuery();
 355
 356             Term t = new Term(ProjectionConstants.OBJECT_CLASS, clazz.getName());
 357             TermQuery termQuery = new TermQuery(t);
 358
 359             classFilter.setBoost(0);
 360             classFilter.add(termQuery, BooleanClause.Occur.SHOULD);
 361
 362             filteredQuery.add(this.query, BooleanClause.Occur.MUST);
 363             filteredQuery.add(classFilter, BooleanClause.Occur.MUST);
 364
 365             fullQuery = filteredQuery;
 366         } else {
 367             fullQuery = this.query;
 368         }
 369         return fullQuery;
 370     }
 371
 372     public void setQuery(Query query) {
 373         this.query = query;
 374     }
 375
 376     public Query getQuery() {
 377         return query;
 378     }
 379
 380     public Query getExpandedQuery() {
 381         expandQuery();
 382         return query;
 383     }
 384
 385     public SortField[] getSortFields() {
 386         return sortFields;
 387     }
 388
 389     public void setSortFields(SortField[] sortFields) {
 390         this.sortFields = sortFields;
 391     }
 392
 393     public void setHighlightFields(String[] textFieldNamesAsArray) {
 394         this.highlightFields = textFieldNamesAsArray;
 395     }
 396
 397     public String[] getHighlightFields() {
 398         return this.highlightFields;
 399     }
 400
 401     /**
 402      * may become obsolete with lucene 4.x when the TopGroups has a field for maxScore.
 403      *
 404      * @author a.kohlbecker
 405      * @date Oct 4, 2012
 406      *
 407      */
 408     public class TopGroupsWithMaxScore{
 409         public TopGroups<String> topGroups;
 410         public float maxScore = Float.NaN;
 411
 412         TopGroupsWithMaxScore(TopGroups<String> topGroups, int offset, int totalGroupCount, float maxScore){
 413             this.maxScore = maxScore;
 414             TopGroups<String> newTopGroups;
 415             if(offset > 0){
 416                 GroupDocs<String>[] newGroupDocs = new GroupDocs[topGroups.groups.length - offset];
 417                 for(int i = offset; i < topGroups.groups.length; i++){
 418                     newGroupDocs[i - offset] = topGroups.groups[i];
 419                 }
 420                 newTopGroups = new TopGroups<String>(
 421                             topGroups.groupSort,
 422                             topGroups.withinGroupSort,
 423                             topGroups.totalHitCount,
 424                             topGroups.totalGroupedHitCount,
 425                             newGroupDocs);
 426             } else {
 427                 newTopGroups = topGroups;
 428             }
 429             this.topGroups = new TopGroups<String>(newTopGroups, totalGroupCount);
 430         }
 431
 432     }
 433
 434 }