cdmlib-services/src/main/java/eu/etaxonomy/cdm/api/service/search/LuceneSearch.java

   1 // $Id$
   2 /**
   3 * Copyright (C) 2011 EDIT
   4 * European Distributed Institute of Taxonomy
   5 * http://www.e-taxonomy.eu
   6 *
   7 * The contents of this file are subject to the Mozilla Public License Version 1.1
   8 * See LICENSE.TXT at the top of this package for the full license terms.
   9 */
  10 package eu.etaxonomy.cdm.api.service.search;
  11
  12 import java.io.IOException;
  13 import java.util.Collection;
  14
  15 import org.apache.log4j.Logger;
  16 import org.apache.lucene.analysis.Analyzer;
  17 import org.apache.lucene.index.IndexReader;
  18 import org.apache.lucene.index.Term;
  19 import org.apache.lucene.queryParser.ParseException;
  20 import org.apache.lucene.queryParser.QueryParser;
  21 import org.apache.lucene.search.BooleanClause;
  22 import org.apache.lucene.search.BooleanQuery;
  23 import org.apache.lucene.search.IndexSearcher;
  24 import org.apache.lucene.search.MultiCollector;
  25 import org.apache.lucene.search.Query;
  26 import org.apache.lucene.search.Sort;
  27 import org.apache.lucene.search.SortField;
  28 import org.apache.lucene.search.TermQuery;
  29 import org.apache.lucene.search.TopDocs;
  30 import org.apache.lucene.search.grouping.GroupDocs;
  31 import org.apache.lucene.search.grouping.SearchGroup;
  32 import org.apache.lucene.search.grouping.TermAllGroupsCollector;
  33 import org.apache.lucene.search.grouping.TermFirstPassGroupingCollector;
  34 import org.apache.lucene.search.grouping.TermSecondPassGroupingCollector;
  35 import org.apache.lucene.search.grouping.TopGroups;
  36 import org.hibernate.Session;
  37 import org.hibernate.search.ProjectionConstants;
  38 import org.hibernate.search.Search;
  39 import org.hibernate.search.SearchFactory;
  40
  41 import eu.etaxonomy.cdm.config.Configuration;
  42 import eu.etaxonomy.cdm.hibernate.search.GroupByTaxonClassBridge;
  43 import eu.etaxonomy.cdm.model.common.CdmBase;
  44 import eu.etaxonomy.cdm.model.description.DescriptionElementBase;
  45 import eu.etaxonomy.cdm.model.description.TextData;
  46 import eu.etaxonomy.cdm.model.name.TaxonNameBase;
  47 import eu.etaxonomy.cdm.model.name.NonViralName;
  48 import eu.etaxonomy.cdm.model.taxon.Taxon;
  49 import eu.etaxonomy.cdm.model.taxon.TaxonBase;
  50
  51 /**
  52  *
  53  * @author Andreas Kohlbecker
  54  * @date Dec 21, 2011
  55  *
  56  */
  57 public class LuceneSearch {
  58
  59     private static final String GROUP_BY_FIELD = GroupByTaxonClassBridge.GROUPBY_TAXON_FIELD;
  60
  61     public final static String ID_FIELD = "id";
  62
  63     public static final Logger logger = Logger.getLogger(LuceneSearch.class);
  64
  65     protected Session session;
  66
  67     protected IndexSearcher searcher;
  68
  69     private SortField[] sortFields;
  70
  71     private Class<? extends CdmBase> directorySelectClass;
  72
  73     protected Class<? extends CdmBase> getDirectorySelectClass() {
  74         return pushAbstractBaseTypeDown(directorySelectClass);
  75     }
  76
  77     /**
  78      * classFilter
  79      */
  80     private Class<? extends CdmBase> clazz;
  81
  82
  83     public Class<? extends CdmBase> getClazz() {
  84         return clazz;
  85     }
  86
  87     /**
  88      * Sets the Class to use as filter criterion, in case the supplied Class equals the
  89      * <code>directorySelectClass</code> the Class is set to <code>null</code>
  90      * @param clazz
  91      */
  92     public void setClazz(Class<? extends CdmBase> clazz) {
  93
  94         /*
  95          * NOTE:
  96          * we must not use the getter of directorySelectClass
  97          * since we need the abstract base classes here!!!!
  98          */
  99         if(clazz != null && clazz.equals(directorySelectClass)){
 100             clazz = null;
 101         }
 102         this.clazz = clazz;
 103     }
 104
 105     /**
 106      * The MAX_HITS_ALLOWED value must be one less than Integer.MAX_VALUE
 107      * otherwise PriorityQueue will produce an exception since it
 108      * will always add 1 to the maxhits so Integer.MAX_VALUE
 109      * would become Integer.MIN_VALUE
 110      */
 111     public final int MAX_HITS_ALLOWED = 10000;
 112
 113     protected Query query;
 114
 115     protected String[] highlightFields = new String[0];
 116
 117     private int maxDocsPerGroup = 10;
 118
 119
 120     public int getMaxDocsPerGroup() {
 121         return maxDocsPerGroup;
 122     }
 123
 124     public void setMaxDocsPerGroup(int maxDocsPerGroup) {
 125         this.maxDocsPerGroup = maxDocsPerGroup;
 126     }
 127
 128     /**
 129      * @param session
 130      */
 131     public LuceneSearch(Session session, Class<? extends CdmBase> directorySelectClass) {
 132          this.session = session;
 133          this.directorySelectClass = directorySelectClass;
 134     }
 135
 136     /**
 137      * TODO the abstract base class DescriptionElementBase can not be used, so
 138      * we are using an arbitraty subclass to find the DirectoryProvider, future
 139      * versions of hibernate search my allow using abstract base classes see
 140      * http
 141      * ://stackoverflow.com/questions/492184/how-do-you-find-all-subclasses-of
 142      * -a-given-class-in-java
 143      *
 144      * @param type must not be null
 145      * @return
 146      */
 147     protected Class<? extends CdmBase> pushAbstractBaseTypeDown(Class<? extends CdmBase> type) {
 148         if (type.equals(DescriptionElementBase.class)) {
 149             type = TextData.class;
 150         }
 151         if (type.equals(TaxonBase.class)) {
 152             type = Taxon.class;
 153         }
 154         if (type.equals(TaxonNameBase.class)) {
 155             type = NonViralName.class;
 156         }
 157         return type;
 158     }
 159
 160     protected LuceneSearch() {
 161
 162     }
 163
 164     /**
 165      * @return
 166      */
 167     public IndexSearcher getSearcher() {
 168         if(searcher == null){
 169             searcher = new IndexSearcher(getIndexReader());
 170             searcher.setDefaultFieldSortScoring(true, true);
 171         }
 172         return searcher;
 173     }
 174
 175     /**
 176      * @return
 177      */
 178     public IndexReader getIndexReader() {
 179         SearchFactory searchFactory = Search.getFullTextSession(session).getSearchFactory();
 180
 181 //        OLD
 182 //        DirectoryProvider[] directoryProviders = searchFactory.getDirectoryProviders(getDirectorySelectClass());
 183 //        logger.info(directoryProviders[0].getDirectory().toString());
 184
 185 //        ReaderProvider readerProvider = searchFactory.getReaderProvider();
 186 //        IndexReader reader = readerProvider.openReader(directoryProviders[0]);
 187
 188         IndexReader reader = searchFactory.getIndexReaderAccessor().open(getDirectorySelectClass());
 189         return reader;
 190     }
 191
 192     /**
 193      * @return
 194      */
 195     public QueryParser getQueryParser() {
 196         Analyzer analyzer = getAnalyzer();
 197         QueryParser parser = new QueryParser(Configuration.luceneVersion,  "titleCache", analyzer);
 198         return parser;
 199     }
 200
 201     /**
 202      * @return
 203      */
 204     public Analyzer getAnalyzer() {
 205         SearchFactory searchFactory = Search.getFullTextSession(session).getSearchFactory();
 206         Analyzer analyzer = searchFactory.getAnalyzer(getDirectorySelectClass());
 207         return analyzer;
 208     }
 209
 210     /**
 211      * @param luceneQueryString
 212      * @param clazz the type as additional filter criterion
 213      * @param pageSize if the page size is null or in an invalid range it will be set to MAX_HITS_ALLOWED
 214      * @param pageNumber a 0-based index of the page to return, will default to 0 if null or negative.
 215      * @return
 216      * @throws ParseException
 217      * @throws IOException
 218      */
 219     public TopGroupsWithMaxScore executeSearch(String luceneQueryString, Integer pageSize, Integer pageNumber) throws ParseException, IOException {
 220
 221         Query luceneQuery = parse(luceneQueryString);
 222         this.query = luceneQuery;
 223
 224         return executeSearch(pageSize, pageNumber);
 225     }
 226
 227     /**
 228      * @param luceneQueryString
 229      * @return
 230      * @throws ParseException
 231      */
 232     public Query parse(String luceneQueryString) throws ParseException {
 233         logger.debug("luceneQueryString to be parsed: " + luceneQueryString);
 234         Query luceneQuery = getQueryParser().parse(luceneQueryString);
 235         return luceneQuery;
 236     }
 237
 238     /**
 239      * @param maxNoOfHits
 240      * @return
 241      * @throws IOException
 242      */
 243     public TopDocs executeSearch(int maxNoOfHits) throws IOException {
 244         Query fullQuery = expandQuery();
 245         logger.info("lucene query string to be parsed: " + fullQuery.toString());
 246         return getSearcher().search(fullQuery, maxNoOfHits);
 247
 248     }
 249     /**
 250      * @param luceneQuery
 251      * @param clazz the type as additional filter criterion
 252      * @param pageSize if the page size is null or in an invalid range it will be set to MAX_HITS_ALLOWED
 253      * @param pageNumber a 0-based index of the page to return, will default to 0 if null or negative.
 254      * @return
 255      * @throws ParseException
 256      * @throws IOException
 257      */
 258     public TopGroupsWithMaxScore executeSearch(Integer pageSize, Integer pageNumber) throws ParseException, IOException {
 259
 260
 261         if(pageNumber == null || pageNumber < 0){
 262             pageNumber = 0;
 263         }
 264         if(pageSize == null || pageSize <= 0 || pageSize > MAX_HITS_ALLOWED){
 265             pageSize = MAX_HITS_ALLOWED;
 266             logger.info("limiting pageSize to MAX_HITS_ALLOWED = " + MAX_HITS_ALLOWED + " items");
 267         }
 268
 269         Query fullQuery = expandQuery();
 270         logger.info("final query: " + fullQuery.toString());
 271
 272         int offset = pageNumber * pageSize;
 273         int limit = (pageNumber + 1) * pageSize - 1 ;
 274         logger.debug("start: " + offset + "; limit:" + limit);
 275
 276         // sorting
 277         Sort groupSort = null;
 278         Sort withinGroupSort = Sort.RELEVANCE;
 279         if(sortFields != null && sortFields.length > 0){
 280             if(sortFields[0] != SortField.FIELD_SCORE){
 281                 throw new RuntimeException("Fist sort field must be SortField.FIELD_SCORE");
 282             }
 283             groupSort = new Sort(sortFields);
 284         } else {
 285             groupSort = Sort.RELEVANCE; // == SortField.FIELD_SCORE !!
 286         }
 287
 288         // perform the search (needs two passes for grouping)
 289         // - first pass
 290         TermFirstPassGroupingCollector firstPassCollector = new TermFirstPassGroupingCollector(GROUP_BY_FIELD, withinGroupSort, limit);
 291         getSearcher().search(fullQuery, firstPassCollector);
 292         Collection<SearchGroup<String>> topGroups = firstPassCollector.getTopGroups(0, true); // no offset here since we need the first item for the max score
 293
 294         if (topGroups == null) {
 295               return null;
 296         }
 297         // - second pass
 298         boolean getScores = true;
 299         boolean getMaxScores = true;
 300         boolean fillFields = true;
 301         TermAllGroupsCollector allGroupsCollector = new TermAllGroupsCollector(GROUP_BY_FIELD);
 302         TermSecondPassGroupingCollector secondPassCollector = new TermSecondPassGroupingCollector(GROUP_BY_FIELD, topGroups, groupSort, withinGroupSort, maxDocsPerGroup , getScores, getMaxScores, fillFields);
 303         getSearcher().search(fullQuery, MultiCollector.wrap(secondPassCollector, allGroupsCollector));
 304
 305         TopGroups<String> groupsResult = secondPassCollector.getTopGroups(0); // no offset here since we need the first item for the max score
 306
 307         // get max score from very first result
 308         float maxScore = groupsResult.groups[0].maxScore;
 309         TopGroupsWithMaxScore topGroupsWithMaxScore = new TopGroupsWithMaxScore(groupsResult, offset, allGroupsCollector.getGroupCount(), maxScore);
 310
 311         return topGroupsWithMaxScore;
 312     }
 313
 314     /**
 315      * @param clazz
 316      */
 317     protected Query expandQuery() {
 318         Query fullQuery;
 319         if(clazz != null){
 320             BooleanQuery filteredQuery = new BooleanQuery();
 321             BooleanQuery classFilter = new BooleanQuery();
 322
 323             Term t = new Term(ProjectionConstants.OBJECT_CLASS, clazz.getName());
 324             TermQuery termQuery = new TermQuery(t);
 325
 326             classFilter.setBoost(0);
 327             classFilter.add(termQuery, BooleanClause.Occur.SHOULD);
 328
 329             filteredQuery.add(this.query, BooleanClause.Occur.MUST);
 330             filteredQuery.add(classFilter, BooleanClause.Occur.MUST);
 331
 332             fullQuery = filteredQuery;
 333         } else {
 334             fullQuery = this.query;
 335         }
 336         return fullQuery;
 337     }
 338
 339     public void setQuery(Query query) {
 340         this.query = query;
 341     }
 342
 343     public Query getQuery() {
 344         return query;
 345     }
 346
 347     public Query getExpandedQuery() {
 348         expandQuery();
 349         return query;
 350     }
 351
 352     public SortField[] getSortFields() {
 353         return sortFields;
 354     }
 355
 356     public void setSortFields(SortField[] sortFields) {
 357         this.sortFields = sortFields;
 358     }
 359
 360     public void setHighlightFields(String[] textFieldNamesAsArray) {
 361         this.highlightFields = textFieldNamesAsArray;
 362     }
 363
 364     public String[] getHighlightFields() {
 365         return this.highlightFields;
 366     }
 367
 368     /**
 369      * may become obsolete with lucene 4.x when the TopGroups has a field for maxScore.
 370      *
 371      * @author a.kohlbecker
 372      * @date Oct 4, 2012
 373      *
 374      */
 375     public class TopGroupsWithMaxScore{
 376         public TopGroups<String> topGroups;
 377         public float maxScore = Float.NaN;
 378
 379         TopGroupsWithMaxScore(TopGroups<String> topGroups, int offset, int totalGroupCount, float maxScore){
 380             this.maxScore = maxScore;
 381             TopGroups<String> newTopGroups;
 382             if(offset > 0){
 383                 GroupDocs<String>[] newGroupDocs = new GroupDocs[topGroups.groups.length - offset];
 384                 for(int i = offset; i < topGroups.groups.length; i++){
 385                     newGroupDocs[i - offset] = topGroups.groups[i];
 386                 }
 387                 newTopGroups = new TopGroups<String>(
 388                             topGroups.groupSort,
 389                             topGroups.withinGroupSort,
 390                             topGroups.totalHitCount,
 391                             topGroups.totalGroupedHitCount,
 392                             newGroupDocs);
 393             } else {
 394                 newTopGroups = topGroups;
 395             }
 396             this.topGroups = new TopGroups<String>(newTopGroups, totalGroupCount);
 397         }
 398
 399     }
 400
 401 }