// $Id$
/**
 * Copyright (C) 2011 EDIT
 * European Distributed Institute of Taxonomy
 * http://www.e-taxonomy.eu
 *
 * The contents of this file are subject to the Mozilla Public License Version 1.1
 * See LICENSE.TXT at the top of this package for the full license terms.
 */
package eu.etaxonomy.cdm.api.service.search;

import java.io.IOException;
import java.util.Collection;

import org.apache.log4j.Logger;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MultiCollector;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.Searcher;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.grouping.AllGroupsCollector;
import org.apache.lucene.search.grouping.FirstPassGroupingCollector;
import org.apache.lucene.search.grouping.SearchGroup;
import org.apache.lucene.search.grouping.SecondPassGroupingCollector;
import org.apache.lucene.search.grouping.TopGroups;
import org.hibernate.Session;
import org.hibernate.search.Search;
import org.hibernate.search.SearchFactory;
import org.hibernate.search.engine.DocumentBuilder;
import org.hibernate.search.reader.ReaderProvider;
import org.hibernate.search.store.DirectoryProvider;

import eu.etaxonomy.cdm.model.common.CdmBase;
import eu.etaxonomy.cdm.model.description.DescriptionElementBase;
import eu.etaxonomy.cdm.model.description.TextData;
import eu.etaxonomy.cdm.model.taxon.Taxon;
import eu.etaxonomy.cdm.model.taxon.TaxonBase;

/**
 * Wraps a Lucene full-text search against the Hibernate Search index of a CDM
 * base type. Supports an optional subclass filter and groups the resulting
 * documents by their "id" field.
 *
 * @author Andreas Kohlbecker
 * @date Dec 21, 2011
 *
 */
public class LuceneSearch {

    private static final String GROUP_BY_FIELD = "id";

    public static final Logger logger = Logger.getLogger(LuceneSearch.class);

    protected Session session;

    protected IndexSearcher searcher;

    private SortField[] sortFields;

    private Class<? extends CdmBase> directorySelectClass;

    protected Class<? extends CdmBase> getDirectorySelectClass() {
        return pushAbstractBaseTypeDown(directorySelectClass);
    }

    /**
     * Optional filter criterion: when set, search hits are restricted to
     * documents of this class. See {@link #setClazz(Class)}.
     */
    private Class<? extends CdmBase> clazz;

    public Class<? extends CdmBase> getClazz() {
        return clazz;
    }

    /**
     * Sets the Class to use as an additional filter criterion. If the supplied
     * Class equals the <code>directorySelectClass</code>, the filter would be
     * redundant, so the Class is set to <code>null</code>.
     * @param clazz
     */
    public void setClazz(Class<? extends CdmBase> clazz) {

        /*
         * NOTE:
         * we must not use the getter of directorySelectClass
         * since we need the abstract base classes here!
         */
        if(clazz != null && clazz.equals(directorySelectClass)){
            clazz = null;
        }
        this.clazz = clazz;
    }
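
    /*
     * Usage sketch (illustrative only, not part of this class): shows how the
     * class filter interacts with the directorySelectClass passed to the
     * constructor; "session" is assumed to be an open Hibernate Session.
     *
     *   LuceneSearch search = new LuceneSearch(session, TaxonBase.class);
     *   search.setClazz(Taxon.class);     // hits will be restricted to Taxon documents
     *   search.setClazz(TaxonBase.class); // equals directorySelectClass -> filter cleared, getClazz() == null
     */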

    /**
     * MAX_HITS_ALLOWED must stay below Integer.MAX_VALUE: Lucene's
     * PriorityQueue adds 1 to the requested number of hits, so a value of
     * Integer.MAX_VALUE would overflow to Integer.MIN_VALUE.
     */
    public final int MAX_HITS_ALLOWED = 10000;
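
    /*
     * For illustration, the overflow the comment above refers to:
     *
     *   Integer.MAX_VALUE + 1 == Integer.MIN_VALUE   // 2147483647 + 1 wraps around to -2147483648
     */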

    protected Query query;

    protected String[] highlightFields = new String[0];

    /**
     * @param session the Hibernate session from which the Hibernate Search
     *            SearchFactory is obtained
     * @param directorySelectClass the type used to choose the index
     *            (DirectoryProvider) to search in
     */
    public LuceneSearch(Session session, Class<? extends CdmBase> directorySelectClass) {
        this.session = session;
        this.directorySelectClass = directorySelectClass;
    }

    /**
     * TODO the abstract base class DescriptionElementBase can not be used, so
     * we are using an arbitrary subclass to find the DirectoryProvider; future
     * versions of Hibernate Search may allow using abstract base classes, see
     * http://stackoverflow.com/questions/492184/how-do-you-find-all-subclasses-of-a-given-class-in-java
     *
     * @param type must not be null
     * @return the concrete subclass to be used for the DirectoryProvider lookup
     */
    protected Class<? extends CdmBase> pushAbstractBaseTypeDown(Class<? extends CdmBase> type) {
        if (type.equals(DescriptionElementBase.class)) {
            type = TextData.class;
        }
        if (type.equals(TaxonBase.class)) {
            type = Taxon.class;
        }
        return type;
    }
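
    /*
     * Example (illustrative) of the mapping applied above:
     *
     *   pushAbstractBaseTypeDown(DescriptionElementBase.class); // -> TextData.class
     *   pushAbstractBaseTypeDown(TaxonBase.class);              // -> Taxon.class
     *   pushAbstractBaseTypeDown(Taxon.class);                  // -> Taxon.class (unchanged)
     */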

    protected LuceneSearch() {

    }

    /**
     * @return the Searcher for the selected index; the underlying IndexSearcher
     *         is created lazily and configured to also track document scores
     *         when sorting by fields
     */
    public Searcher getSearcher() {
        if(searcher == null){
            searcher = new IndexSearcher(getIndexReader());
            searcher.setDefaultFieldSortScoring(true, true);
        }
        return searcher;
    }

    /**
     * @return an IndexReader for the first DirectoryProvider that Hibernate
     *         Search associates with the {@link #getDirectorySelectClass()}
     */
    public IndexReader getIndexReader() {
        SearchFactory searchFactory = Search.getFullTextSession(session).getSearchFactory();

        DirectoryProvider[] directoryProviders = searchFactory.getDirectoryProviders(getDirectorySelectClass());
        logger.info(directoryProviders[0].getDirectory().toString());

        ReaderProvider readerProvider = searchFactory.getReaderProvider();
        IndexReader reader = readerProvider.openReader(directoryProviders[0]);
        return reader;
    }

    /**
     * @return a QueryParser which uses "titleCache" as the default field and
     *         the Analyzer of the selected index
     */
    public QueryParser getQueryParser() {
        Analyzer analyzer = getAnalyzer();
        QueryParser parser = new QueryParser("titleCache", analyzer);
        return parser;
    }

    /**
     * @return the Analyzer which Hibernate Search uses for the
     *         {@link #getDirectorySelectClass()}
     */
    public Analyzer getAnalyzer() {
        SearchFactory searchFactory = Search.getFullTextSession(session).getSearchFactory();
        Analyzer analyzer = searchFactory.getAnalyzer(getDirectorySelectClass());
        return analyzer;
    }

    /**
     * @param luceneQueryString the query string to parse and execute
     * @param pageSize if the page size is null or in an invalid range it will be set to MAX_HITS_ALLOWED
     * @param pageNumber a 0-based index of the page to return, will default to 0 if null or negative.
     * @return the grouped search results
     * @throws ParseException
     * @throws IOException
     */
    public TopGroups executeSearch(String luceneQueryString, Integer pageSize, Integer pageNumber) throws ParseException, IOException {

        Query luceneQuery = parse(luceneQueryString);
        this.query = luceneQuery;

        return executeSearch(pageSize, pageNumber);
    }
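
    /*
     * Usage sketch (illustrative only; the query string and page values are made up):
     *
     *   LuceneSearch search = new LuceneSearch(session, TaxonBase.class);
     *   search.setClazz(Taxon.class);                                       // optional subclass filter
     *   TopGroups page = search.executeSearch("titleCache:Abies*", 25, 0);  // first page, 25 hits per page
     */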

    /**
     * @param luceneQueryString
     * @return the Query parsed from the luceneQueryString
     * @throws ParseException
     */
    public Query parse(String luceneQueryString) throws ParseException {
        logger.debug("luceneQueryString to be parsed: " + luceneQueryString);
        Query luceneQuery = getQueryParser().parse(luceneQueryString);
        return luceneQuery;
    }
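
    /*
     * Example (illustrative; assumes the configured analyzer lower-cases terms,
     * as the standard analyzer does):
     *
     *   Query q = search.parse("Abies alba");
     *   // parsed against the default field "titleCache",
     *   // roughly equivalent to: titleCache:abies titleCache:alba
     */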

    /**
     * Executes the previously set {@link #getQuery() query} against the index.
     *
     * @param pageSize if the page size is null or in an invalid range it will be set to MAX_HITS_ALLOWED
     * @param pageNumber a 0-based index of the page to return, will default to 0 if null or negative.
     * @return the grouped search results
     * @throws ParseException
     * @throws IOException
     */
    public TopGroups executeSearch(Integer pageSize, Integer pageNumber) throws ParseException, IOException {

        if(pageNumber == null || pageNumber < 0){
            pageNumber = 0;
        }
        if(pageSize == null || pageSize <= 0 || pageSize > MAX_HITS_ALLOWED){
            pageSize = MAX_HITS_ALLOWED;
            logger.info("limiting pageSize to MAX_HITS_ALLOWED = " + MAX_HITS_ALLOWED + " items");
        }

        Query fullQuery = expandQuery();

        logger.info("final query: " + fullQuery.toString());

        // paging window: offset of the first hit and the number of
        // hits/groups to collect for this page
        int offset = pageNumber * pageSize;
        int limit = (pageNumber + 1) * pageSize - 1;

        logger.debug("start: " + offset + "; limit:" + limit);

        // TopDocs topDocs = null;

        // the Sort must not be null; default: Sort.RELEVANCE
        Sort groupSort = null;
        Sort withinGroupSort = Sort.RELEVANCE;
        if(sortFields != null && sortFields.length > 0){
            groupSort = new Sort(sortFields);
            // topDocs = getSearcher().search(fullQuery, null, limit, sort);
        } else {
            groupSort = Sort.RELEVANCE; // == SortField.FIELD_SCORE !!
            // topDocs = getSearcher().search(fullQuery, null, limit);
        }

        // first pass: collect the top groups (grouped by GROUP_BY_FIELD, ordered by groupSort)
        FirstPassGroupingCollector groupingCollector_1 = new FirstPassGroupingCollector(GROUP_BY_FIELD, groupSort, limit);
        getSearcher().search(fullQuery, groupingCollector_1);

        Collection<SearchGroup> topGroups = groupingCollector_1.getTopGroups(offset, true);

        if (topGroups == null) {
            return null;
        }

        // second pass: collect the documents per group; the AllGroupsCollector
        // additionally counts the total number of groups matching the query
        boolean getScores = true;
        boolean getMaxScores = true;
        boolean fillFields = true;
        AllGroupsCollector c3 = new AllGroupsCollector(GROUP_BY_FIELD);
        SecondPassGroupingCollector c2 = new SecondPassGroupingCollector(GROUP_BY_FIELD, topGroups, groupSort, withinGroupSort, limit, getScores, getMaxScores, fillFields);
        getSearcher().search(fullQuery, MultiCollector.wrap(c2, c3));

        TopGroups groupsResult = c2.getTopGroups(offset);
        groupsResult = new TopGroups(groupsResult, c3.getGroupCount());

        return groupsResult;

        //TODO when switched to Lucene 3.x, which is included in Hibernate Search 4.x,
        //     use TopDocsCollector.topDocs(int start, int howMany);
        //     since this method might be more memory efficient than our own implementation
        //
        //     ALSO READ http://dev.e-taxonomy.eu/trac/ticket/3118 !!!
        //
        //        TopDocs topDocs = hitCollector.topDocs();
        //        ScoreDoc[] scoreDocs = topDocs.scoreDocs;

        //        int docsAvailableInPage = Math.min(scoreDocs.length - offset, pageSize);
        //        logger.debug("docsAvailableInPage:" + docsAvailableInPage);
        //
        //        ScoreDoc[] pagedDocs = new ScoreDoc[docsAvailableInPage];
        //        for(int i = 0; i < docsAvailableInPage; i++){
        //            pagedDocs[i] = scoreDocs[offset + i];
        //        }
        //        TopDocs pagedTopDocs = new TopDocs(topDocs.totalHits, pagedDocs, topDocs.getMaxScore());
        //
        /////////////////////////////////////////////

        //        return pagedTopDocs;
    }
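
    /*
     * Sketch of how a caller might consume the returned TopGroups (illustrative
     * only; field names follow the Lucene grouping API used above):
     *
     *   TopGroups groups = search.executeSearch(25, 0);
     *   if(groups != null){
     *       for(GroupDocs groupDocs : groups.groups){          // one entry per "id" group
     *           for(ScoreDoc scoreDoc : groupDocs.scoreDocs){  // documents within the group
     *               Document doc = search.getSearcher().doc(scoreDoc.doc);
     *               // ... map the Lucene document back to a CDM entity
     *           }
     *       }
     *   }
     */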

    /**
     * Expands the current query: if a {@link #getClazz() class filter} is set,
     * the query is wrapped into a BooleanQuery which additionally requires the
     * matching documents to be of that class.
     *
     * @return the expanded query, or the original query if no class filter is set
     */
    protected Query expandQuery() {
        Query fullQuery;
        if(clazz != null){
            BooleanQuery filteredQuery = new BooleanQuery();
            BooleanQuery classFilter = new BooleanQuery();

            Term t = new Term(DocumentBuilder.CLASS_FIELDNAME, clazz.getName());
            TermQuery termQuery = new TermQuery(t);

            // boost of 0 so that the class filter does not influence the scoring
            classFilter.setBoost(0);
            classFilter.add(termQuery, BooleanClause.Occur.SHOULD);

            filteredQuery.add(this.query, BooleanClause.Occur.MUST);
            filteredQuery.add(classFilter, BooleanClause.Occur.MUST);

            fullQuery = filteredQuery;
        } else {
            fullQuery = this.query;
        }
        return fullQuery;
    }
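
    /*
     * Example (illustrative): with clazz = Taxon.class and a query of
     * "titleCache:abies*", the expanded query is roughly
     *
     *   +titleCache:abies* +(_hibernate_class:eu.etaxonomy.cdm.model.taxon.Taxon)
     *
     * assuming DocumentBuilder.CLASS_FIELDNAME resolves to "_hibernate_class",
     * the field in which Hibernate Search stores the entity class name.
     */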

    public void setQuery(Query query) {
        this.query = query;
    }

    public Query getQuery() {
        return query;
    }

    public Query getExpandedQuery() {
        // return the query expanded by the class filter, not the plain query
        return expandQuery();
    }

    public SortField[] getSortFields() {
        return sortFields;
    }

    public void setSortFields(SortField[] sortFields) {
        this.sortFields = sortFields;
    }

    public void setHighlightFields(String[] textFieldNamesAsArray) {
        this.highlightFields = textFieldNamesAsArray;
    }

    public String[] getHighlightFields() {
        return this.highlightFields;
    }

}