cdmlib-services/src/main/java/eu/etaxonomy/cdm/api/service/search/LuceneSearch.java

   1 // $Id$
   2 /**
   3 * Copyright (C) 2011 EDIT
   4 * European Distributed Institute of Taxonomy
   5 * http://www.e-taxonomy.eu
   6 *
   7 * The contents of this file are subject to the Mozilla Public License Version 1.1
   8 * See LICENSE.TXT at the top of this package for the full license terms.
   9 */
  10 package eu.etaxonomy.cdm.api.service.search;
  11
  12 import java.io.IOException;
  13
  14 import org.apache.log4j.Logger;
  15 import org.apache.lucene.analysis.Analyzer;
  16 import org.apache.lucene.index.IndexReader;
  17 import org.apache.lucene.index.Term;
  18 import org.apache.lucene.queryParser.ParseException;
  19 import org.apache.lucene.queryParser.QueryParser;
  20 import org.apache.lucene.search.BooleanClause;
  21 import org.apache.lucene.search.BooleanQuery;
  22 import org.apache.lucene.search.Hits;
  23 import org.apache.lucene.search.IndexSearcher;
  24 import org.apache.lucene.search.Query;
  25 import org.apache.lucene.search.ScoreDoc;
  26 import org.apache.lucene.search.Searcher;
  27 import org.apache.lucene.search.Sort;
  28 import org.apache.lucene.search.SortField;
  29 import org.apache.lucene.search.TermQuery;
  30 import org.apache.lucene.search.TopDocs;
  31 import org.hibernate.Session;
  32 import org.hibernate.search.Search;
  33 import org.hibernate.search.SearchFactory;
  34 import org.hibernate.search.engine.DocumentBuilder;
  35 import org.hibernate.search.reader.ReaderProvider;
  36 import org.hibernate.search.store.DirectoryProvider;
  37
  38 import eu.etaxonomy.cdm.model.common.CdmBase;
  39 import eu.etaxonomy.cdm.model.description.DescriptionElementBase;
  40 import eu.etaxonomy.cdm.model.description.TextData;
  41 import eu.etaxonomy.cdm.model.taxon.Taxon;
  42 import eu.etaxonomy.cdm.model.taxon.TaxonBase;
  43
  44 /**
  45  *
  46  * @author Andreas Kohlbecker
  47  * @date Dec 21, 2011
  48  *
  49  */
  50 public class LuceneSearch {
  51
  52     public static final Logger logger = Logger.getLogger(LuceneSearch.class);
  53
  54     protected Session session;
  55
  56     protected Searcher searcher;
  57
  58     private SortField[] sortFields;
  59
  60     private Class<? extends CdmBase> directorySelectClass;
  61
  62     protected Class<? extends CdmBase> getDirectorySelectClass() {
  63         return pushAbstractBaseTypeDown(directorySelectClass);
  64     }
  65
  66     /**
  67      * classFilter
  68      */
  69     private Class<? extends CdmBase> clazz;
  70
  71
  72     public Class<? extends CdmBase> getClazz() {
  73         return clazz;
  74     }
  75
  76     /**
  77      * Sets the Class to use as filter criterion, in case the supplied Class equals the
  78      * <code>directorySelectClass</code> the Class is set to <code>null</code>
  79      * @param clazz
  80      */
  81     public void setClazz(Class<? extends CdmBase> clazz) {
  82
  83         /*
  84          * NOTE:
  85          * we must not use the getter of directorySelectClass
  86          * since we need the abstract base classes here!!!!
  87          */
  88         if(clazz != null && clazz.equals(directorySelectClass)){
  89             clazz = null;
  90         }
  91         this.clazz = clazz;
  92     }
  93
  94     /**
  95      * The MAX_HITS_ALLOWED value must be one less than Integer.MAX_VALUE
  96      * otherwise PriorityQueue will produce an exception since it
  97      * will always add 1 to the maxhits so Integer.MAX_VALUE
  98      * would become Integer.MIN_VALUE
  99      */
 100     public final int MAX_HITS_ALLOWED = 10000;
 101
 102     protected Query query;
 103
 104     protected String[] highlightFields = new String[0];
 105
 106
 107     /**
 108      * @param session
 109      */
 110     public LuceneSearch(Session session, Class<? extends CdmBase> directorySelectClass) {
 111          this.session = session;
 112          this.directorySelectClass = directorySelectClass;
 113     }
 114
 115     /**
 116      * TODO the abstract base class DescriptionElementBase can not be used, so
 117      * we are using an arbitraty subclass to find the DirectoryProvider, future
 118      * versions of hibernate search my allow using abstract base classes see
 119      * http
 120      * ://stackoverflow.com/questions/492184/how-do-you-find-all-subclasses-of
 121      * -a-given-class-in-java
 122      *
 123      * @param type must not be null
 124      * @return
 125      */
 126     protected Class<? extends CdmBase> pushAbstractBaseTypeDown(Class<? extends CdmBase> type) {
 127         if (type.equals(DescriptionElementBase.class)) {
 128             type = TextData.class;
 129         }
 130         if (type.equals(TaxonBase.class)) {
 131             type = Taxon.class;
 132         }
 133         return type;
 134     }
 135
 136     protected LuceneSearch() {
 137
 138     }
 139
 140     /**
 141      * @return
 142      */
 143     public Searcher getSearcher() {
 144         if(searcher == null){
 145             searcher = new IndexSearcher(getIndexReader());
 146         }
 147         return searcher;
 148     }
 149
 150     /**
 151      * @return
 152      */
 153     public IndexReader getIndexReader() {
 154         SearchFactory searchFactory = Search.getFullTextSession(session).getSearchFactory();
 155
 156         DirectoryProvider[] directoryProviders = searchFactory.getDirectoryProviders(getDirectorySelectClass());
 157         logger.info(directoryProviders[0].getDirectory().toString());
 158
 159         ReaderProvider readerProvider = searchFactory.getReaderProvider();
 160         IndexReader reader = readerProvider.openReader(directoryProviders[0]);
 161         return reader;
 162     }
 163
 164     /**
 165      * @return
 166      */
 167     public QueryParser getQueryParser() {
 168         Analyzer analyzer = getAnalyzer();
 169         QueryParser parser = new QueryParser("titleCache", analyzer);
 170         return parser;
 171     }
 172
 173     /**
 174      * @return
 175      */
 176     public Analyzer getAnalyzer() {
 177         SearchFactory searchFactory = Search.getFullTextSession(session).getSearchFactory();
 178         Analyzer analyzer = searchFactory.getAnalyzer(getDirectorySelectClass());
 179         return analyzer;
 180     }
 181
 182     /**
 183      * @param luceneQueryString
 184      * @param clazz the type as additional filter criterion
 185      * @param pageSize if the page size is null or in an invalid range it will be set to MAX_HITS_ALLOWED
 186      * @param pageNumber a 0-based index of the page to return, will default to 0 if null or negative.
 187      * @return
 188      * @throws ParseException
 189      * @throws IOException
 190      */
 191     public TopDocs executeSearch(String luceneQueryString, Integer pageSize, Integer pageNumber) throws ParseException, IOException {
 192
 193         Query luceneQuery = parse(luceneQueryString);
 194         this.query = luceneQuery;
 195
 196         return executeSearch(pageSize, pageNumber);
 197     }
 198
 199     /**
 200      * @param luceneQueryString
 201      * @return
 202      * @throws ParseException
 203      */
 204     public Query parse(String luceneQueryString) throws ParseException {
 205         logger.debug("luceneQueryString to be parsed: " + luceneQueryString);
 206         Query luceneQuery = getQueryParser().parse(luceneQueryString);
 207         return luceneQuery;
 208     }
 209
 210     /**
 211      * @param luceneQuery
 212      * @param clazz the type as additional filter criterion
 213      * @param pageSize if the page size is null or in an invalid range it will be set to MAX_HITS_ALLOWED
 214      * @param pageNumber a 0-based index of the page to return, will default to 0 if null or negative.
 215      * @return
 216      * @throws ParseException
 217      * @throws IOException
 218      */
 219     public TopDocs executeSearch(Integer pageSize, Integer pageNumber) throws ParseException, IOException {
 220
 221
 222         if(pageNumber == null || pageNumber < 0){
 223             pageNumber = 0;
 224         }
 225         if(pageSize == null || pageSize <= 0 || pageSize > MAX_HITS_ALLOWED){
 226             pageSize = MAX_HITS_ALLOWED;
 227             logger.info("limiting pageSize to MAX_HITS_ALLOWED = " + MAX_HITS_ALLOWED + " items");
 228         }
 229
 230         Query fullQuery = expandQuery();
 231
 232         logger.info("final query: " + fullQuery.toString());
 233
 234         int start = pageNumber * pageSize;
 235         int limit = (pageNumber + 1) * pageSize - 1 ;
 236
 237         logger.debug("start: " + start + "; limit:" + limit);
 238
 239         TopDocs topDocs;
 240         if(sortFields != null && sortFields.length > 0){
 241             Sort sort = new Sort(sortFields);
 242             topDocs = getSearcher().search(fullQuery, null, limit, sort);
 243         } else {
 244             topDocs = getSearcher().search(fullQuery, null, limit);
 245         }
 246
 247
 248         //TODO when switched to Lucene 3.x which is included in hibernate 4.x
 249         //     use TopDocCollector.topDocs(int start, int howMany);
 250         //     since this method might be more memory save than our own implementation
 251         //
 252         //     ALSO READ http://dev.e-taxonomy.eu/trac/ticket/3118 !!!
 253         //
 254 //        TopDocs topDocs = hitCollector.topDocs();
 255         ScoreDoc[] scoreDocs = topDocs.scoreDocs;
 256
 257         int docsAvailableInPage = Math.min(scoreDocs.length - start, pageSize);
 258         logger.debug("docsAvailableInPage:" + docsAvailableInPage);
 259
 260         ScoreDoc[] pagedDocs = new ScoreDoc[docsAvailableInPage];
 261         for(int i = 0; i < docsAvailableInPage; i++){
 262             pagedDocs[i] = scoreDocs[start + i];
 263         }
 264         TopDocs pagedTopDocs = new TopDocs(topDocs.totalHits, pagedDocs, topDocs.getMaxScore());
 265         //
 266         /////////////////////////////////////////////
 267
 268         return pagedTopDocs;
 269     }
 270
 271     /**
 272      * @param clazz
 273      */
 274     protected Query expandQuery() {
 275         Query fullQuery;
 276         if(clazz != null){
 277             BooleanQuery filteredQuery = new BooleanQuery();
 278             BooleanQuery classFilter = new BooleanQuery();
 279
 280             Term t = new Term(DocumentBuilder.CLASS_FIELDNAME, clazz.getName());
 281             TermQuery termQuery = new TermQuery(t);
 282
 283             classFilter.setBoost(0);
 284             classFilter.add(termQuery, BooleanClause.Occur.SHOULD);
 285
 286             filteredQuery.add(this.query, BooleanClause.Occur.MUST);
 287             filteredQuery.add(classFilter, BooleanClause.Occur.MUST);
 288
 289             fullQuery = filteredQuery;
 290         } else {
 291             fullQuery = this.query;
 292         }
 293         return fullQuery;
 294     }
 295
 296     public void setQuery(Query query) {
 297         this.query = query;
 298     }
 299
 300     public Query getQuery() {
 301         return query;
 302     }
 303
 304     public Query getExpandedQuery() {
 305         expandQuery();
 306         return query;
 307     }
 308
 309     public SortField[] getSortFields() {
 310         return sortFields;
 311     }
 312
 313     public void setSortFields(SortField[] sortFields) {
 314         this.sortFields = sortFields;
 315     }
 316
 317     public void setHighlightFields(String[] textFieldNamesAsArray) {
 318         this.highlightFields = textFieldNamesAsArray;
 319
 320     }
 321
 322     public String[] getHighlightFields() {
 323         return this.highlightFields;
 324     }
 325
 326 }