fixing #3116 (fulltext search: always only one page of results)
[cdmlib.git] / cdmlib-services / src / main / java / eu / etaxonomy / cdm / api / service / search / LuceneSearch.java
1 // $Id$
2 /**
3 * Copyright (C) 2011 EDIT
4 * European Distributed Institute of Taxonomy
5 * http://www.e-taxonomy.eu
6 *
7 * The contents of this file are subject to the Mozilla Public License Version 1.1
8 * See LICENSE.TXT at the top of this package for the full license terms.
9 */
10 package eu.etaxonomy.cdm.api.service.search;
11
12 import java.io.IOException;
13
14 import org.apache.log4j.Logger;
15 import org.apache.lucene.analysis.Analyzer;
16 import org.apache.lucene.index.IndexReader;
17 import org.apache.lucene.index.Term;
18 import org.apache.lucene.queryParser.ParseException;
19 import org.apache.lucene.queryParser.QueryParser;
20 import org.apache.lucene.search.BooleanClause;
21 import org.apache.lucene.search.BooleanQuery;
22 import org.apache.lucene.search.Hits;
23 import org.apache.lucene.search.IndexSearcher;
24 import org.apache.lucene.search.Query;
25 import org.apache.lucene.search.ScoreDoc;
26 import org.apache.lucene.search.Searcher;
27 import org.apache.lucene.search.Sort;
28 import org.apache.lucene.search.SortField;
29 import org.apache.lucene.search.TermQuery;
30 import org.apache.lucene.search.TopDocs;
31 import org.hibernate.Session;
32 import org.hibernate.search.Search;
33 import org.hibernate.search.SearchFactory;
34 import org.hibernate.search.engine.DocumentBuilder;
35 import org.hibernate.search.reader.ReaderProvider;
36 import org.hibernate.search.store.DirectoryProvider;
37
38 import eu.etaxonomy.cdm.model.common.CdmBase;
39 import eu.etaxonomy.cdm.model.description.DescriptionElementBase;
40 import eu.etaxonomy.cdm.model.description.TextData;
41 import eu.etaxonomy.cdm.model.taxon.Taxon;
42 import eu.etaxonomy.cdm.model.taxon.TaxonBase;
43
44 /**
45 *
46 * @author Andreas Kohlbecker
47 * @date Dec 21, 2011
48 *
49 */
50 public class LuceneSearch {
51
52 public static final Logger logger = Logger.getLogger(LuceneSearch.class);
53
54 protected Session session;
55
56 protected Searcher searcher;
57
58 private SortField[] sortFields;
59
60 private Class<? extends CdmBase> directorySelectClass;
61
62 protected Class<? extends CdmBase> getDirectorySelectClass() {
63 return pushAbstractBaseTypeDown(directorySelectClass);
64 }
65
66 /**
67 * classFilter
68 */
69 private Class<? extends CdmBase> clazz;
70
71
72 public Class<? extends CdmBase> getClazz() {
73 return clazz;
74 }
75
76 /**
77 * Sets the Class to use as filter criterion, in case the supplied Class equals the
78 * <code>directorySelectClass</code> the Class is set to <code>null</code>
79 * @param clazz
80 */
81 public void setClazz(Class<? extends CdmBase> clazz) {
82
83 /*
84 * NOTE:
85 * we must not use the getter of directorySelectClass
86 * since we need the abstract base classes here!!!!
87 */
88 if(clazz != null && clazz.equals(directorySelectClass)){
89 clazz = null;
90 }
91 this.clazz = clazz;
92 }
93
94 /**
95 * The MAX_HITS_ALLOWED value must be one less than Integer.MAX_VALUE
96 * otherwise PriorityQueue will produce an exception since it
97 * will always add 1 to the maxhits so Integer.MAX_VALUE
98 * would become Integer.MIN_VALUE
99 */
100 public final int MAX_HITS_ALLOWED = 10000;
101
102 protected Query query;
103
104 protected String[] highlightFields = new String[0];
105
106
107 /**
108 * @param session
109 */
110 public LuceneSearch(Session session, Class<? extends CdmBase> directorySelectClass) {
111 this.session = session;
112 this.directorySelectClass = directorySelectClass;
113 }
114
115 /**
116 * TODO the abstract base class DescriptionElementBase can not be used, so
117 * we are using an arbitraty subclass to find the DirectoryProvider, future
118 * versions of hibernate search my allow using abstract base classes see
119 * http
120 * ://stackoverflow.com/questions/492184/how-do-you-find-all-subclasses-of
121 * -a-given-class-in-java
122 *
123 * @param type must not be null
124 * @return
125 */
126 protected Class<? extends CdmBase> pushAbstractBaseTypeDown(Class<? extends CdmBase> type) {
127 if (type.equals(DescriptionElementBase.class)) {
128 type = TextData.class;
129 }
130 if (type.equals(TaxonBase.class)) {
131 type = Taxon.class;
132 }
133 return type;
134 }
135
136 protected LuceneSearch() {
137
138 }
139
140 /**
141 * @return
142 */
143 public Searcher getSearcher() {
144 if(searcher == null){
145 searcher = new IndexSearcher(getIndexReader());
146 }
147 return searcher;
148 }
149
150 /**
151 * @return
152 */
153 public IndexReader getIndexReader() {
154 SearchFactory searchFactory = Search.getFullTextSession(session).getSearchFactory();
155
156 DirectoryProvider[] directoryProviders = searchFactory.getDirectoryProviders(getDirectorySelectClass());
157 logger.info(directoryProviders[0].getDirectory().toString());
158
159 ReaderProvider readerProvider = searchFactory.getReaderProvider();
160 IndexReader reader = readerProvider.openReader(directoryProviders[0]);
161 return reader;
162 }
163
164 /**
165 * @return
166 */
167 public QueryParser getQueryParser() {
168 Analyzer analyzer = getAnalyzer();
169 QueryParser parser = new QueryParser("titleCache", analyzer);
170 return parser;
171 }
172
173 /**
174 * @return
175 */
176 public Analyzer getAnalyzer() {
177 SearchFactory searchFactory = Search.getFullTextSession(session).getSearchFactory();
178 Analyzer analyzer = searchFactory.getAnalyzer(getDirectorySelectClass());
179 return analyzer;
180 }
181
182 /**
183 * @param luceneQueryString
184 * @param clazz the type as additional filter criterion
185 * @param pageSize if the page size is null or in an invalid range it will be set to MAX_HITS_ALLOWED
186 * @param pageNumber a 0-based index of the page to return, will default to 0 if null or negative.
187 * @return
188 * @throws ParseException
189 * @throws IOException
190 */
191 public TopDocs executeSearch(String luceneQueryString, Integer pageSize, Integer pageNumber) throws ParseException, IOException {
192
193 Query luceneQuery = parse(luceneQueryString);
194 this.query = luceneQuery;
195
196 return executeSearch(pageSize, pageNumber);
197 }
198
199 /**
200 * @param luceneQueryString
201 * @return
202 * @throws ParseException
203 */
204 public Query parse(String luceneQueryString) throws ParseException {
205 logger.debug("luceneQueryString to be parsed: " + luceneQueryString);
206 Query luceneQuery = getQueryParser().parse(luceneQueryString);
207 return luceneQuery;
208 }
209
210 /**
211 * @param luceneQuery
212 * @param clazz the type as additional filter criterion
213 * @param pageSize if the page size is null or in an invalid range it will be set to MAX_HITS_ALLOWED
214 * @param pageNumber a 0-based index of the page to return, will default to 0 if null or negative.
215 * @return
216 * @throws ParseException
217 * @throws IOException
218 */
219 public TopDocs executeSearch(Integer pageSize, Integer pageNumber) throws ParseException, IOException {
220
221
222 if(pageNumber == null || pageNumber < 0){
223 pageNumber = 0;
224 }
225 if(pageSize == null || pageSize <= 0 || pageSize > MAX_HITS_ALLOWED){
226 pageSize = MAX_HITS_ALLOWED;
227 logger.info("limiting pageSize to MAX_HITS_ALLOWED = " + MAX_HITS_ALLOWED + " items");
228 }
229
230 Query fullQuery = expandQuery();
231
232 logger.info("final query: " + fullQuery.toString());
233
234 int start = pageNumber * pageSize;
235 int limit = (pageNumber + 1) * pageSize - 1 ;
236
237 logger.debug("start: " + start + "; limit:" + limit);
238
239 TopDocs topDocs;
240 if(sortFields != null && sortFields.length > 0){
241 Sort sort = new Sort(sortFields);
242 topDocs = getSearcher().search(fullQuery, null, limit, sort);
243 } else {
244 topDocs = getSearcher().search(fullQuery, null, limit);
245 }
246
247
248 //TODO when switched to Lucene 3.x which is included in hibernate 4.x
249 // use TopDocCollector.topDocs(int start, int howMany);
250 // since this method might be more memory save than our own implementation
251 //
252 // ALSO READ http://dev.e-taxonomy.eu/trac/ticket/3118 !!!
253 //
254 // TopDocs topDocs = hitCollector.topDocs();
255 ScoreDoc[] scoreDocs = topDocs.scoreDocs;
256
257 int docsAvailableInPage = Math.min(scoreDocs.length - start, pageSize);
258 logger.debug("docsAvailableInPage:" + docsAvailableInPage);
259
260 ScoreDoc[] pagedDocs = new ScoreDoc[docsAvailableInPage];
261 for(int i = 0; i < docsAvailableInPage; i++){
262 pagedDocs[i] = scoreDocs[start + i];
263 }
264 TopDocs pagedTopDocs = new TopDocs(topDocs.totalHits, pagedDocs, topDocs.getMaxScore());
265 //
266 /////////////////////////////////////////////
267
268 return pagedTopDocs;
269 }
270
271 /**
272 * @param clazz
273 */
274 protected Query expandQuery() {
275 Query fullQuery;
276 if(clazz != null){
277 BooleanQuery filteredQuery = new BooleanQuery();
278 BooleanQuery classFilter = new BooleanQuery();
279
280 Term t = new Term(DocumentBuilder.CLASS_FIELDNAME, clazz.getName());
281 TermQuery termQuery = new TermQuery(t);
282
283 classFilter.setBoost(0);
284 classFilter.add(termQuery, BooleanClause.Occur.SHOULD);
285
286 filteredQuery.add(this.query, BooleanClause.Occur.MUST);
287 filteredQuery.add(classFilter, BooleanClause.Occur.MUST);
288
289 fullQuery = filteredQuery;
290 } else {
291 fullQuery = this.query;
292 }
293 return fullQuery;
294 }
295
296 public void setQuery(Query query) {
297 this.query = query;
298 }
299
300 public Query getQuery() {
301 return query;
302 }
303
304 public Query getExpandedQuery() {
305 expandQuery();
306 return query;
307 }
308
309 public SortField[] getSortFields() {
310 return sortFields;
311 }
312
313 public void setSortFields(SortField[] sortFields) {
314 this.sortFields = sortFields;
315 }
316
317 public void setHighlightFields(String[] textFieldNamesAsArray) {
318 this.highlightFields = textFieldNamesAsArray;
319
320 }
321
322 public String[] getHighlightFields() {
323 return this.highlightFields;
324 }
325
326 }