cleanup
[cdmlib.git] / cdmlib-services / src / main / java / eu / etaxonomy / cdm / api / service / search / LuceneSearch.java
1 /**
2 * Copyright (C) 2011 EDIT
3 * European Distributed Institute of Taxonomy
4 * http://www.e-taxonomy.eu
5 *
6 * The contents of this file are subject to the Mozilla Public License Version 1.1
7 * See LICENSE.TXT at the top of this package for the full license terms.
8 */
9 package eu.etaxonomy.cdm.api.service.search;
10
11 import java.io.IOException;
12 import java.util.Arrays;
13 import java.util.Collection;
14
15 import org.apache.logging.log4j.LogManager;
16 import org.apache.logging.log4j.Logger;
17 import org.apache.lucene.analysis.Analyzer;
18 import org.apache.lucene.queryparser.classic.ParseException;
19 import org.apache.lucene.search.BooleanClause.Occur;
20 import org.apache.lucene.search.BooleanQuery;
21 import org.apache.lucene.search.BooleanQuery.Builder;
22 import org.apache.lucene.search.IndexSearcher;
23 import org.apache.lucene.search.MultiCollector;
24 import org.apache.lucene.search.Query;
25 import org.apache.lucene.search.Sort;
26 import org.apache.lucene.search.SortField;
27 import org.apache.lucene.search.TopDocs;
28 import org.apache.lucene.search.grouping.GroupDocs;
29 import org.apache.lucene.search.grouping.SearchGroup;
30 import org.apache.lucene.search.grouping.TopGroups;
31 import org.apache.lucene.search.grouping.term.TermAllGroupsCollector;
32 import org.apache.lucene.search.grouping.term.TermFirstPassGroupingCollector;
33 import org.apache.lucene.search.grouping.term.TermSecondPassGroupingCollector;
34 import org.apache.lucene.util.BytesRef;
35
36 import eu.etaxonomy.cdm.model.common.CdmBase;
37 import eu.etaxonomy.cdm.model.description.DescriptionElementBase;
38 import eu.etaxonomy.cdm.model.description.TextData;
39 import eu.etaxonomy.cdm.model.taxon.Taxon;
40 import eu.etaxonomy.cdm.model.taxon.TaxonBase;
41
42 /**
43 * @author Andreas Kohlbecker
44 * @since Dec 21, 2011
45 */
46 public class LuceneSearch {
47
48 private static final Logger logger = LogManager.getLogger();
49
50 public final static String ID_FIELD = "id";
51
52 /**
53 * The MAX_HITS_ALLOWED value must be one less than Integer.MAX_VALUE
54 * otherwise PriorityQueue will produce an exception since it
55 * will always add 1 to the maxhits so Integer.MAX_VALUE
56 * would become Integer.MIN_VALUE
57 */
58 public final int MAX_HITS_ALLOWED = 10000;
59
60
61 protected String groupByField = "id";
62
63 protected ILuceneIndexToolProvider toolProvider;
64
65 protected IndexSearcher searcher;
66
67 protected SortField[] sortFields;
68
69 private Class<? extends CdmBase> directorySelectClass;
70
71 private BooleanQuery filter = null;
72
73 //class filter
74 protected Class<? extends CdmBase> cdmTypeRestriction;
75
76 protected BooleanQuery query;
77
78 protected String[] highlightFields = new String[0];
79
80 private int maxDocsPerGroup = 10;
81
82 protected Class<? extends CdmBase> getDirectorySelectClass() {
83 return pushAbstractBaseTypeDown(directorySelectClass);
84 }
85
86 public Class<? extends CdmBase> getCdmTypRestriction() {
87 return cdmTypeRestriction;
88 }
89
90 //filter
91 public BooleanQuery getFilter() {
92 return filter;
93 }
94 public void setFilter(BooleanQuery filter) {
95 this.filter = filter;
96 }
97
98 /**
99 * Sets the Class to use as filter criterion, in case the supplied Class equals the
100 * <code>directorySelectClass</code> the Class is set to <code>null</code>
101 * @param clazz
102 */
103 public void setCdmTypRestriction(Class<? extends CdmBase> clazz) {
104
105 /*
106 * NOTE:
107 * we must not use the getter of directorySelectClass
108 * since we need the abstract base classes here!!!!
109 */
110 if(clazz != null && clazz.equals(directorySelectClass)){
111 clazz = null;
112 }
113 this.cdmTypeRestriction = clazz;
114 }
115
116 public int getMaxDocsPerGroup() {
117 return maxDocsPerGroup;
118 }
119 public void setMaxDocsPerGroup(int maxDocsPerGroup) {
120 this.maxDocsPerGroup = maxDocsPerGroup;
121 }
122
123 public LuceneSearch(ILuceneIndexToolProvider toolProvider, Class<? extends CdmBase> directorySelectClass) {
124 this.toolProvider = toolProvider;
125 this.directorySelectClass = directorySelectClass;
126 }
127 public LuceneSearch(ILuceneIndexToolProvider toolProvider, String groupByField, Class<? extends CdmBase> directorySelectClass) {
128 this.toolProvider = toolProvider;
129 this.directorySelectClass = directorySelectClass;
130 this.groupByField = groupByField;
131 }
132
133 /**
134 * TODO the abstract base class DescriptionElementBase can not be used, so
135 * we are using an arbitrary subclass to find the DirectoryProvider, future
136 * versions of hibernate search my allow using abstract base classes see
137 * {@link http://stackoverflow.com/questions/492184/how-do-you-find-all-subclasses-of-a-given-class-in-java}
138 *
139 * @param type must not be null
140 * @return
141 */
142 private Class<? extends CdmBase> pushAbstractBaseTypeDown(Class<? extends CdmBase> type) {
143 Class<? extends CdmBase> returnType = type;
144 if (type.equals(DescriptionElementBase.class)) {
145 returnType = TextData.class;
146 }
147 if (type.equals(TaxonBase.class)) {
148 returnType = Taxon.class;
149 }
150 // if (type.equals(TaxonName.class)) {
151 // returnType = NonViralName.class;
152 // }
153 return returnType;
154 }
155
156 protected LuceneSearch() {
157
158 }
159
160 public IndexSearcher getSearcher() {
161 if(searcher == null){
162 searcher = new IndexSearcher(toolProvider.getIndexReaderFor(directorySelectClass));
163 // searcher.setDefaultFieldSortScoring(true, true);
164 }
165 return searcher;
166 }
167
168 /**
169 * Convenience method which delegated the call to the available
170 * {@link ILuceneIndexToolProvider#getAnalyzerFor(Class)} method.
171 *
172 * @return the Analyzer suitable for the <code>directorySelectClass</code>
173 * of the LuceneSearch
174 */
175 public Analyzer getAnalyzer() {
176 return toolProvider.getAnalyzerFor(directorySelectClass);
177 }
178
179 /**
180 * @param luceneQueryString
181 * @param cdmTypeRestriction the type as additional filter criterion
182 * @param pageSize if the page size is null or in an invalid range it will be set to MAX_HITS_ALLOWED
183 * @param pageNumber a 0-based index of the page to return, will default to 0 if null or negative.
184 * @return
185 * @throws ParseException
186 * @throws IOException
187 */
188 public TopGroups<BytesRef> executeSearch(String luceneQueryString, Integer pageSize, Integer pageNumber) throws ParseException, IOException {
189
190 Query luceneQuery = parse(luceneQueryString);
191 setQuery(luceneQuery);
192 return executeSearch(pageSize, pageNumber);
193 }
194
195 /**
196 * @param luceneQueryString
197 * @return
198 * @throws ParseException
199 */
200 public Query parse(String luceneQueryString) throws ParseException {
201 logger.debug("luceneQueryString to be parsed: " + luceneQueryString);
202 Query luceneQuery = toolProvider.getQueryParserFor(directorySelectClass, false).parse(luceneQueryString);
203 return luceneQuery;
204 }
205
206 /**
207 * @param maxNoOfHits
208 * @return
209 * @throws IOException
210 */
211 public TopDocs executeSearch(int maxNoOfHits) throws IOException {
212 BooleanQuery fullQuery = expandQuery();
213 logger.info("lucene query string to be parsed: " + fullQuery.toString());
214 return getSearcher().search(fullQuery, maxNoOfHits, Sort.RELEVANCE, true, true);
215
216 }
217 /**
218 * @param pageSize if the page size is null or in an invalid range it will be set to MAX_HITS_ALLOWED
219 * @param pageNumber a 0-based index of the page to return, will default to 0 if null or negative.
220 * @return
221 * @throws ParseException
222 * @throws IOException
223 */
224 public TopGroups<BytesRef> executeSearch(Integer pageSize, Integer pageNumber) throws ParseException, IOException {
225
226 if(pageNumber == null || pageNumber < 0){
227 pageNumber = 0;
228 }
229 if(pageSize == null || pageSize <= 0 || pageSize > MAX_HITS_ALLOWED){
230 pageSize = MAX_HITS_ALLOWED;
231 logger.info("limiting pageSize to MAX_HITS_ALLOWED = " + MAX_HITS_ALLOWED + " items");
232 }
233
234 BooleanQuery fullQuery = expandQuery();
235 logger.info("final query: " + fullQuery.toString());
236
237 int offset = pageNumber * pageSize;
238 int limit = (pageNumber + 1) * pageSize;
239 logger.debug("start: " + offset + "; limit:" + limit);
240
241 // sorting
242 Sort groupSort = null;
243 Sort withinGroupSort = Sort.RELEVANCE;
244 if(sortFields != null && sortFields.length > 0){
245 groupSort = new Sort(sortFields);
246 } else {
247 groupSort = Sort.RELEVANCE; // == SortField.FIELD_SCORE !!
248 }
249
250 // perform the search (needs two passes for grouping)
251 if(logger.isDebugEnabled()){
252 logger.debug("Grouping: sortFields=" + Arrays.toString(sortFields) + ", groupByField=" + groupByField +
253 ", groupSort=" + groupSort + ", withinGroupSort=" + withinGroupSort + ", limit=" + limit + ", maxDocsPerGroup="+ maxDocsPerGroup);
254 }
255 // - first pass
256 TermFirstPassGroupingCollector firstPassCollector = new TermFirstPassGroupingCollector(groupByField, groupSort, limit);
257
258 getSearcher().search(fullQuery, firstPassCollector);
259 Collection<SearchGroup<BytesRef>> topGroups = firstPassCollector.getTopGroups(0, true); // no offset here since we need the first item for the max score
260
261 if (topGroups == null) {
262 return null;
263 }
264 // - flags for second pass
265 boolean getScores = false;
266 boolean getMaxScores = true;
267 if(groupSort.getSort()[0] != SortField.FIELD_SCORE){
268 getMaxScores = false;
269 // see inner class TopGroupsWithMaxScore
270 logger.warn("Fist sort field must be SortField.FIELD_SCORE otherwise the max score value will not be correct! MaxScore calculation will be skipped");
271 }
272 boolean fillFields = true;
273 TermAllGroupsCollector allGroupsCollector = new TermAllGroupsCollector(groupByField);
274 TermSecondPassGroupingCollector secondPassCollector = new TermSecondPassGroupingCollector(
275 groupByField, topGroups, groupSort, withinGroupSort, maxDocsPerGroup , getScores,
276 getMaxScores, fillFields
277 );
278 getSearcher().search(fullQuery, MultiCollector.wrap(secondPassCollector, allGroupsCollector));
279
280 TopGroups<BytesRef> groupsResult = secondPassCollector.getTopGroups(0); // no offset here since we need the first item for the max score
281
282 // --- set the max score for the group results
283
284 // get max score from very first result
285 float maxScore = groupsResult.groups[0].maxScore;
286
287 if(logger.isDebugEnabled()){
288 logger.debug("TopGroups: maxScore=" + maxScore + ", offset=" + offset +
289 ", totalGroupCount=" + allGroupsCollector.getGroupCount() +
290 ", totalGroupedHitCount=" + groupsResult.totalGroupedHitCount);
291 }
292
293 TopGroups<BytesRef> newTopGroups;
294 if(offset > 0){
295 GroupDocs<BytesRef>[] newGroupDocs = new GroupDocs[groupsResult.groups.length - offset];
296 for(int i = offset; i < groupsResult.groups.length; i++){
297 newGroupDocs[i - offset] = groupsResult.groups[i];
298 }
299 newTopGroups = new TopGroups<BytesRef>(
300 groupsResult.groupSort,
301 groupsResult.withinGroupSort,
302 groupsResult.totalHitCount,
303 groupsResult.totalGroupedHitCount,
304 newGroupDocs,
305 maxScore);
306 } else {
307 newTopGroups = groupsResult;
308 }
309 TopGroups<BytesRef> topGroupsWithMaxScore = new TopGroups<>(newTopGroups, allGroupsCollector.getGroupCount());
310 // --- done with max score for the group results
311
312 return topGroupsWithMaxScore;
313 }
314
315 /**
316 * expands the query by adding a type restriction if the
317 * <code>cdmTypeRestriction</code> is not <code>NULL</code>
318 * and adds the <code>filter</code> as Boolean query
319 * clause with {@link Occur#FILTER}
320 */
321 protected BooleanQuery expandQuery() {
322 BooleanQuery fullQuery = null;
323 Builder fullQueryBuilder = null;
324
325 if(cdmTypeRestriction != null){
326 fullQueryBuilder = QueryFactory.addTypeRestriction(query, cdmTypeRestriction);
327 }
328
329 if(filter != null) {
330 if(fullQueryBuilder == null) {
331 fullQueryBuilder = new Builder();
332 fullQueryBuilder.add(this.query, Occur.MUST);
333 }
334 fullQueryBuilder.add(filter, Occur.FILTER);
335 }
336
337 if(fullQueryBuilder != null) {
338 fullQuery = fullQueryBuilder.build();
339 } else {
340 fullQuery = this.query;
341 }
342
343 logger.debug("expandedQuery: " + fullQuery.toString());
344 return fullQuery;
345 }
346
347 public void setQuery(Query query) {
348 if( query instanceof BooleanQuery) {
349 this.query = (BooleanQuery)query;
350 } else {
351 Builder builder = new Builder();
352 this.query = builder.add(query, Occur.MUST).build();
353 }
354 }
355
356 public BooleanQuery getQuery() {
357 return query;
358 }
359
360 public BooleanQuery getExpandedQuery() {
361 expandQuery();
362 return query;
363 }
364
365 public SortField[] getSortFields() {
366 return sortFields;
367 }
368
369 public void setSortFields(SortField[] sortFields) {
370 this.sortFields = sortFields;
371 }
372
373 public void setHighlightFields(String[] textFieldNamesAsArray) {
374 this.highlightFields = textFieldNamesAsArray;
375 }
376
377 public String[] getHighlightFields() {
378 return this.highlightFields;
379 }
380
381 }