filtering of lucene searches by distributions basically implemented, needs to be...
[cdmlib.git] / cdmlib-services / src / main / java / eu / etaxonomy / cdm / api / service / search / LuceneSearch.java
1 // $Id$
2 /**
3 * Copyright (C) 2011 EDIT
4 * European Distributed Institute of Taxonomy
5 * http://www.e-taxonomy.eu
6 *
7 * The contents of this file are subject to the Mozilla Public License Version 1.1
8 * See LICENSE.TXT at the top of this package for the full license terms.
9 */
10 package eu.etaxonomy.cdm.api.service.search;
11
12 import java.io.IOException;
13 import java.util.Collection;
14
15 import org.apache.log4j.Logger;
16 import org.apache.lucene.analysis.Analyzer;
17 import org.apache.lucene.index.IndexReader;
18 import org.apache.lucene.index.Term;
19 import org.apache.lucene.queryParser.ParseException;
20 import org.apache.lucene.queryParser.QueryParser;
21 import org.apache.lucene.search.BooleanClause;
22 import org.apache.lucene.search.BooleanQuery;
23 import org.apache.lucene.search.Filter;
24 import org.apache.lucene.search.IndexSearcher;
25 import org.apache.lucene.search.MultiCollector;
26 import org.apache.lucene.search.Query;
27 import org.apache.lucene.search.Sort;
28 import org.apache.lucene.search.SortField;
29 import org.apache.lucene.search.TermQuery;
30 import org.apache.lucene.search.TopDocs;
31 import org.apache.lucene.search.grouping.GroupDocs;
32 import org.apache.lucene.search.grouping.SearchGroup;
33 import org.apache.lucene.search.grouping.TermAllGroupsCollector;
34 import org.apache.lucene.search.grouping.TermFirstPassGroupingCollector;
35 import org.apache.lucene.search.grouping.TermSecondPassGroupingCollector;
36 import org.apache.lucene.search.grouping.TopGroups;
37 import org.hibernate.Session;
38 import org.hibernate.search.ProjectionConstants;
39 import org.hibernate.search.Search;
40 import org.hibernate.search.SearchFactory;
41
42 import eu.etaxonomy.cdm.config.Configuration;
43 import eu.etaxonomy.cdm.model.common.CdmBase;
44 import eu.etaxonomy.cdm.model.description.DescriptionElementBase;
45 import eu.etaxonomy.cdm.model.description.TextData;
46 import eu.etaxonomy.cdm.model.name.NonViralName;
47 import eu.etaxonomy.cdm.model.name.TaxonNameBase;
48 import eu.etaxonomy.cdm.model.taxon.Taxon;
49 import eu.etaxonomy.cdm.model.taxon.TaxonBase;
50
51 /**
52 *
53 * @author Andreas Kohlbecker
54 * @date Dec 21, 2011
55 *
56 */
57 public class LuceneSearch {
58
59 protected String groupByField = "id";
60
61 public final static String ID_FIELD = "id";
62
63 public static final Logger logger = Logger.getLogger(LuceneSearch.class);
64
65 protected Session session;
66
67 protected IndexSearcher searcher;
68
69 protected SortField[] sortFields;
70
71 private Class<? extends CdmBase> directorySelectClass;
72
73 private Filter filter = null;
74
75 protected Class<? extends CdmBase> getDirectorySelectClass() {
76 return pushAbstractBaseTypeDown(directorySelectClass);
77 }
78
79 /**
80 * classFilter
81 */
82 protected Class<? extends CdmBase> clazz;
83
84
85 public Class<? extends CdmBase> getClazz() {
86 return clazz;
87 }
88
89 /**
90 * @return the filter
91 */
92 public Filter getFilter() {
93 return filter;
94 }
95
96 /**
97 * @param filter the filter to set
98 */
99 public void setFilter(Filter filter) {
100 this.filter = filter;
101 }
102
103 /**
104 * Sets the Class to use as filter criterion, in case the supplied Class equals the
105 * <code>directorySelectClass</code> the Class is set to <code>null</code>
106 * @param clazz
107 */
108 public void setClazz(Class<? extends CdmBase> clazz) {
109
110 /*
111 * NOTE:
112 * we must not use the getter of directorySelectClass
113 * since we need the abstract base classes here!!!!
114 */
115 if(clazz != null && clazz.equals(directorySelectClass)){
116 clazz = null;
117 }
118 this.clazz = clazz;
119 }
120
121 /**
122 * The MAX_HITS_ALLOWED value must be one less than Integer.MAX_VALUE
123 * otherwise PriorityQueue will produce an exception since it
124 * will always add 1 to the maxhits so Integer.MAX_VALUE
125 * would become Integer.MIN_VALUE
126 */
127 public final int MAX_HITS_ALLOWED = 10000;
128
129 protected Query query;
130
131 protected String[] highlightFields = new String[0];
132
133 private int maxDocsPerGroup = 10;
134
135
136 public int getMaxDocsPerGroup() {
137 return maxDocsPerGroup;
138 }
139
140 public void setMaxDocsPerGroup(int maxDocsPerGroup) {
141 this.maxDocsPerGroup = maxDocsPerGroup;
142 }
143
144 /**
145 * @param session
146 */
147 public LuceneSearch(Session session, Class<? extends CdmBase> directorySelectClass) {
148 this.session = session;
149 this.directorySelectClass = directorySelectClass;
150 }
151
152 /**
153 * @param session
154 */
155 public LuceneSearch(Session session, String groupByField, Class<? extends CdmBase> directorySelectClass) {
156 this.session = session;
157 this.directorySelectClass = directorySelectClass;
158 this.groupByField = groupByField;
159 }
160
161 /**
162 * TODO the abstract base class DescriptionElementBase can not be used, so
163 * we are using an arbitraty subclass to find the DirectoryProvider, future
164 * versions of hibernate search my allow using abstract base classes see
165 * {@link http://stackoverflow.com/questions/492184/how-do-you-find-all-subclasses-of-a-given-class-in-java}
166 *
167 * @param type must not be null
168 * @return
169 */
170 protected Class<? extends CdmBase> pushAbstractBaseTypeDown(Class<? extends CdmBase> type) {
171 if (type.equals(DescriptionElementBase.class)) {
172 type = TextData.class;
173 }
174 if (type.equals(TaxonBase.class)) {
175 type = Taxon.class;
176 }
177 if (type.equals(TaxonNameBase.class)) {
178 type = NonViralName.class;
179 }
180 return type;
181 }
182
183 protected LuceneSearch() {
184
185 }
186
187 /**
188 * @return
189 */
190 public IndexSearcher getSearcher() {
191 if(searcher == null){
192 searcher = new IndexSearcher(getIndexReader());
193 searcher.setDefaultFieldSortScoring(true, true);
194 }
195 return searcher;
196 }
197
198 /**
199 * @return
200 */
201 public IndexReader getIndexReader() {
202 SearchFactory searchFactory = Search.getFullTextSession(session).getSearchFactory();
203 IndexReader reader = searchFactory.getIndexReaderAccessor().open(getDirectorySelectClass());
204 return reader;
205 }
206
207 /**
208 * @return
209 */
210 public IndexReader getIndexReaderFor(Class<? extends CdmBase> clazz) {
211 SearchFactory searchFactory = Search.getFullTextSession(session).getSearchFactory();
212 IndexReader reader = searchFactory.getIndexReaderAccessor().open(pushAbstractBaseTypeDown(clazz));
213 return reader;
214 }
215
216 /**
217 * @return
218 */
219 public QueryParser getQueryParser() {
220 Analyzer analyzer = getAnalyzer();
221 QueryParser parser = new QueryParser(Configuration.luceneVersion, "titleCache", analyzer);
222 return parser;
223 }
224
225 /**
226 * @return
227 */
228 public Analyzer getAnalyzer() {
229 SearchFactory searchFactory = Search.getFullTextSession(session).getSearchFactory();
230 Analyzer analyzer = searchFactory.getAnalyzer(getDirectorySelectClass());
231 return analyzer;
232 }
233
234 /**
235 * @param luceneQueryString
236 * @param clazz the type as additional filter criterion
237 * @param pageSize if the page size is null or in an invalid range it will be set to MAX_HITS_ALLOWED
238 * @param pageNumber a 0-based index of the page to return, will default to 0 if null or negative.
239 * @return
240 * @throws ParseException
241 * @throws IOException
242 */
243 public TopGroupsWithMaxScore executeSearch(String luceneQueryString, Integer pageSize, Integer pageNumber) throws ParseException, IOException {
244
245 Query luceneQuery = parse(luceneQueryString);
246 this.query = luceneQuery;
247
248 return executeSearch(pageSize, pageNumber);
249 }
250
251 /**
252 * @param luceneQueryString
253 * @return
254 * @throws ParseException
255 */
256 public Query parse(String luceneQueryString) throws ParseException {
257 logger.debug("luceneQueryString to be parsed: " + luceneQueryString);
258 Query luceneQuery = getQueryParser().parse(luceneQueryString);
259 return luceneQuery;
260 }
261
262 /**
263 * @param maxNoOfHits
264 * @return
265 * @throws IOException
266 */
267 public TopDocs executeSearch(int maxNoOfHits) throws IOException {
268 Query fullQuery = expandQuery();
269 logger.info("lucene query string to be parsed: " + fullQuery.toString());
270 return getSearcher().search(fullQuery, filter, maxNoOfHits);
271
272 }
273 /**
274 * @param pageSize if the page size is null or in an invalid range it will be set to MAX_HITS_ALLOWED
275 * @param pageNumber a 0-based index of the page to return, will default to 0 if null or negative.
276 * @return
277 * @throws ParseException
278 * @throws IOException
279 */
280 public TopGroupsWithMaxScore executeSearch(Integer pageSize, Integer pageNumber) throws ParseException, IOException {
281
282
283 if(pageNumber == null || pageNumber < 0){
284 pageNumber = 0;
285 }
286 if(pageSize == null || pageSize <= 0 || pageSize > MAX_HITS_ALLOWED){
287 pageSize = MAX_HITS_ALLOWED;
288 logger.info("limiting pageSize to MAX_HITS_ALLOWED = " + MAX_HITS_ALLOWED + " items");
289 }
290
291 Query fullQuery = expandQuery();
292 logger.info("final query: " + fullQuery.toString());
293
294 int offset = pageNumber * pageSize;
295 int limit = (pageNumber + 1) * pageSize - 1 ;
296 logger.debug("start: " + offset + "; limit:" + limit);
297
298 // sorting
299 Sort groupSort = null;
300 Sort withinGroupSort = Sort.RELEVANCE;
301 if(sortFields != null && sortFields.length > 0){
302 if(sortFields[0] != SortField.FIELD_SCORE){
303 throw new RuntimeException("Fist sort field must be SortField.FIELD_SCORE");
304 }
305 groupSort = new Sort(sortFields);
306 } else {
307 groupSort = Sort.RELEVANCE; // == SortField.FIELD_SCORE !!
308 }
309
310 // perform the search (needs two passes for grouping)
311 if(logger.isDebugEnabled()){
312 logger.debug("Grouping: sortFields=" + sortFields + ", groupByField=" + groupByField +
313 ", groupSort=" + groupSort + ", withinGroupSort=" + withinGroupSort + ", limit=" + limit + ", maxDocsPerGroup="+ maxDocsPerGroup);
314 }
315 // - first pass
316 TermFirstPassGroupingCollector firstPassCollector = new TermFirstPassGroupingCollector(groupByField, withinGroupSort, limit);
317
318 getSearcher().search(fullQuery, filter , firstPassCollector);
319 Collection<SearchGroup<String>> topGroups = firstPassCollector.getTopGroups(0, true); // no offset here since we need the first item for the max score
320
321 if (topGroups == null) {
322 return null;
323 }
324 // - second pass
325 boolean getScores = true;
326 boolean getMaxScores = true;
327 boolean fillFields = true;
328 TermAllGroupsCollector allGroupsCollector = new TermAllGroupsCollector(groupByField);
329 TermSecondPassGroupingCollector secondPassCollector = new TermSecondPassGroupingCollector(
330 groupByField, topGroups, groupSort, withinGroupSort, maxDocsPerGroup , getScores, getMaxScores, fillFields
331 );
332 getSearcher().search(fullQuery, filter, MultiCollector.wrap(secondPassCollector, allGroupsCollector));
333
334 TopGroups<String> groupsResult = secondPassCollector.getTopGroups(0); // no offset here since we need the first item for the max score
335
336 // get max score from very first result
337 float maxScore = groupsResult.groups[0].maxScore;
338 if(logger.isDebugEnabled()){
339 logger.debug("TopGroups: maxScore=" + maxScore + ", offset=" + offset +
340 ", totalGroupCount=" + allGroupsCollector.getGroupCount() + ", totalGroupedHitCount=" + groupsResult.totalGroupedHitCount);
341 }
342 TopGroupsWithMaxScore topGroupsWithMaxScore = new TopGroupsWithMaxScore(groupsResult, offset, allGroupsCollector.getGroupCount(), maxScore);
343
344 return topGroupsWithMaxScore;
345 }
346
347 /**
348 * @param clazz
349 */
350 protected Query expandQuery() {
351 Query fullQuery;
352 if(clazz != null){
353 BooleanQuery filteredQuery = new BooleanQuery();
354 BooleanQuery classFilter = new BooleanQuery();
355
356 Term t = new Term(ProjectionConstants.OBJECT_CLASS, clazz.getName());
357 TermQuery termQuery = new TermQuery(t);
358
359 classFilter.setBoost(0);
360 classFilter.add(termQuery, BooleanClause.Occur.SHOULD);
361
362 filteredQuery.add(this.query, BooleanClause.Occur.MUST);
363 filteredQuery.add(classFilter, BooleanClause.Occur.MUST);
364
365 fullQuery = filteredQuery;
366 } else {
367 fullQuery = this.query;
368 }
369 return fullQuery;
370 }
371
372 public void setQuery(Query query) {
373 this.query = query;
374 }
375
376 public Query getQuery() {
377 return query;
378 }
379
380 public Query getExpandedQuery() {
381 expandQuery();
382 return query;
383 }
384
385 public SortField[] getSortFields() {
386 return sortFields;
387 }
388
389 public void setSortFields(SortField[] sortFields) {
390 this.sortFields = sortFields;
391 }
392
393 public void setHighlightFields(String[] textFieldNamesAsArray) {
394 this.highlightFields = textFieldNamesAsArray;
395 }
396
397 public String[] getHighlightFields() {
398 return this.highlightFields;
399 }
400
401 /**
402 * may become obsolete with lucene 4.x when the TopGroups has a field for maxScore.
403 *
404 * @author a.kohlbecker
405 * @date Oct 4, 2012
406 *
407 */
408 public class TopGroupsWithMaxScore{
409 public TopGroups<String> topGroups;
410 public float maxScore = Float.NaN;
411
412 TopGroupsWithMaxScore(TopGroups<String> topGroups, int offset, int totalGroupCount, float maxScore){
413 this.maxScore = maxScore;
414 TopGroups<String> newTopGroups;
415 if(offset > 0){
416 GroupDocs<String>[] newGroupDocs = new GroupDocs[topGroups.groups.length - offset];
417 for(int i = offset; i < topGroups.groups.length; i++){
418 newGroupDocs[i - offset] = topGroups.groups[i];
419 }
420 newTopGroups = new TopGroups<String>(
421 topGroups.groupSort,
422 topGroups.withinGroupSort,
423 topGroups.totalHitCount,
424 topGroups.totalGroupedHitCount,
425 newGroupDocs);
426 } else {
427 newTopGroups = topGroups;
428 }
429 this.topGroups = new TopGroups<String>(newTopGroups, totalGroupCount);
430 }
431
432 }
433
434 }