INameService : added findByNameFuzzySearch method to search names by fuzzy matching
[cdmlib.git] / cdmlib-services / src / main / java / eu / etaxonomy / cdm / api / service / search / LuceneSearch.java
1 // $Id$
2 /**
3 * Copyright (C) 2011 EDIT
4 * European Distributed Institute of Taxonomy
5 * http://www.e-taxonomy.eu
6 *
7 * The contents of this file are subject to the Mozilla Public License Version 1.1
8 * See LICENSE.TXT at the top of this package for the full license terms.
9 */
10 package eu.etaxonomy.cdm.api.service.search;
11
12 import java.io.IOException;
13 import java.util.Collection;
14
15 import org.apache.log4j.Logger;
16 import org.apache.lucene.analysis.Analyzer;
17 import org.apache.lucene.index.IndexReader;
18 import org.apache.lucene.index.Term;
19 import org.apache.lucene.queryParser.ParseException;
20 import org.apache.lucene.queryParser.QueryParser;
21 import org.apache.lucene.search.BooleanClause;
22 import org.apache.lucene.search.BooleanQuery;
23 import org.apache.lucene.search.IndexSearcher;
24 import org.apache.lucene.search.MultiCollector;
25 import org.apache.lucene.search.Query;
26 import org.apache.lucene.search.Sort;
27 import org.apache.lucene.search.SortField;
28 import org.apache.lucene.search.TermQuery;
29 import org.apache.lucene.search.TopDocs;
30 import org.apache.lucene.search.grouping.GroupDocs;
31 import org.apache.lucene.search.grouping.SearchGroup;
32 import org.apache.lucene.search.grouping.TermAllGroupsCollector;
33 import org.apache.lucene.search.grouping.TermFirstPassGroupingCollector;
34 import org.apache.lucene.search.grouping.TermSecondPassGroupingCollector;
35 import org.apache.lucene.search.grouping.TopGroups;
36 import org.hibernate.Session;
37 import org.hibernate.search.ProjectionConstants;
38 import org.hibernate.search.Search;
39 import org.hibernate.search.SearchFactory;
40
41 import eu.etaxonomy.cdm.config.Configuration;
42 import eu.etaxonomy.cdm.hibernate.search.GroupByTaxonClassBridge;
43 import eu.etaxonomy.cdm.model.common.CdmBase;
44 import eu.etaxonomy.cdm.model.description.DescriptionElementBase;
45 import eu.etaxonomy.cdm.model.description.TextData;
46 import eu.etaxonomy.cdm.model.name.TaxonNameBase;
47 import eu.etaxonomy.cdm.model.name.NonViralName;
48 import eu.etaxonomy.cdm.model.taxon.Taxon;
49 import eu.etaxonomy.cdm.model.taxon.TaxonBase;
50
51 /**
52 *
53 * @author Andreas Kohlbecker
54 * @date Dec 21, 2011
55 *
56 */
57 public class LuceneSearch {
58
59 private static final String GROUP_BY_FIELD = GroupByTaxonClassBridge.GROUPBY_TAXON_FIELD;
60
61 public final static String ID_FIELD = "id";
62
63 public static final Logger logger = Logger.getLogger(LuceneSearch.class);
64
65 protected Session session;
66
67 protected IndexSearcher searcher;
68
69 private SortField[] sortFields;
70
71 private Class<? extends CdmBase> directorySelectClass;
72
73 protected Class<? extends CdmBase> getDirectorySelectClass() {
74 return pushAbstractBaseTypeDown(directorySelectClass);
75 }
76
77 /**
78 * classFilter
79 */
80 private Class<? extends CdmBase> clazz;
81
82
83 public Class<? extends CdmBase> getClazz() {
84 return clazz;
85 }
86
87 /**
88 * Sets the Class to use as filter criterion, in case the supplied Class equals the
89 * <code>directorySelectClass</code> the Class is set to <code>null</code>
90 * @param clazz
91 */
92 public void setClazz(Class<? extends CdmBase> clazz) {
93
94 /*
95 * NOTE:
96 * we must not use the getter of directorySelectClass
97 * since we need the abstract base classes here!!!!
98 */
99 if(clazz != null && clazz.equals(directorySelectClass)){
100 clazz = null;
101 }
102 this.clazz = clazz;
103 }
104
105 /**
106 * The MAX_HITS_ALLOWED value must be one less than Integer.MAX_VALUE
107 * otherwise PriorityQueue will produce an exception since it
108 * will always add 1 to the maxhits so Integer.MAX_VALUE
109 * would become Integer.MIN_VALUE
110 */
111 public final int MAX_HITS_ALLOWED = 10000;
112
113 protected Query query;
114
115 protected String[] highlightFields = new String[0];
116
117 private int maxDocsPerGroup = 10;
118
119
120 public int getMaxDocsPerGroup() {
121 return maxDocsPerGroup;
122 }
123
124 public void setMaxDocsPerGroup(int maxDocsPerGroup) {
125 this.maxDocsPerGroup = maxDocsPerGroup;
126 }
127
128 /**
129 * @param session
130 */
131 public LuceneSearch(Session session, Class<? extends CdmBase> directorySelectClass) {
132 this.session = session;
133 this.directorySelectClass = directorySelectClass;
134 }
135
136 /**
137 * TODO the abstract base class DescriptionElementBase can not be used, so
138 * we are using an arbitraty subclass to find the DirectoryProvider, future
139 * versions of hibernate search my allow using abstract base classes see
140 * http
141 * ://stackoverflow.com/questions/492184/how-do-you-find-all-subclasses-of
142 * -a-given-class-in-java
143 *
144 * @param type must not be null
145 * @return
146 */
147 protected Class<? extends CdmBase> pushAbstractBaseTypeDown(Class<? extends CdmBase> type) {
148 if (type.equals(DescriptionElementBase.class)) {
149 type = TextData.class;
150 }
151 if (type.equals(TaxonBase.class)) {
152 type = Taxon.class;
153 }
154 if (type.equals(TaxonNameBase.class)) {
155 type = NonViralName.class;
156 }
157 return type;
158 }
159
160 protected LuceneSearch() {
161
162 }
163
164 /**
165 * @return
166 */
167 public IndexSearcher getSearcher() {
168 if(searcher == null){
169 searcher = new IndexSearcher(getIndexReader());
170 searcher.setDefaultFieldSortScoring(true, true);
171 }
172 return searcher;
173 }
174
175 /**
176 * @return
177 */
178 public IndexReader getIndexReader() {
179 SearchFactory searchFactory = Search.getFullTextSession(session).getSearchFactory();
180
181 // OLD
182 // DirectoryProvider[] directoryProviders = searchFactory.getDirectoryProviders(getDirectorySelectClass());
183 // logger.info(directoryProviders[0].getDirectory().toString());
184
185 // ReaderProvider readerProvider = searchFactory.getReaderProvider();
186 // IndexReader reader = readerProvider.openReader(directoryProviders[0]);
187
188 IndexReader reader = searchFactory.getIndexReaderAccessor().open(getDirectorySelectClass());
189 return reader;
190 }
191
192 /**
193 * @return
194 */
195 public QueryParser getQueryParser() {
196 Analyzer analyzer = getAnalyzer();
197 QueryParser parser = new QueryParser(Configuration.luceneVersion, "titleCache", analyzer);
198 return parser;
199 }
200
201 /**
202 * @return
203 */
204 public Analyzer getAnalyzer() {
205 SearchFactory searchFactory = Search.getFullTextSession(session).getSearchFactory();
206 Analyzer analyzer = searchFactory.getAnalyzer(getDirectorySelectClass());
207 return analyzer;
208 }
209
210 /**
211 * @param luceneQueryString
212 * @param clazz the type as additional filter criterion
213 * @param pageSize if the page size is null or in an invalid range it will be set to MAX_HITS_ALLOWED
214 * @param pageNumber a 0-based index of the page to return, will default to 0 if null or negative.
215 * @return
216 * @throws ParseException
217 * @throws IOException
218 */
219 public TopGroupsWithMaxScore executeSearch(String luceneQueryString, Integer pageSize, Integer pageNumber) throws ParseException, IOException {
220
221 Query luceneQuery = parse(luceneQueryString);
222 this.query = luceneQuery;
223
224 return executeSearch(pageSize, pageNumber);
225 }
226
227 /**
228 * @param luceneQueryString
229 * @return
230 * @throws ParseException
231 */
232 public Query parse(String luceneQueryString) throws ParseException {
233 logger.debug("luceneQueryString to be parsed: " + luceneQueryString);
234 Query luceneQuery = getQueryParser().parse(luceneQueryString);
235 return luceneQuery;
236 }
237
238 /**
239 * @param maxNoOfHits
240 * @return
241 * @throws IOException
242 */
243 public TopDocs executeSearch(int maxNoOfHits) throws IOException {
244 Query fullQuery = expandQuery();
245 logger.info("lucene query string to be parsed: " + fullQuery.toString());
246 return getSearcher().search(fullQuery, maxNoOfHits);
247
248 }
249 /**
250 * @param luceneQuery
251 * @param clazz the type as additional filter criterion
252 * @param pageSize if the page size is null or in an invalid range it will be set to MAX_HITS_ALLOWED
253 * @param pageNumber a 0-based index of the page to return, will default to 0 if null or negative.
254 * @return
255 * @throws ParseException
256 * @throws IOException
257 */
258 public TopGroupsWithMaxScore executeSearch(Integer pageSize, Integer pageNumber) throws ParseException, IOException {
259
260
261 if(pageNumber == null || pageNumber < 0){
262 pageNumber = 0;
263 }
264 if(pageSize == null || pageSize <= 0 || pageSize > MAX_HITS_ALLOWED){
265 pageSize = MAX_HITS_ALLOWED;
266 logger.info("limiting pageSize to MAX_HITS_ALLOWED = " + MAX_HITS_ALLOWED + " items");
267 }
268
269 Query fullQuery = expandQuery();
270 logger.info("final query: " + fullQuery.toString());
271
272 int offset = pageNumber * pageSize;
273 int limit = (pageNumber + 1) * pageSize - 1 ;
274 logger.debug("start: " + offset + "; limit:" + limit);
275
276 // sorting
277 Sort groupSort = null;
278 Sort withinGroupSort = Sort.RELEVANCE;
279 if(sortFields != null && sortFields.length > 0){
280 if(sortFields[0] != SortField.FIELD_SCORE){
281 throw new RuntimeException("Fist sort field must be SortField.FIELD_SCORE");
282 }
283 groupSort = new Sort(sortFields);
284 } else {
285 groupSort = Sort.RELEVANCE; // == SortField.FIELD_SCORE !!
286 }
287
288 // perform the search (needs two passes for grouping)
289 // - first pass
290 TermFirstPassGroupingCollector firstPassCollector = new TermFirstPassGroupingCollector(GROUP_BY_FIELD, withinGroupSort, limit);
291 getSearcher().search(fullQuery, firstPassCollector);
292 Collection<SearchGroup<String>> topGroups = firstPassCollector.getTopGroups(0, true); // no offset here since we need the first item for the max score
293
294 if (topGroups == null) {
295 return null;
296 }
297 // - second pass
298 boolean getScores = true;
299 boolean getMaxScores = true;
300 boolean fillFields = true;
301 TermAllGroupsCollector allGroupsCollector = new TermAllGroupsCollector(GROUP_BY_FIELD);
302 TermSecondPassGroupingCollector secondPassCollector = new TermSecondPassGroupingCollector(GROUP_BY_FIELD, topGroups, groupSort, withinGroupSort, maxDocsPerGroup , getScores, getMaxScores, fillFields);
303 getSearcher().search(fullQuery, MultiCollector.wrap(secondPassCollector, allGroupsCollector));
304
305 TopGroups<String> groupsResult = secondPassCollector.getTopGroups(0); // no offset here since we need the first item for the max score
306
307 // get max score from very first result
308 float maxScore = groupsResult.groups[0].maxScore;
309 TopGroupsWithMaxScore topGroupsWithMaxScore = new TopGroupsWithMaxScore(groupsResult, offset, allGroupsCollector.getGroupCount(), maxScore);
310
311 return topGroupsWithMaxScore;
312 }
313
314 /**
315 * @param clazz
316 */
317 protected Query expandQuery() {
318 Query fullQuery;
319 if(clazz != null){
320 BooleanQuery filteredQuery = new BooleanQuery();
321 BooleanQuery classFilter = new BooleanQuery();
322
323 Term t = new Term(ProjectionConstants.OBJECT_CLASS, clazz.getName());
324 TermQuery termQuery = new TermQuery(t);
325
326 classFilter.setBoost(0);
327 classFilter.add(termQuery, BooleanClause.Occur.SHOULD);
328
329 filteredQuery.add(this.query, BooleanClause.Occur.MUST);
330 filteredQuery.add(classFilter, BooleanClause.Occur.MUST);
331
332 fullQuery = filteredQuery;
333 } else {
334 fullQuery = this.query;
335 }
336 return fullQuery;
337 }
338
339 public void setQuery(Query query) {
340 this.query = query;
341 }
342
343 public Query getQuery() {
344 return query;
345 }
346
347 public Query getExpandedQuery() {
348 expandQuery();
349 return query;
350 }
351
352 public SortField[] getSortFields() {
353 return sortFields;
354 }
355
356 public void setSortFields(SortField[] sortFields) {
357 this.sortFields = sortFields;
358 }
359
360 public void setHighlightFields(String[] textFieldNamesAsArray) {
361 this.highlightFields = textFieldNamesAsArray;
362 }
363
364 public String[] getHighlightFields() {
365 return this.highlightFields;
366 }
367
368 /**
369 * may become obsolete with lucene 4.x when the TopGroups has a field for maxScore.
370 *
371 * @author a.kohlbecker
372 * @date Oct 4, 2012
373 *
374 */
375 public class TopGroupsWithMaxScore{
376 public TopGroups<String> topGroups;
377 public float maxScore = Float.NaN;
378
379 TopGroupsWithMaxScore(TopGroups<String> topGroups, int offset, int totalGroupCount, float maxScore){
380 this.maxScore = maxScore;
381 TopGroups<String> newTopGroups;
382 if(offset > 0){
383 GroupDocs<String>[] newGroupDocs = new GroupDocs[topGroups.groups.length - offset];
384 for(int i = offset; i < topGroups.groups.length; i++){
385 newGroupDocs[i - offset] = topGroups.groups[i];
386 }
387 newTopGroups = new TopGroups<String>(
388 topGroups.groupSort,
389 topGroups.withinGroupSort,
390 topGroups.totalHitCount,
391 topGroups.totalGroupedHitCount,
392 newGroupDocs);
393 } else {
394 newTopGroups = topGroups;
395 }
396 this.topGroups = new TopGroups<String>(newTopGroups, totalGroupCount);
397 }
398
399 }
400
401 }