// $Id$
/**
 * Copyright (C) 2011 EDIT
 * European Distributed Institute of Taxonomy
 * http://www.e-taxonomy.eu
 *
 * The contents of this file are subject to the Mozilla Public License Version 1.1
 * See LICENSE.TXT at the top of this package for the full license terms.
 */
package eu.etaxonomy.cdm.api.service.search;

import java.io.IOException;
import java.util.Collection;

import org.apache.log4j.Logger;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MultiCollector;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.Searcher;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.grouping.AllGroupsCollector;
import org.apache.lucene.search.grouping.FirstPassGroupingCollector;
import org.apache.lucene.search.grouping.SearchGroup;
import org.apache.lucene.search.grouping.SecondPassGroupingCollector;
import org.apache.lucene.search.grouping.TopGroups;
import org.hibernate.Session;
import org.hibernate.search.Search;
import org.hibernate.search.SearchFactory;
import org.hibernate.search.engine.DocumentBuilder;
import org.hibernate.search.reader.ReaderProvider;
import org.hibernate.search.store.DirectoryProvider;

import eu.etaxonomy.cdm.model.common.CdmBase;
import eu.etaxonomy.cdm.model.description.DescriptionElementBase;
import eu.etaxonomy.cdm.model.description.TextData;
import eu.etaxonomy.cdm.model.taxon.Taxon;
import eu.etaxonomy.cdm.model.taxon.TaxonBase;

/**
 * Wraps a Lucene full-text search against the Hibernate Search index of a CDM
 * base type. Supports an optional subclass filter and groups the resulting
 * documents by their "id" field.
 *
 * @author Andreas Kohlbecker
 * @date Dec 21, 2011
 *
 */
public class LuceneSearch {

    private static final String GROUP_BY_FIELD = "id";

    public static final Logger logger = Logger.getLogger(LuceneSearch.class);

    protected Session session;

    protected IndexSearcher searcher;

    private SortField[] sortFields;

    private Class<? extends CdmBase> directorySelectClass;

    protected Class<? extends CdmBase> getDirectorySelectClass() {
        return pushAbstractBaseTypeDown(directorySelectClass);
    }

    /**
     * Optional filter criterion: when set, search hits are restricted to
     * documents of this class. See {@link #setClazz(Class)}.
     */
    private Class<? extends CdmBase> clazz;

    public Class<? extends CdmBase> getClazz() {
        return clazz;
    }

    /**
     * Sets the Class to use as an additional filter criterion. If the supplied
     * Class equals the <code>directorySelectClass</code>, the filter would be
     * redundant, so the Class is set to <code>null</code>.
     * @param clazz
     */
    public void setClazz(Class<? extends CdmBase> clazz) {

        /*
         * NOTE:
         * we must not use the getter of directorySelectClass
         * since we need the abstract base classes here!
         */
        if(clazz != null && clazz.equals(directorySelectClass)){
            clazz = null;
        }
        this.clazz = clazz;
    }
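
    /*
     * Usage sketch (illustrative only, not part of this class): shows how the
     * class filter interacts with the directorySelectClass passed to the
     * constructor; "session" is assumed to be an open Hibernate Session.
     *
     *   LuceneSearch search = new LuceneSearch(session, TaxonBase.class);
     *   search.setClazz(Taxon.class);     // hits will be restricted to Taxon documents
     *   search.setClazz(TaxonBase.class); // equals directorySelectClass -> filter cleared, getClazz() == null
     */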

    /**
     * MAX_HITS_ALLOWED must stay below Integer.MAX_VALUE: Lucene's
     * PriorityQueue adds 1 to the requested number of hits, so a value of
     * Integer.MAX_VALUE would overflow to Integer.MIN_VALUE.
     */
    public final int MAX_HITS_ALLOWED = 10000;
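
    /*
     * For illustration, the overflow the comment above refers to:
     *
     *   Integer.MAX_VALUE + 1 == Integer.MIN_VALUE   // 2147483647 + 1 wraps around to -2147483648
     */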

    protected Query query;

    protected String[] highlightFields = new String[0];

    /**
     * @param session the Hibernate session from which the Hibernate Search
     *            SearchFactory is obtained
     * @param directorySelectClass the type used to choose the index
     *            (DirectoryProvider) to search in
     */
    public LuceneSearch(Session session, Class<? extends CdmBase> directorySelectClass) {
        this.session = session;
        this.directorySelectClass = directorySelectClass;
    }

    /**
     * TODO the abstract base class DescriptionElementBase can not be used, so
     * we are using an arbitrary subclass to find the DirectoryProvider; future
     * versions of Hibernate Search may allow using abstract base classes, see
     * http://stackoverflow.com/questions/492184/how-do-you-find-all-subclasses-of-a-given-class-in-java
     *
     * @param type must not be null
     * @return the concrete subclass to be used for the DirectoryProvider lookup
     */
    protected Class<? extends CdmBase> pushAbstractBaseTypeDown(Class<? extends CdmBase> type) {
        if (type.equals(DescriptionElementBase.class)) {
            type = TextData.class;
        }
        if (type.equals(TaxonBase.class)) {
            type = Taxon.class;
        }
        return type;
    }
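
    /*
     * Example (illustrative) of the mapping applied above:
     *
     *   pushAbstractBaseTypeDown(DescriptionElementBase.class); // -> TextData.class
     *   pushAbstractBaseTypeDown(TaxonBase.class);              // -> Taxon.class
     *   pushAbstractBaseTypeDown(Taxon.class);                  // -> Taxon.class (unchanged)
     */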

    protected LuceneSearch() {

    }

    /**
     * @return the Searcher for the selected index; the underlying IndexSearcher
     *         is created lazily and configured to also track document scores
     *         when sorting by fields
     */
    public Searcher getSearcher() {
        if(searcher == null){
            searcher = new IndexSearcher(getIndexReader());
            searcher.setDefaultFieldSortScoring(true, true);
        }
        return searcher;
    }

    /**
     * @return an IndexReader for the first DirectoryProvider that Hibernate
     *         Search associates with the {@link #getDirectorySelectClass()}
     */
    public IndexReader getIndexReader() {
        SearchFactory searchFactory = Search.getFullTextSession(session).getSearchFactory();

        DirectoryProvider[] directoryProviders = searchFactory.getDirectoryProviders(getDirectorySelectClass());
        logger.info(directoryProviders[0].getDirectory().toString());

        ReaderProvider readerProvider = searchFactory.getReaderProvider();
        IndexReader reader = readerProvider.openReader(directoryProviders[0]);
        return reader;
    }

    /**
     * @return a QueryParser which uses "titleCache" as the default field and
     *         the Analyzer of the selected index
     */
    public QueryParser getQueryParser() {
        Analyzer analyzer = getAnalyzer();
        QueryParser parser = new QueryParser("titleCache", analyzer);
        return parser;
    }

    /**
     * @return the Analyzer which Hibernate Search uses for the
     *         {@link #getDirectorySelectClass()}
     */
    public Analyzer getAnalyzer() {
        SearchFactory searchFactory = Search.getFullTextSession(session).getSearchFactory();
        Analyzer analyzer = searchFactory.getAnalyzer(getDirectorySelectClass());
        return analyzer;
    }

    /**
     * @param luceneQueryString the query string to parse and execute
     * @param pageSize if the page size is null or in an invalid range it will be set to MAX_HITS_ALLOWED
     * @param pageNumber a 0-based index of the page to return, will default to 0 if null or negative.
     * @return the grouped search results
     * @throws ParseException
     * @throws IOException
     */
    public TopGroups executeSearch(String luceneQueryString, Integer pageSize, Integer pageNumber) throws ParseException, IOException {

        Query luceneQuery = parse(luceneQueryString);
        this.query = luceneQuery;

        return executeSearch(pageSize, pageNumber);
    }
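
    /*
     * Usage sketch (illustrative only; the query string and page values are made up):
     *
     *   LuceneSearch search = new LuceneSearch(session, TaxonBase.class);
     *   search.setClazz(Taxon.class);                                       // optional subclass filter
     *   TopGroups page = search.executeSearch("titleCache:Abies*", 25, 0);  // first page, 25 hits per page
     */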

    /**
     * @param luceneQueryString
     * @return the Query parsed from the luceneQueryString
     * @throws ParseException
     */
    public Query parse(String luceneQueryString) throws ParseException {
        logger.debug("luceneQueryString to be parsed: " + luceneQueryString);
        Query luceneQuery = getQueryParser().parse(luceneQueryString);
        return luceneQuery;
    }
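
    /*
     * Example (illustrative; assumes the configured analyzer lower-cases terms,
     * as the standard analyzer does):
     *
     *   Query q = search.parse("Abies alba");
     *   // parsed against the default field "titleCache",
     *   // roughly equivalent to: titleCache:abies titleCache:alba
     */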

    /**
     * Executes the previously set {@link #getQuery() query} against the index.
     *
     * @param pageSize if the page size is null or in an invalid range it will be set to MAX_HITS_ALLOWED
     * @param pageNumber a 0-based index of the page to return, will default to 0 if null or negative.
     * @return the grouped search results
     * @throws ParseException
     * @throws IOException
     */
    public TopGroups executeSearch(Integer pageSize, Integer pageNumber) throws ParseException, IOException {

        if(pageNumber == null || pageNumber < 0){
            pageNumber = 0;
        }
        if(pageSize == null || pageSize <= 0 || pageSize > MAX_HITS_ALLOWED){
            pageSize = MAX_HITS_ALLOWED;
            logger.info("limiting pageSize to MAX_HITS_ALLOWED = " + MAX_HITS_ALLOWED + " items");
        }

        Query fullQuery = expandQuery();

        logger.info("final query: " + fullQuery.toString());

        // paging window: offset of the first hit and the number of
        // hits/groups to collect for this page
        int offset = pageNumber * pageSize;
        int limit = (pageNumber + 1) * pageSize - 1;

        logger.debug("start: " + offset + "; limit:" + limit);

        // TopDocs topDocs = null;

        // the Sort must not be null; default: Sort.RELEVANCE
        Sort groupSort = null;
        Sort withinGroupSort = Sort.RELEVANCE;
        if(sortFields != null && sortFields.length > 0){
            groupSort = new Sort(sortFields);
            // topDocs = getSearcher().search(fullQuery, null, limit, sort);
        } else {
            groupSort = Sort.RELEVANCE; // == SortField.FIELD_SCORE !!
            // topDocs = getSearcher().search(fullQuery, null, limit);
        }

        // first pass: collect the top groups (grouped by GROUP_BY_FIELD, ordered by groupSort)
        FirstPassGroupingCollector groupingCollector_1 = new FirstPassGroupingCollector(GROUP_BY_FIELD, groupSort, limit);
        getSearcher().search(fullQuery, groupingCollector_1);

        Collection<SearchGroup> topGroups = groupingCollector_1.getTopGroups(offset, true);

        if (topGroups == null) {
            return null;
        }

        // second pass: collect the documents per group; the AllGroupsCollector
        // additionally counts the total number of groups matching the query
        boolean getScores = true;
        boolean getMaxScores = true;
        boolean fillFields = true;
        AllGroupsCollector c3 = new AllGroupsCollector(GROUP_BY_FIELD);
        SecondPassGroupingCollector c2 = new SecondPassGroupingCollector(GROUP_BY_FIELD, topGroups, groupSort, withinGroupSort, limit, getScores, getMaxScores, fillFields);
        getSearcher().search(fullQuery, MultiCollector.wrap(c2, c3));

        TopGroups groupsResult = c2.getTopGroups(offset);
        groupsResult = new TopGroups(groupsResult, c3.getGroupCount());

        return groupsResult;

        //TODO when switched to Lucene 3.x, which is included in Hibernate Search 4.x,
        //     use TopDocsCollector.topDocs(int start, int howMany);
        //     since this method might be more memory efficient than our own implementation
        //
        //     ALSO READ http://dev.e-taxonomy.eu/trac/ticket/3118 !!!
        //
        //        TopDocs topDocs = hitCollector.topDocs();
        //        ScoreDoc[] scoreDocs = topDocs.scoreDocs;

        //        int docsAvailableInPage = Math.min(scoreDocs.length - offset, pageSize);
        //        logger.debug("docsAvailableInPage:" + docsAvailableInPage);
        //
        //        ScoreDoc[] pagedDocs = new ScoreDoc[docsAvailableInPage];
        //        for(int i = 0; i < docsAvailableInPage; i++){
        //            pagedDocs[i] = scoreDocs[offset + i];
        //        }
        //        TopDocs pagedTopDocs = new TopDocs(topDocs.totalHits, pagedDocs, topDocs.getMaxScore());
        //
        /////////////////////////////////////////////

        //        return pagedTopDocs;
    }
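
    /*
     * Sketch of how a caller might consume the returned TopGroups (illustrative
     * only; field names follow the Lucene grouping API used above):
     *
     *   TopGroups groups = search.executeSearch(25, 0);
     *   if(groups != null){
     *       for(GroupDocs groupDocs : groups.groups){          // one entry per "id" group
     *           for(ScoreDoc scoreDoc : groupDocs.scoreDocs){  // documents within the group
     *               Document doc = search.getSearcher().doc(scoreDoc.doc);
     *               // ... map the Lucene document back to a CDM entity
     *           }
     *       }
     *   }
     */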

    /**
     * Expands the current query: if a {@link #getClazz() class filter} is set,
     * the query is wrapped into a BooleanQuery which additionally requires the
     * matching documents to be of that class.
     *
     * @return the expanded query, or the original query if no class filter is set
     */
    protected Query expandQuery() {
        Query fullQuery;
        if(clazz != null){
            BooleanQuery filteredQuery = new BooleanQuery();
            BooleanQuery classFilter = new BooleanQuery();

            Term t = new Term(DocumentBuilder.CLASS_FIELDNAME, clazz.getName());
            TermQuery termQuery = new TermQuery(t);

            // boost of 0 so that the class filter does not influence the scoring
            classFilter.setBoost(0);
            classFilter.add(termQuery, BooleanClause.Occur.SHOULD);

            filteredQuery.add(this.query, BooleanClause.Occur.MUST);
            filteredQuery.add(classFilter, BooleanClause.Occur.MUST);

            fullQuery = filteredQuery;
        } else {
            fullQuery = this.query;
        }
        return fullQuery;
    }
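
    /*
     * Example (illustrative): with clazz = Taxon.class and a query of
     * "titleCache:abies*", the expanded query is roughly
     *
     *   +titleCache:abies* +(_hibernate_class:eu.etaxonomy.cdm.model.taxon.Taxon)
     *
     * assuming DocumentBuilder.CLASS_FIELDNAME resolves to "_hibernate_class",
     * the field in which Hibernate Search stores the entity class name.
     */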

    public void setQuery(Query query) {
        this.query = query;
    }

    public Query getQuery() {
        return query;
    }

    public Query getExpandedQuery() {
        // return the query expanded by the class filter, not the plain query
        return expandQuery();
    }

    public SortField[] getSortFields() {
        return sortFields;
    }

    public void setSortFields(SortField[] sortFields) {
        this.sortFields = sortFields;
    }

    public void setHighlightFields(String[] textFieldNamesAsArray) {
        this.highlightFields = textFieldNamesAsArray;
    }

    public String[] getHighlightFields() {
        return this.highlightFields;
    }

}