Project

General

Profile

Download (13.7 KB) Statistics
| Branch: | Tag: | Revision:
1
/**
2
* Copyright (C) 2011 EDIT
3
* European Distributed Institute of Taxonomy
4
* http://www.e-taxonomy.eu
5
*
6
* The contents of this file are subject to the Mozilla Public License Version 1.1
7
* See LICENSE.TXT at the top of this package for the full license terms.
8
*/
9
package eu.etaxonomy.cdm.api.service.search;
10

    
11
import java.io.IOException;
12
import java.util.Arrays;
13
import java.util.Collection;
14

    
15
import org.apache.log4j.Logger;
16
import org.apache.lucene.analysis.Analyzer;
17
import org.apache.lucene.queryparser.classic.ParseException;
18
import org.apache.lucene.search.BooleanClause.Occur;
19
import org.apache.lucene.search.BooleanQuery;
20
import org.apache.lucene.search.BooleanQuery.Builder;
21
import org.apache.lucene.search.IndexSearcher;
22
import org.apache.lucene.search.MultiCollector;
23
import org.apache.lucene.search.Query;
24
import org.apache.lucene.search.Sort;
25
import org.apache.lucene.search.SortField;
26
import org.apache.lucene.search.TopDocs;
27
import org.apache.lucene.search.grouping.GroupDocs;
28
import org.apache.lucene.search.grouping.SearchGroup;
29
import org.apache.lucene.search.grouping.TopGroups;
30
import org.apache.lucene.search.grouping.term.TermAllGroupsCollector;
31
import org.apache.lucene.search.grouping.term.TermFirstPassGroupingCollector;
32
import org.apache.lucene.search.grouping.term.TermSecondPassGroupingCollector;
33
import org.apache.lucene.util.BytesRef;
34

    
35
import eu.etaxonomy.cdm.model.common.CdmBase;
36
import eu.etaxonomy.cdm.model.description.DescriptionElementBase;
37
import eu.etaxonomy.cdm.model.description.TextData;
38
import eu.etaxonomy.cdm.model.taxon.Taxon;
39
import eu.etaxonomy.cdm.model.taxon.TaxonBase;
40

    
41
/**
42
 *
43
 * @author Andreas Kohlbecker
44
 * @since Dec 21, 2011
45
 *
46
 */
47
public class LuceneSearch {
48

    
49
    protected String groupByField = "id";
50

    
51
    public final static String ID_FIELD = "id";
52

    
53
    public static final Logger logger = Logger.getLogger(LuceneSearch.class);
54

    
55
    protected ILuceneIndexToolProvider toolProvider;
56

    
57
    protected IndexSearcher searcher;
58

    
59
    protected SortField[] sortFields;
60

    
61
    private Class<? extends CdmBase> directorySelectClass;
62

    
63
    private BooleanQuery filter = null;
64

    
65
    protected Class<? extends CdmBase> getDirectorySelectClass() {
66
        return pushAbstractBaseTypeDown(directorySelectClass);
67
    }
68

    
69
    /**
70
     * classFilter
71
     */
72
    protected Class<? extends CdmBase> cdmTypeRestriction;
73

    
74

    
75
    public Class<? extends CdmBase> getCdmTypRestriction() {
76
        return cdmTypeRestriction;
77
    }
78

    
79
    /**
80
     * @return the filter
81
     */
82
    public BooleanQuery getFilter() {
83
        return filter;
84
    }
85

    
86
    /**
87
     * @param filter the filter to set
88
     */
89
    public void setFilter(BooleanQuery filter) {
90
        this.filter = filter;
91
    }
92

    
93
    /**
94
     * Sets the Class to use as filter criterion, in case the supplied Class equals the
95
     * <code>directorySelectClass</code> the Class is set to <code>null</code>
96
     * @param clazz
97
     */
98
    public void setCdmTypRestriction(Class<? extends CdmBase> clazz) {
99

    
100
        /*
101
         * NOTE:
102
         * we must not use the getter of directorySelectClass
103
         * since we need the abstract base classes here!!!!
104
         */
105
        if(clazz != null && clazz.equals(directorySelectClass)){
106
            clazz = null;
107
        }
108
        this.cdmTypeRestriction = clazz;
109
    }
110

    
111
    /**
112
     * The MAX_HITS_ALLOWED value must be one less than Integer.MAX_VALUE
113
     * otherwise PriorityQueue will produce an exception since it
114
     * will always add 1 to the maxhits so Integer.MAX_VALUE
115
     * would become Integer.MIN_VALUE
116
     */
117
    public final int MAX_HITS_ALLOWED = 10000;
118

    
119
    protected BooleanQuery query;
120

    
121
    protected String[] highlightFields = new String[0];
122

    
123
    private int maxDocsPerGroup = 10;
124

    
125

    
126
    public int getMaxDocsPerGroup() {
127
        return maxDocsPerGroup;
128
    }
129

    
130
    public void setMaxDocsPerGroup(int maxDocsPerGroup) {
131
        this.maxDocsPerGroup = maxDocsPerGroup;
132
    }
133

    
134
    /**
     * Creates a search which keeps the default <code>groupByField</code>.
     *
     * @param toolProvider supplies index reader, analyzer and query parser
     * @param directorySelectClass the class handed to the tool provider
     */
    public LuceneSearch(ILuceneIndexToolProvider toolProvider, Class<? extends CdmBase> directorySelectClass) {
        this.directorySelectClass = directorySelectClass;
        this.toolProvider = toolProvider;
    }
141

    
142
    /**
     * Creates a search which groups the hits by the given field.
     *
     * @param toolProvider supplies index reader, analyzer and query parser
     * @param groupByField name of the index field to group the hits by
     * @param directorySelectClass the class handed to the tool provider
     */
    public LuceneSearch(ILuceneIndexToolProvider toolProvider, String groupByField, Class<? extends CdmBase> directorySelectClass) {
        this.groupByField = groupByField;
        this.directorySelectClass = directorySelectClass;
        this.toolProvider = toolProvider;
    }
150

    
151
    /**
152
     * TODO the abstract base class DescriptionElementBase can not be used, so
153
     * we are using an arbitrary subclass to find the DirectoryProvider, future
154
     * versions of hibernate search my allow using abstract base classes see
155
     * {@link http://stackoverflow.com/questions/492184/how-do-you-find-all-subclasses-of-a-given-class-in-java}
156
     *
157
     * @param type must not be null
158
     * @return
159
     */
160
    private Class<? extends CdmBase> pushAbstractBaseTypeDown(Class<? extends CdmBase> type) {
161
        Class<? extends CdmBase> returnType = type;
162
        if (type.equals(DescriptionElementBase.class)) {
163
            returnType = TextData.class;
164
        }
165
        if (type.equals(TaxonBase.class)) {
166
            returnType = Taxon.class;
167
        }
168
//        if (type.equals(TaxonName.class)) {
169
//            returnType = NonViralName.class;
170
//        }
171
        return returnType;
172
    }
173

    
174
    /**
     * No-argument constructor, leaves all fields at their defaults.
     */
    protected LuceneSearch() {
        // intentionally empty
    }
177

    
178
    /**
179
     * @return
180
     */
181
    public IndexSearcher getSearcher() {
182
        if(searcher == null){
183
            searcher = new IndexSearcher(toolProvider.getIndexReaderFor(directorySelectClass));
184
//            searcher.setDefaultFieldSortScoring(true, true);
185
        }
186
        return searcher;
187
    }
188

    
189
    /**
190
     * Convenience method which delegated the call to the available
191
     * {@link ILuceneIndexToolProvider#getAnalyzerFor(Class)} method.
192
     *
193
     * @return the Analyzer suitable for the <code>directorySelectClass</code>
194
     * of the LuceneSearch
195
     */
196
    public Analyzer getAnalyzer() {
197
        return toolProvider.getAnalyzerFor(directorySelectClass);
198
    }
199

    
200
    /**
201
     * @param luceneQueryString
202
     * @param cdmTypeRestriction the type as additional filter criterion
203
     * @param pageSize if the page size is null or in an invalid range it will be set to MAX_HITS_ALLOWED
204
     * @param pageNumber a 0-based index of the page to return, will default to 0 if null or negative.
205
     * @return
206
     * @throws ParseException
207
     * @throws IOException
208
     */
209
    public TopGroups<BytesRef> executeSearch(String luceneQueryString, Integer pageSize, Integer pageNumber) throws ParseException, IOException {
210

    
211
        Query luceneQuery = parse(luceneQueryString);
212
        setQuery(luceneQuery);
213
        return executeSearch(pageSize, pageNumber);
214
    }
215

    
216
    /**
217
     * @param luceneQueryString
218
     * @return
219
     * @throws ParseException
220
     */
221
    public Query parse(String luceneQueryString) throws ParseException {
222
        logger.debug("luceneQueryString to be parsed: " + luceneQueryString);
223
        Query luceneQuery = toolProvider.getQueryParserFor(directorySelectClass, false).parse(luceneQueryString);
224
        return luceneQuery;
225
    }
226

    
227
    /**
228
     * @param maxNoOfHits
229
     * @return
230
     * @throws IOException
231
     */
232
    public TopDocs executeSearch(int maxNoOfHits) throws IOException {
233
        BooleanQuery fullQuery = expandQuery();
234
        logger.info("lucene query string to be parsed: " + fullQuery.toString());
235
        return getSearcher().search(fullQuery, maxNoOfHits, Sort.RELEVANCE, true, true);
236

    
237
    }
238
    /**
239
     * @param pageSize if the page size is null or in an invalid range it will be set to MAX_HITS_ALLOWED
240
     * @param pageNumber a 0-based index of the page to return, will default to 0 if null or negative.
241
     * @return
242
     * @throws ParseException
243
     * @throws IOException
244
     */
245
    public TopGroups<BytesRef> executeSearch(Integer pageSize, Integer pageNumber) throws ParseException, IOException {
246

    
247

    
248
        if(pageNumber == null || pageNumber < 0){
249
            pageNumber = 0;
250
        }
251
        if(pageSize == null || pageSize <= 0 || pageSize > MAX_HITS_ALLOWED){
252
            pageSize = MAX_HITS_ALLOWED;
253
            logger.info("limiting pageSize to MAX_HITS_ALLOWED = " + MAX_HITS_ALLOWED + " items");
254
        }
255

    
256
        BooleanQuery fullQuery = expandQuery();
257
        logger.info("final query: " + fullQuery.toString());
258

    
259
        int offset = pageNumber * pageSize;
260
        int limit = (pageNumber + 1) * pageSize - 1 ;
261
        logger.debug("start: " + offset + "; limit:" + limit);
262

    
263
        // sorting
264
        Sort groupSort = null;
265
        Sort withinGroupSort = Sort.RELEVANCE;
266
        if(sortFields != null && sortFields.length > 0){
267
            groupSort = new Sort(sortFields);
268
        } else {
269
            groupSort = Sort.RELEVANCE; // == SortField.FIELD_SCORE !!
270
        }
271

    
272
        // perform the search (needs two passes for grouping)
273
        if(logger.isDebugEnabled()){
274
            logger.debug("Grouping: sortFields=" + Arrays.toString(sortFields) + ", groupByField=" + groupByField +
275
                    ", groupSort=" + groupSort + ", withinGroupSort=" + withinGroupSort + ", limit=" + limit + ", maxDocsPerGroup="+ maxDocsPerGroup);
276
        }
277
        // - first pass
278
        TermFirstPassGroupingCollector firstPassCollector = new TermFirstPassGroupingCollector(groupByField, groupSort, limit);
279

    
280
        getSearcher().search(fullQuery, firstPassCollector);
281
        Collection<SearchGroup<BytesRef>> topGroups = firstPassCollector.getTopGroups(0, true); // no offset here since we need the first item for the max score
282

    
283
        if (topGroups == null) {
284
              return null;
285
        }
286
        // - flags for second pass
287
        boolean getScores = false;
288
        boolean getMaxScores = true;
289
        if(groupSort.getSort()[0] != SortField.FIELD_SCORE){
290
            getMaxScores = false;
291
            // see inner class TopGroupsWithMaxScore
292
            logger.error("Fist sort field must be SortField.FIELD_SCORE otherwise the max score value will not be correct! MaxScore calculation will be skipped");
293
        }
294
        boolean fillFields = true;
295
        TermAllGroupsCollector allGroupsCollector = new TermAllGroupsCollector(groupByField);
296
        TermSecondPassGroupingCollector secondPassCollector = new TermSecondPassGroupingCollector(
297
                groupByField, topGroups, groupSort, withinGroupSort, maxDocsPerGroup , getScores,
298
                getMaxScores, fillFields
299
                );
300
        getSearcher().search(fullQuery, MultiCollector.wrap(secondPassCollector, allGroupsCollector));
301

    
302
        TopGroups<BytesRef> groupsResult = secondPassCollector.getTopGroups(0); // no offset here since we need the first item for the max score
303

    
304
        // --- set the max score for the group results
305

    
306
        // get max score from very first result
307
        float maxScore = groupsResult.groups[0].maxScore;
308

    
309
        if(logger.isDebugEnabled()){
310
            logger.debug("TopGroups: maxScore=" + maxScore + ", offset=" + offset +
311
                    ", totalGroupCount=" + allGroupsCollector.getGroupCount() +
312
                    ", totalGroupedHitCount=" + groupsResult.totalGroupedHitCount);
313
        }
314

    
315
        TopGroups<BytesRef> newTopGroups;
316
        if(offset > 0){
317
            GroupDocs<BytesRef>[] newGroupDocs = new GroupDocs[groupsResult.groups.length - offset];
318
            for(int i = offset; i < groupsResult.groups.length; i++){
319
                newGroupDocs[i - offset] = groupsResult.groups[i];
320
            }
321
            newTopGroups = new TopGroups<BytesRef>(
322
                    groupsResult.groupSort,
323
                    groupsResult.withinGroupSort,
324
                    groupsResult.totalHitCount,
325
                    groupsResult.totalGroupedHitCount,
326
                        newGroupDocs,
327
                        maxScore);
328
        } else {
329
            newTopGroups = groupsResult;
330
        }
331
        TopGroups<BytesRef> topGroupsWithMaxScore = new TopGroups<BytesRef>(newTopGroups, allGroupsCollector.getGroupCount());
332
        // --- done with max score for the group results
333

    
334
        return topGroupsWithMaxScore;
335
    }
336

    
337
    /**
338
     * expands the query by adding a type restriction if the
339
     * <code>cdmTypeRestriction</code> is not <code>NULL</code>
340
     * and adds the <code>filter</code> as Boolean query
341
     * clause with {@link Occur#FILTER}
342
     */
343
    protected BooleanQuery expandQuery() {
344
        BooleanQuery fullQuery = null;
345
        Builder fullQueryBuilder = null;
346

    
347
        if(cdmTypeRestriction != null){
348
            fullQueryBuilder = QueryFactory.addTypeRestriction(query, cdmTypeRestriction);
349
        }
350

    
351
        if(filter != null) {
352
            if(fullQueryBuilder == null) {
353
                fullQueryBuilder = new Builder();
354
                fullQueryBuilder.add(this.query, Occur.MUST);
355
            }
356
            fullQueryBuilder.add(filter, Occur.FILTER);
357
        }
358

    
359
        if(fullQueryBuilder != null) {
360
            fullQuery = fullQueryBuilder.build();
361
        } else {
362
            fullQuery = this.query;
363
        }
364

    
365
        logger.debug("expandedQuery: " + fullQuery.toString());
366
        return fullQuery;
367
    }
368

    
369
    public void setQuery(Query query) {
370
        if( query instanceof BooleanQuery) {
371
            this.query = (BooleanQuery)query;
372
        } else {
373
            Builder builder = new Builder();
374
            this.query = builder.add(query, Occur.MUST).build();
375
        }
376
    }
377

    
378
    public BooleanQuery getQuery() {
379
        return query;
380
    }
381

    
382
    public BooleanQuery getExpandedQuery() {
383
        expandQuery();
384
        return query;
385
    }
386

    
387
    public SortField[] getSortFields() {
388
        return sortFields;
389
    }
390

    
391
    public void setSortFields(SortField[] sortFields) {
392
        this.sortFields = sortFields;
393
    }
394

    
395
    public void setHighlightFields(String[] textFieldNamesAsArray) {
396
        this.highlightFields = textFieldNamesAsArray;
397
    }
398

    
399
    public String[] getHighlightFields() {
400
        return this.highlightFields;
401
    }
402

    
403
}
(11-11/16)