Project

General

Profile

Download (13.5 KB) Statistics
| Branch: | Tag: | Revision:
/**
* Copyright (C) 2011 EDIT
* European Distributed Institute of Taxonomy
* http://www.e-taxonomy.eu
*
* The contents of this file are subject to the Mozilla Public License Version 1.1
* See LICENSE.TXT at the top of this package for the full license terms.
*/
9
package eu.etaxonomy.cdm.api.service.search;
10

    
11
import java.io.IOException;
12
import java.util.Arrays;
13
import java.util.Collection;
14

    
15
import org.apache.log4j.Logger;
16
import org.apache.lucene.analysis.Analyzer;
17
import org.apache.lucene.queryparser.classic.ParseException;
18
import org.apache.lucene.search.BooleanClause.Occur;
19
import org.apache.lucene.search.BooleanQuery;
20
import org.apache.lucene.search.BooleanQuery.Builder;
21
import org.apache.lucene.search.IndexSearcher;
22
import org.apache.lucene.search.MultiCollector;
23
import org.apache.lucene.search.Query;
24
import org.apache.lucene.search.Sort;
25
import org.apache.lucene.search.SortField;
26
import org.apache.lucene.search.TopDocs;
27
import org.apache.lucene.search.grouping.GroupDocs;
28
import org.apache.lucene.search.grouping.SearchGroup;
29
import org.apache.lucene.search.grouping.TopGroups;
30
import org.apache.lucene.search.grouping.term.TermAllGroupsCollector;
31
import org.apache.lucene.search.grouping.term.TermFirstPassGroupingCollector;
32
import org.apache.lucene.search.grouping.term.TermSecondPassGroupingCollector;
33
import org.apache.lucene.util.BytesRef;
34

    
35
import eu.etaxonomy.cdm.model.common.CdmBase;
36
import eu.etaxonomy.cdm.model.description.DescriptionElementBase;
37
import eu.etaxonomy.cdm.model.description.TextData;
38
import eu.etaxonomy.cdm.model.taxon.Taxon;
39
import eu.etaxonomy.cdm.model.taxon.TaxonBase;
40

    
41
/**
42
 *
43
 * @author Andreas Kohlbecker
44
 * @since Dec 21, 2011
45
 *
46
 */
47
public class LuceneSearch {
48

    
49
    protected String groupByField = "id";
50

    
51
    public final static String ID_FIELD = "id";
52

    
53
    public static final Logger logger = Logger.getLogger(LuceneSearch.class);
54

    
55
    protected ILuceneIndexToolProvider toolProvider;
56

    
57
    protected IndexSearcher searcher;
58

    
59
    protected SortField[] sortFields;
60

    
61
    private Class<? extends CdmBase> directorySelectClass;
62

    
63
    private BooleanQuery filter = null;
64

    
65
    //class filter
66
    protected Class<? extends CdmBase> cdmTypeRestriction;
67

    
68
    /**
69
     * The MAX_HITS_ALLOWED value must be one less than Integer.MAX_VALUE
70
     * otherwise PriorityQueue will produce an exception since it
71
     * will always add 1 to the maxhits so Integer.MAX_VALUE
72
     * would become Integer.MIN_VALUE
73
     */
74
    public final int MAX_HITS_ALLOWED = 10000;
75

    
76
    protected BooleanQuery query;
77

    
78
    protected String[] highlightFields = new String[0];
79

    
80
    private int maxDocsPerGroup = 10;
81

    
82
    protected Class<? extends CdmBase> getDirectorySelectClass() {
83
        return pushAbstractBaseTypeDown(directorySelectClass);
84
    }
85

    
86
    public Class<? extends CdmBase> getCdmTypRestriction() {
87
        return cdmTypeRestriction;
88
    }
89

    
90
    //filter
91
    public BooleanQuery getFilter() {
92
        return filter;
93
    }
94
    public void setFilter(BooleanQuery filter) {
95
        this.filter = filter;
96
    }
97

    
98
    /**
99
     * Sets the Class to use as filter criterion, in case the supplied Class equals the
100
     * <code>directorySelectClass</code> the Class is set to <code>null</code>
101
     * @param clazz
102
     */
103
    public void setCdmTypRestriction(Class<? extends CdmBase> clazz) {
104

    
105
        /*
106
         * NOTE:
107
         * we must not use the getter of directorySelectClass
108
         * since we need the abstract base classes here!!!!
109
         */
110
        if(clazz != null && clazz.equals(directorySelectClass)){
111
            clazz = null;
112
        }
113
        this.cdmTypeRestriction = clazz;
114
    }
115

    
116
    public int getMaxDocsPerGroup() {
117
        return maxDocsPerGroup;
118
    }
119
    public void setMaxDocsPerGroup(int maxDocsPerGroup) {
120
        this.maxDocsPerGroup = maxDocsPerGroup;
121
    }
122

    
123
    public LuceneSearch(ILuceneIndexToolProvider toolProvider, Class<? extends CdmBase> directorySelectClass) {
124
         this.toolProvider = toolProvider;
125
         this.directorySelectClass = directorySelectClass;
126
    }
127
    public LuceneSearch(ILuceneIndexToolProvider toolProvider, String groupByField, Class<? extends CdmBase> directorySelectClass) {
128
        this.toolProvider = toolProvider;
129
        this.directorySelectClass = directorySelectClass;
130
        this.groupByField = groupByField;
131
    }
132

    
133
    /**
134
     * TODO the abstract base class DescriptionElementBase can not be used, so
135
     * we are using an arbitrary subclass to find the DirectoryProvider, future
136
     * versions of hibernate search my allow using abstract base classes see
137
     * {@link http://stackoverflow.com/questions/492184/how-do-you-find-all-subclasses-of-a-given-class-in-java}
138
     *
139
     * @param type must not be null
140
     * @return
141
     */
142
    private Class<? extends CdmBase> pushAbstractBaseTypeDown(Class<? extends CdmBase> type) {
143
        Class<? extends CdmBase> returnType = type;
144
        if (type.equals(DescriptionElementBase.class)) {
145
            returnType = TextData.class;
146
        }
147
        if (type.equals(TaxonBase.class)) {
148
            returnType = Taxon.class;
149
        }
150
//        if (type.equals(TaxonName.class)) {
151
//            returnType = NonViralName.class;
152
//        }
153
        return returnType;
154
    }
155

    
156
    protected LuceneSearch() {
157

    
158
    }
159

    
160
    public IndexSearcher getSearcher() {
161
        if(searcher == null){
162
            searcher = new IndexSearcher(toolProvider.getIndexReaderFor(directorySelectClass));
163
//            searcher.setDefaultFieldSortScoring(true, true);
164
        }
165
        return searcher;
166
    }
167

    
168
    /**
169
     * Convenience method which delegated the call to the available
170
     * {@link ILuceneIndexToolProvider#getAnalyzerFor(Class)} method.
171
     *
172
     * @return the Analyzer suitable for the <code>directorySelectClass</code>
173
     * of the LuceneSearch
174
     */
175
    public Analyzer getAnalyzer() {
176
        return toolProvider.getAnalyzerFor(directorySelectClass);
177
    }
178

    
179
    /**
180
     * @param luceneQueryString
181
     * @param cdmTypeRestriction the type as additional filter criterion
182
     * @param pageSize if the page size is null or in an invalid range it will be set to MAX_HITS_ALLOWED
183
     * @param pageNumber a 0-based index of the page to return, will default to 0 if null or negative.
184
     * @return
185
     * @throws ParseException
186
     * @throws IOException
187
     */
188
    public TopGroups<BytesRef> executeSearch(String luceneQueryString, Integer pageSize, Integer pageNumber) throws ParseException, IOException {
189

    
190
        Query luceneQuery = parse(luceneQueryString);
191
        setQuery(luceneQuery);
192
        return executeSearch(pageSize, pageNumber);
193
    }
194

    
195
    /**
196
     * @param luceneQueryString
197
     * @return
198
     * @throws ParseException
199
     */
200
    public Query parse(String luceneQueryString) throws ParseException {
201
        logger.debug("luceneQueryString to be parsed: " + luceneQueryString);
202
        Query luceneQuery = toolProvider.getQueryParserFor(directorySelectClass, false).parse(luceneQueryString);
203
        return luceneQuery;
204
    }
205

    
206
    /**
207
     * @param maxNoOfHits
208
     * @return
209
     * @throws IOException
210
     */
211
    public TopDocs executeSearch(int maxNoOfHits) throws IOException {
212
        BooleanQuery fullQuery = expandQuery();
213
        logger.info("lucene query string to be parsed: " + fullQuery.toString());
214
        return getSearcher().search(fullQuery, maxNoOfHits, Sort.RELEVANCE, true, true);
215

    
216
    }
217
    /**
218
     * @param pageSize if the page size is null or in an invalid range it will be set to MAX_HITS_ALLOWED
219
     * @param pageNumber a 0-based index of the page to return, will default to 0 if null or negative.
220
     * @return
221
     * @throws ParseException
222
     * @throws IOException
223
     */
224
    public TopGroups<BytesRef> executeSearch(Integer pageSize, Integer pageNumber) throws ParseException, IOException {
225

    
226
        if(pageNumber == null || pageNumber < 0){
227
            pageNumber = 0;
228
        }
229
        if(pageSize == null || pageSize <= 0 || pageSize > MAX_HITS_ALLOWED){
230
            pageSize = MAX_HITS_ALLOWED;
231
            logger.info("limiting pageSize to MAX_HITS_ALLOWED = " + MAX_HITS_ALLOWED + " items");
232
        }
233

    
234
        BooleanQuery fullQuery = expandQuery();
235
        logger.info("final query: " + fullQuery.toString());
236

    
237
        int offset = pageNumber * pageSize;
238
        int limit = (pageNumber + 1) * pageSize;
239
        logger.debug("start: " + offset + "; limit:" + limit);
240

    
241
        // sorting
242
        Sort groupSort = null;
243
        Sort withinGroupSort = Sort.RELEVANCE;
244
        if(sortFields != null && sortFields.length > 0){
245
            groupSort = new Sort(sortFields);
246
        } else {
247
            groupSort = Sort.RELEVANCE; // == SortField.FIELD_SCORE !!
248
        }
249

    
250
        // perform the search (needs two passes for grouping)
251
        if(logger.isDebugEnabled()){
252
            logger.debug("Grouping: sortFields=" + Arrays.toString(sortFields) + ", groupByField=" + groupByField +
253
                    ", groupSort=" + groupSort + ", withinGroupSort=" + withinGroupSort + ", limit=" + limit + ", maxDocsPerGroup="+ maxDocsPerGroup);
254
        }
255
        // - first pass
256
        TermFirstPassGroupingCollector firstPassCollector = new TermFirstPassGroupingCollector(groupByField, groupSort, limit);
257

    
258
        getSearcher().search(fullQuery, firstPassCollector);
259
        Collection<SearchGroup<BytesRef>> topGroups = firstPassCollector.getTopGroups(0, true); // no offset here since we need the first item for the max score
260

    
261
        if (topGroups == null) {
262
              return null;
263
        }
264
        // - flags for second pass
265
        boolean getScores = false;
266
        boolean getMaxScores = true;
267
        if(groupSort.getSort()[0] != SortField.FIELD_SCORE){
268
            getMaxScores = false;
269
            // see inner class TopGroupsWithMaxScore
270
            logger.warn("Fist sort field must be SortField.FIELD_SCORE otherwise the max score value will not be correct! MaxScore calculation will be skipped");
271
        }
272
        boolean fillFields = true;
273
        TermAllGroupsCollector allGroupsCollector = new TermAllGroupsCollector(groupByField);
274
        TermSecondPassGroupingCollector secondPassCollector = new TermSecondPassGroupingCollector(
275
                groupByField, topGroups, groupSort, withinGroupSort, maxDocsPerGroup , getScores,
276
                getMaxScores, fillFields
277
                );
278
        getSearcher().search(fullQuery, MultiCollector.wrap(secondPassCollector, allGroupsCollector));
279

    
280
        TopGroups<BytesRef> groupsResult = secondPassCollector.getTopGroups(0); // no offset here since we need the first item for the max score
281

    
282
        // --- set the max score for the group results
283

    
284
        // get max score from very first result
285
        float maxScore = groupsResult.groups[0].maxScore;
286

    
287
        if(logger.isDebugEnabled()){
288
            logger.debug("TopGroups: maxScore=" + maxScore + ", offset=" + offset +
289
                    ", totalGroupCount=" + allGroupsCollector.getGroupCount() +
290
                    ", totalGroupedHitCount=" + groupsResult.totalGroupedHitCount);
291
        }
292

    
293
        TopGroups<BytesRef> newTopGroups;
294
        if(offset > 0){
295
            GroupDocs<BytesRef>[] newGroupDocs = new GroupDocs[groupsResult.groups.length - offset];
296
            for(int i = offset; i < groupsResult.groups.length; i++){
297
                newGroupDocs[i - offset] = groupsResult.groups[i];
298
            }
299
            newTopGroups = new TopGroups<BytesRef>(
300
                    groupsResult.groupSort,
301
                    groupsResult.withinGroupSort,
302
                    groupsResult.totalHitCount,
303
                    groupsResult.totalGroupedHitCount,
304
                        newGroupDocs,
305
                        maxScore);
306
        } else {
307
            newTopGroups = groupsResult;
308
        }
309
        TopGroups<BytesRef> topGroupsWithMaxScore = new TopGroups<>(newTopGroups, allGroupsCollector.getGroupCount());
310
        // --- done with max score for the group results
311

    
312
        return topGroupsWithMaxScore;
313
    }
314

    
315
    /**
316
     * expands the query by adding a type restriction if the
317
     * <code>cdmTypeRestriction</code> is not <code>NULL</code>
318
     * and adds the <code>filter</code> as Boolean query
319
     * clause with {@link Occur#FILTER}
320
     */
321
    protected BooleanQuery expandQuery() {
322
        BooleanQuery fullQuery = null;
323
        Builder fullQueryBuilder = null;
324

    
325
        if(cdmTypeRestriction != null){
326
            fullQueryBuilder = QueryFactory.addTypeRestriction(query, cdmTypeRestriction);
327
        }
328

    
329
        if(filter != null) {
330
            if(fullQueryBuilder == null) {
331
                fullQueryBuilder = new Builder();
332
                fullQueryBuilder.add(this.query, Occur.MUST);
333
            }
334
            fullQueryBuilder.add(filter, Occur.FILTER);
335
        }
336

    
337
        if(fullQueryBuilder != null) {
338
            fullQuery = fullQueryBuilder.build();
339
        } else {
340
            fullQuery = this.query;
341
        }
342

    
343
        logger.debug("expandedQuery: " + fullQuery.toString());
344
        return fullQuery;
345
    }
346

    
347
    public void setQuery(Query query) {
348
        if( query instanceof BooleanQuery) {
349
            this.query = (BooleanQuery)query;
350
        } else {
351
            Builder builder = new Builder();
352
            this.query = builder.add(query, Occur.MUST).build();
353
        }
354
    }
355

    
356
    public BooleanQuery getQuery() {
357
        return query;
358
    }
359

    
360
    public BooleanQuery getExpandedQuery() {
361
        expandQuery();
362
        return query;
363
    }
364

    
365
    public SortField[] getSortFields() {
366
        return sortFields;
367
    }
368

    
369
    public void setSortFields(SortField[] sortFields) {
370
        this.sortFields = sortFields;
371
    }
372

    
373
    public void setHighlightFields(String[] textFieldNamesAsArray) {
374
        this.highlightFields = textFieldNamesAsArray;
375
    }
376

    
377
    public String[] getHighlightFields() {
378
        return this.highlightFields;
379
    }
380

    
381
}
(11-11/16)