2 * Copyright (C) 2011 EDIT
3 * European Distributed Institute of Taxonomy
4 * http://www.e-taxonomy.eu
6 * The contents of this file are subject to the Mozilla Public License Version 1.1
7 * See LICENSE.TXT at the top of this package for the full license terms.
9 package eu
.etaxonomy
.cdm
.api
.service
.search
;
11 import java
.io
.IOException
;
12 import java
.util
.Arrays
;
13 import java
.util
.Collection
;
15 import org
.apache
.logging
.log4j
.LogManager
;
16 import org
.apache
.logging
.log4j
.Logger
;
17 import org
.apache
.lucene
.analysis
.Analyzer
;
18 import org
.apache
.lucene
.queryparser
.classic
.ParseException
;
19 import org
.apache
.lucene
.search
.BooleanClause
.Occur
;
20 import org
.apache
.lucene
.search
.BooleanQuery
;
21 import org
.apache
.lucene
.search
.BooleanQuery
.Builder
;
22 import org
.apache
.lucene
.search
.IndexSearcher
;
23 import org
.apache
.lucene
.search
.MultiCollector
;
24 import org
.apache
.lucene
.search
.Query
;
25 import org
.apache
.lucene
.search
.Sort
;
26 import org
.apache
.lucene
.search
.SortField
;
27 import org
.apache
.lucene
.search
.TopDocs
;
28 import org
.apache
.lucene
.search
.grouping
.GroupDocs
;
29 import org
.apache
.lucene
.search
.grouping
.SearchGroup
;
30 import org
.apache
.lucene
.search
.grouping
.TopGroups
;
31 import org
.apache
.lucene
.search
.grouping
.term
.TermAllGroupsCollector
;
32 import org
.apache
.lucene
.search
.grouping
.term
.TermFirstPassGroupingCollector
;
33 import org
.apache
.lucene
.search
.grouping
.term
.TermSecondPassGroupingCollector
;
34 import org
.apache
.lucene
.util
.BytesRef
;
36 import eu
.etaxonomy
.cdm
.model
.common
.CdmBase
;
37 import eu
.etaxonomy
.cdm
.model
.description
.DescriptionElementBase
;
38 import eu
.etaxonomy
.cdm
.model
.description
.TextData
;
39 import eu
.etaxonomy
.cdm
.model
.taxon
.Taxon
;
40 import eu
.etaxonomy
.cdm
.model
.taxon
.TaxonBase
;
43 * @author Andreas Kohlbecker
46 public class LuceneSearch
{
/** Log4j logger obtained via {@code LogManager.getLogger()}. */
private static final Logger logger = LogManager.getLogger();

/** Field name constant; carries the same literal value as the default of {@code groupByField}. */
public final static String ID_FIELD = "id";

/**
 * The MAX_HITS_ALLOWED value must be one less than Integer.MAX_VALUE
 * otherwise PriorityQueue will produce an exception since it
 * will always add 1 to the maxhits so Integer.MAX_VALUE
 * would become Integer.MIN_VALUE
 * <p>
 * Used by {@code executeSearch(Integer, Integer)} as the fallback and upper bound for the page size.
 * NOTE(review): the actual value is 10000, so the sentence above presumably describes an
 * upper limit ("must be at most") rather than the value itself - confirm.
 */
public final int MAX_HITS_ALLOWED = 10000;

/** Name of the index field handed to the grouping collectors; defaults to {@code "id"}. */
protected String groupByField = "id";

/** Supplies the index reader, the analyzer and the query parser used by this search. */
protected ILuceneIndexToolProvider toolProvider;

/** Index searcher, created on first use by {@code getSearcher()}. */
protected IndexSearcher searcher;

/** Optional sort fields for the group sort; relevance sorting is used when null or empty. */
protected SortField[] sortFields;

/** Class handed to the tool provider to select the index reader, analyzer and query parser. */
private Class<? extends CdmBase> directorySelectClass;

/** Optional query which {@code expandQuery()} adds as a {@code Occur.FILTER} clause. */
private BooleanQuery filter = null;

/** Optional type restriction which {@code expandQuery()} adds to the query; may be null. */
protected Class<? extends CdmBase> cdmTypeRestriction;

/** The base query as set by {@code setQuery(Query)}. */
protected BooleanQuery query;

/** Names of the fields to highlight; only stored and returned by this class. Defaults to an empty array. */
protected String[] highlightFields = new String[0];

/** Maximum number of documents per group, passed to the second pass grouping collector. */
private int maxDocsPerGroup = 10;
82 protected Class
<?
extends CdmBase
> getDirectorySelectClass() {
83 return pushAbstractBaseTypeDown(directorySelectClass
);
86 public Class
<?
extends CdmBase
> getCdmTypRestriction() {
87 return cdmTypeRestriction
;
91 public BooleanQuery
getFilter() {
94 public void setFilter(BooleanQuery filter
) {
99 * Sets the Class to use as filter criterion, in case the supplied Class equals the
100 * <code>directorySelectClass</code> the Class is set to <code>null</code>
103 public void setCdmTypRestriction(Class
<?
extends CdmBase
> clazz
) {
107 * we must not use the getter of directorySelectClass
108 * since we need the abstract base classes here!!!!
110 if(clazz
!= null && clazz
.equals(directorySelectClass
)){
113 this.cdmTypeRestriction
= clazz
;
116 public int getMaxDocsPerGroup() {
117 return maxDocsPerGroup
;
119 public void setMaxDocsPerGroup(int maxDocsPerGroup
) {
120 this.maxDocsPerGroup
= maxDocsPerGroup
;
/**
 * Creates a search which keeps the default <code>groupByField</code>.
 *
 * @param toolProvider provides index reader, analyzer and query parser
 * @param directorySelectClass the class used to select the index
 */
public LuceneSearch(ILuceneIndexToolProvider toolProvider, Class<? extends CdmBase> directorySelectClass) {
    this.directorySelectClass = directorySelectClass;
    this.toolProvider = toolProvider;
}
/**
 * Creates a search which groups by the given field.
 *
 * @param toolProvider provides index reader, analyzer and query parser
 * @param groupByField name of the index field to group the results by
 * @param directorySelectClass the class used to select the index
 */
public LuceneSearch(ILuceneIndexToolProvider toolProvider, String groupByField,
        Class<? extends CdmBase> directorySelectClass) {
    this(toolProvider, directorySelectClass);
    this.groupByField = groupByField;
}
134 * TODO the abstract base class DescriptionElementBase can not be used, so
135 * we are using an arbitrary subclass to find the DirectoryProvider, future
136 * versions of hibernate search my allow using abstract base classes see
137 * {@link http://stackoverflow.com/questions/492184/how-do-you-find-all-subclasses-of-a-given-class-in-java}
139 * @param type must not be null
142 private Class
<?
extends CdmBase
> pushAbstractBaseTypeDown(Class
<?
extends CdmBase
> type
) {
143 Class
<?
extends CdmBase
> returnType
= type
;
144 if (type
.equals(DescriptionElementBase
.class)) {
145 returnType
= TextData
.class;
147 if (type
.equals(TaxonBase
.class)) {
148 returnType
= Taxon
.class;
150 // if (type.equals(TaxonName.class)) {
151 // returnType = NonViralName.class;
/**
 * No-argument constructor, accessible to subclasses and the package only.
 * It does not initialize any field.
 */
protected LuceneSearch() {
    super();
}
160 public IndexSearcher
getSearcher() {
161 if(searcher
== null){
162 searcher
= new IndexSearcher(toolProvider
.getIndexReaderFor(directorySelectClass
));
163 // searcher.setDefaultFieldSortScoring(true, true);
169 * Convenience method which delegated the call to the available
170 * {@link ILuceneIndexToolProvider#getAnalyzerFor(Class)} method.
172 * @return the Analyzer suitable for the <code>directorySelectClass</code>
173 * of the LuceneSearch
175 public Analyzer
getAnalyzer() {
176 return toolProvider
.getAnalyzerFor(directorySelectClass
);
180 * @param luceneQueryString
181 * @param cdmTypeRestriction the type as additional filter criterion
182 * @param pageSize if the page size is null or in an invalid range it will be set to MAX_HITS_ALLOWED
183 * @param pageNumber a 0-based index of the page to return, will default to 0 if null or negative.
185 * @throws ParseException
186 * @throws IOException
188 public TopGroups
<BytesRef
> executeSearch(String luceneQueryString
, Integer pageSize
, Integer pageNumber
) throws ParseException
, IOException
{
190 Query luceneQuery
= parse(luceneQueryString
);
191 setQuery(luceneQuery
);
192 return executeSearch(pageSize
, pageNumber
);
196 * @param luceneQueryString
198 * @throws ParseException
200 public Query
parse(String luceneQueryString
) throws ParseException
{
201 logger
.debug("luceneQueryString to be parsed: " + luceneQueryString
);
202 Query luceneQuery
= toolProvider
.getQueryParserFor(directorySelectClass
, false).parse(luceneQueryString
);
209 * @throws IOException
211 public TopDocs
executeSearch(int maxNoOfHits
) throws IOException
{
212 BooleanQuery fullQuery
= expandQuery();
213 logger
.info("lucene query string to be parsed: " + fullQuery
.toString());
214 return getSearcher().search(fullQuery
, maxNoOfHits
, Sort
.RELEVANCE
, true, true);
218 * @param pageSize if the page size is null or in an invalid range it will be set to MAX_HITS_ALLOWED
219 * @param pageNumber a 0-based index of the page to return, will default to 0 if null or negative.
221 * @throws ParseException
222 * @throws IOException
224 public TopGroups
<BytesRef
> executeSearch(Integer pageSize
, Integer pageNumber
) throws ParseException
, IOException
{
226 if(pageNumber
== null || pageNumber
< 0){
229 if(pageSize
== null || pageSize
<= 0 || pageSize
> MAX_HITS_ALLOWED
){
230 pageSize
= MAX_HITS_ALLOWED
;
231 logger
.info("limiting pageSize to MAX_HITS_ALLOWED = " + MAX_HITS_ALLOWED
+ " items");
234 BooleanQuery fullQuery
= expandQuery();
235 logger
.info("final query: " + fullQuery
.toString());
237 int offset
= pageNumber
* pageSize
;
238 int limit
= (pageNumber
+ 1) * pageSize
;
239 logger
.debug("start: " + offset
+ "; limit:" + limit
);
242 Sort groupSort
= null;
243 Sort withinGroupSort
= Sort
.RELEVANCE
;
244 if(sortFields
!= null && sortFields
.length
> 0){
245 groupSort
= new Sort(sortFields
);
247 groupSort
= Sort
.RELEVANCE
; // == SortField.FIELD_SCORE !!
250 // perform the search (needs two passes for grouping)
251 if(logger
.isDebugEnabled()){
252 logger
.debug("Grouping: sortFields=" + Arrays
.toString(sortFields
) + ", groupByField=" + groupByField
+
253 ", groupSort=" + groupSort
+ ", withinGroupSort=" + withinGroupSort
+ ", limit=" + limit
+ ", maxDocsPerGroup="+ maxDocsPerGroup
);
256 TermFirstPassGroupingCollector firstPassCollector
= new TermFirstPassGroupingCollector(groupByField
, groupSort
, limit
);
258 getSearcher().search(fullQuery
, firstPassCollector
);
259 Collection
<SearchGroup
<BytesRef
>> topGroups
= firstPassCollector
.getTopGroups(0, true); // no offset here since we need the first item for the max score
261 if (topGroups
== null) {
264 // - flags for second pass
265 boolean getScores
= false;
266 boolean getMaxScores
= true;
267 if(groupSort
.getSort()[0] != SortField
.FIELD_SCORE
){
268 getMaxScores
= false;
269 // see inner class TopGroupsWithMaxScore
270 logger
.warn("Fist sort field must be SortField.FIELD_SCORE otherwise the max score value will not be correct! MaxScore calculation will be skipped");
272 boolean fillFields
= true;
273 TermAllGroupsCollector allGroupsCollector
= new TermAllGroupsCollector(groupByField
);
274 TermSecondPassGroupingCollector secondPassCollector
= new TermSecondPassGroupingCollector(
275 groupByField
, topGroups
, groupSort
, withinGroupSort
, maxDocsPerGroup
, getScores
,
276 getMaxScores
, fillFields
278 getSearcher().search(fullQuery
, MultiCollector
.wrap(secondPassCollector
, allGroupsCollector
));
280 TopGroups
<BytesRef
> groupsResult
= secondPassCollector
.getTopGroups(0); // no offset here since we need the first item for the max score
282 // --- set the max score for the group results
284 // get max score from very first result
285 float maxScore
= groupsResult
.groups
[0].maxScore
;
287 if(logger
.isDebugEnabled()){
288 logger
.debug("TopGroups: maxScore=" + maxScore
+ ", offset=" + offset
+
289 ", totalGroupCount=" + allGroupsCollector
.getGroupCount() +
290 ", totalGroupedHitCount=" + groupsResult
.totalGroupedHitCount
);
293 TopGroups
<BytesRef
> newTopGroups
;
295 GroupDocs
<BytesRef
>[] newGroupDocs
= new GroupDocs
[groupsResult
.groups
.length
- offset
];
296 for(int i
= offset
; i
< groupsResult
.groups
.length
; i
++){
297 newGroupDocs
[i
- offset
] = groupsResult
.groups
[i
];
299 newTopGroups
= new TopGroups
<BytesRef
>(
300 groupsResult
.groupSort
,
301 groupsResult
.withinGroupSort
,
302 groupsResult
.totalHitCount
,
303 groupsResult
.totalGroupedHitCount
,
307 newTopGroups
= groupsResult
;
309 TopGroups
<BytesRef
> topGroupsWithMaxScore
= new TopGroups
<>(newTopGroups
, allGroupsCollector
.getGroupCount());
310 // --- done with max score for the group results
312 return topGroupsWithMaxScore
;
316 * expands the query by adding a type restriction if the
317 * <code>cdmTypeRestriction</code> is not <code>NULL</code>
318 * and adds the <code>filter</code> as Boolean query
319 * clause with {@link Occur#FILTER}
321 protected BooleanQuery
expandQuery() {
322 BooleanQuery fullQuery
= null;
323 Builder fullQueryBuilder
= null;
325 if(cdmTypeRestriction
!= null){
326 fullQueryBuilder
= QueryFactory
.addTypeRestriction(query
, cdmTypeRestriction
);
330 if(fullQueryBuilder
== null) {
331 fullQueryBuilder
= new Builder();
332 fullQueryBuilder
.add(this.query
, Occur
.MUST
);
334 fullQueryBuilder
.add(filter
, Occur
.FILTER
);
337 if(fullQueryBuilder
!= null) {
338 fullQuery
= fullQueryBuilder
.build();
340 fullQuery
= this.query
;
343 logger
.debug("expandedQuery: " + fullQuery
.toString());
347 public void setQuery(Query query
) {
348 if( query
instanceof BooleanQuery
) {
349 this.query
= (BooleanQuery
)query
;
351 Builder builder
= new Builder();
352 this.query
= builder
.add(query
, Occur
.MUST
).build();
356 public BooleanQuery
getQuery() {
360 public BooleanQuery
getExpandedQuery() {
365 public SortField
[] getSortFields() {
369 public void setSortFields(SortField
[] sortFields
) {
370 this.sortFields
= sortFields
;
373 public void setHighlightFields(String
[] textFieldNamesAsArray
) {
374 this.highlightFields
= textFieldNamesAsArray
;
377 public String
[] getHighlightFields() {
378 return this.highlightFields
;