3 * Copyright (C) 2011 EDIT
4 * European Distributed Institute of Taxonomy
5 * http://www.e-taxonomy.eu
7 * The contents of this file are subject to the Mozilla Public License Version 1.1
8 * See LICENSE.TXT at the top of this package for the full license terms.
10 package eu
.etaxonomy
.cdm
.api
.service
.search
;
12 import java
.io
.IOException
;
13 import java
.util
.Collection
;
15 import org
.apache
.log4j
.Logger
;
16 import org
.apache
.lucene
.analysis
.Analyzer
;
17 import org
.apache
.lucene
.index
.IndexReader
;
18 import org
.apache
.lucene
.index
.Term
;
19 import org
.apache
.lucene
.queryParser
.ParseException
;
20 import org
.apache
.lucene
.queryParser
.QueryParser
;
21 import org
.apache
.lucene
.search
.BooleanClause
;
22 import org
.apache
.lucene
.search
.BooleanQuery
;
23 import org
.apache
.lucene
.search
.IndexSearcher
;
24 import org
.apache
.lucene
.search
.MultiCollector
;
25 import org
.apache
.lucene
.search
.Query
;
26 import org
.apache
.lucene
.search
.Sort
;
27 import org
.apache
.lucene
.search
.SortField
;
28 import org
.apache
.lucene
.search
.TermQuery
;
29 import org
.apache
.lucene
.search
.TopDocs
;
30 import org
.apache
.lucene
.search
.grouping
.GroupDocs
;
31 import org
.apache
.lucene
.search
.grouping
.SearchGroup
;
32 import org
.apache
.lucene
.search
.grouping
.TermAllGroupsCollector
;
33 import org
.apache
.lucene
.search
.grouping
.TermFirstPassGroupingCollector
;
34 import org
.apache
.lucene
.search
.grouping
.TermSecondPassGroupingCollector
;
35 import org
.apache
.lucene
.search
.grouping
.TopGroups
;
36 import org
.hibernate
.Session
;
37 import org
.hibernate
.search
.ProjectionConstants
;
38 import org
.hibernate
.search
.Search
;
39 import org
.hibernate
.search
.SearchFactory
;
41 import eu
.etaxonomy
.cdm
.config
.Configuration
;
42 import eu
.etaxonomy
.cdm
.hibernate
.search
.GroupByTaxonClassBridge
;
43 import eu
.etaxonomy
.cdm
.model
.common
.CdmBase
;
44 import eu
.etaxonomy
.cdm
.model
.description
.DescriptionElementBase
;
45 import eu
.etaxonomy
.cdm
.model
.description
.TextData
;
46 import eu
.etaxonomy
.cdm
.model
.name
.TaxonNameBase
;
47 import eu
.etaxonomy
.cdm
.model
.name
.NonViralName
;
48 import eu
.etaxonomy
.cdm
.model
.taxon
.Taxon
;
49 import eu
.etaxonomy
.cdm
.model
.taxon
.TaxonBase
;
53 * @author Andreas Kohlbecker
57 public class LuceneSearch
{
59 private static final String GROUP_BY_FIELD
= GroupByTaxonClassBridge
.GROUPBY_TAXON_FIELD
;
61 public final static String ID_FIELD
= "id";
63 public static final Logger logger
= Logger
.getLogger(LuceneSearch
.class);
65 protected Session session
;
67 protected IndexSearcher searcher
;
69 private SortField
[] sortFields
;
71 private Class
<?
extends CdmBase
> directorySelectClass
;
73 protected Class
<?
extends CdmBase
> getDirectorySelectClass() {
74 return pushAbstractBaseTypeDown(directorySelectClass
);
80 private Class
<?
extends CdmBase
> clazz
;
83 public Class
<?
extends CdmBase
> getClazz() {
88 * Sets the Class to use as filter criterion, in case the supplied Class equals the
89 * <code>directorySelectClass</code> the Class is set to <code>null</code>
92 public void setClazz(Class
<?
extends CdmBase
> clazz
) {
96 * we must not use the getter of directorySelectClass
97 * since we need the abstract base classes here!!!!
99 if(clazz
!= null && clazz
.equals(directorySelectClass
)){
106 * The MAX_HITS_ALLOWED value must be one less than Integer.MAX_VALUE
107 * otherwise PriorityQueue will produce an exception since it
108 * will always add 1 to the maxhits so Integer.MAX_VALUE
109 * would become Integer.MIN_VALUE
111 public final int MAX_HITS_ALLOWED
= 10000;
113 protected Query query
;
115 protected String
[] highlightFields
= new String
[0];
117 private int maxDocsPerGroup
= 10;
120 public int getMaxDocsPerGroup() {
121 return maxDocsPerGroup
;
124 public void setMaxDocsPerGroup(int maxDocsPerGroup
) {
125 this.maxDocsPerGroup
= maxDocsPerGroup
;
/**
 * Creates a search which operates on the index selected by the given type.
 *
 * @param session the hibernate session from which the full text session is obtained
 * @param directorySelectClass the type which selects the index to search in
 */
public LuceneSearch(Session session, Class<? extends CdmBase> directorySelectClass) {
    this.directorySelectClass = directorySelectClass;
    this.session = session;
}
137 * TODO the abstract base class DescriptionElementBase can not be used, so
138 * we are using an arbitraty subclass to find the DirectoryProvider, future
139 * versions of hibernate search my allow using abstract base classes see
141 * ://stackoverflow.com/questions/492184/how-do-you-find-all-subclasses-of
142 * -a-given-class-in-java
144 * @param type must not be null
147 protected Class
<?
extends CdmBase
> pushAbstractBaseTypeDown(Class
<?
extends CdmBase
> type
) {
148 if (type
.equals(DescriptionElementBase
.class)) {
149 type
= TextData
.class;
151 if (type
.equals(TaxonBase
.class)) {
154 if (type
.equals(TaxonNameBase
.class)) {
155 type
= NonViralName
.class;
160 protected LuceneSearch() {
167 public IndexSearcher
getSearcher() {
168 if(searcher
== null){
169 searcher
= new IndexSearcher(getIndexReader());
170 searcher
.setDefaultFieldSortScoring(true, true);
178 public IndexReader
getIndexReader() {
179 SearchFactory searchFactory
= Search
.getFullTextSession(session
).getSearchFactory();
182 // DirectoryProvider[] directoryProviders = searchFactory.getDirectoryProviders(getDirectorySelectClass());
183 // logger.info(directoryProviders[0].getDirectory().toString());
185 // ReaderProvider readerProvider = searchFactory.getReaderProvider();
186 // IndexReader reader = readerProvider.openReader(directoryProviders[0]);
188 IndexReader reader
= searchFactory
.getIndexReaderAccessor().open(getDirectorySelectClass());
195 public QueryParser
getQueryParser() {
196 Analyzer analyzer
= getAnalyzer();
197 QueryParser parser
= new QueryParser(Configuration
.luceneVersion
, "titleCache", analyzer
);
204 public Analyzer
getAnalyzer() {
205 SearchFactory searchFactory
= Search
.getFullTextSession(session
).getSearchFactory();
206 Analyzer analyzer
= searchFactory
.getAnalyzer(getDirectorySelectClass());
211 * @param luceneQueryString
212 * @param clazz the type as additional filter criterion
213 * @param pageSize if the page size is null or in an invalid range it will be set to MAX_HITS_ALLOWED
214 * @param pageNumber a 0-based index of the page to return, will default to 0 if null or negative.
216 * @throws ParseException
217 * @throws IOException
219 public TopGroupsWithMaxScore
executeSearch(String luceneQueryString
, Integer pageSize
, Integer pageNumber
) throws ParseException
, IOException
{
221 Query luceneQuery
= parse(luceneQueryString
);
222 this.query
= luceneQuery
;
224 return executeSearch(pageSize
, pageNumber
);
228 * @param luceneQueryString
230 * @throws ParseException
232 public Query
parse(String luceneQueryString
) throws ParseException
{
233 logger
.debug("luceneQueryString to be parsed: " + luceneQueryString
);
234 Query luceneQuery
= getQueryParser().parse(luceneQueryString
);
241 * @throws IOException
243 public TopDocs
executeSearch(int maxNoOfHits
) throws IOException
{
244 Query fullQuery
= expandQuery();
245 logger
.info("lucene query string to be parsed: " + fullQuery
.toString());
246 return getSearcher().search(fullQuery
, maxNoOfHits
);
251 * @param clazz the type as additional filter criterion
252 * @param pageSize if the page size is null or in an invalid range it will be set to MAX_HITS_ALLOWED
253 * @param pageNumber a 0-based index of the page to return, will default to 0 if null or negative.
255 * @throws ParseException
256 * @throws IOException
258 public TopGroupsWithMaxScore
executeSearch(Integer pageSize
, Integer pageNumber
) throws ParseException
, IOException
{
261 if(pageNumber
== null || pageNumber
< 0){
264 if(pageSize
== null || pageSize
<= 0 || pageSize
> MAX_HITS_ALLOWED
){
265 pageSize
= MAX_HITS_ALLOWED
;
266 logger
.info("limiting pageSize to MAX_HITS_ALLOWED = " + MAX_HITS_ALLOWED
+ " items");
269 Query fullQuery
= expandQuery();
270 logger
.info("final query: " + fullQuery
.toString());
272 int offset
= pageNumber
* pageSize
;
273 int limit
= (pageNumber
+ 1) * pageSize
- 1 ;
274 logger
.debug("start: " + offset
+ "; limit:" + limit
);
277 Sort groupSort
= null;
278 Sort withinGroupSort
= Sort
.RELEVANCE
;
279 if(sortFields
!= null && sortFields
.length
> 0){
280 if(sortFields
[0] != SortField
.FIELD_SCORE
){
281 throw new RuntimeException("Fist sort field must be SortField.FIELD_SCORE");
283 groupSort
= new Sort(sortFields
);
285 groupSort
= Sort
.RELEVANCE
; // == SortField.FIELD_SCORE !!
288 // perform the search (needs two passes for grouping)
290 TermFirstPassGroupingCollector firstPassCollector
= new TermFirstPassGroupingCollector(GROUP_BY_FIELD
, withinGroupSort
, limit
);
291 getSearcher().search(fullQuery
, firstPassCollector
);
292 Collection
<SearchGroup
<String
>> topGroups
= firstPassCollector
.getTopGroups(0, true); // no offset here since we need the first item for the max score
294 if (topGroups
== null) {
298 boolean getScores
= true;
299 boolean getMaxScores
= true;
300 boolean fillFields
= true;
301 TermAllGroupsCollector allGroupsCollector
= new TermAllGroupsCollector(GROUP_BY_FIELD
);
302 TermSecondPassGroupingCollector secondPassCollector
= new TermSecondPassGroupingCollector(GROUP_BY_FIELD
, topGroups
, groupSort
, withinGroupSort
, maxDocsPerGroup
, getScores
, getMaxScores
, fillFields
);
303 getSearcher().search(fullQuery
, MultiCollector
.wrap(secondPassCollector
, allGroupsCollector
));
305 TopGroups
<String
> groupsResult
= secondPassCollector
.getTopGroups(0); // no offset here since we need the first item for the max score
307 // get max score from very first result
308 float maxScore
= groupsResult
.groups
[0].maxScore
;
309 TopGroupsWithMaxScore topGroupsWithMaxScore
= new TopGroupsWithMaxScore(groupsResult
, offset
, allGroupsCollector
.getGroupCount(), maxScore
);
311 return topGroupsWithMaxScore
;
317 protected Query
expandQuery() {
320 BooleanQuery filteredQuery
= new BooleanQuery();
321 BooleanQuery classFilter
= new BooleanQuery();
323 Term t
= new Term(ProjectionConstants
.OBJECT_CLASS
, clazz
.getName());
324 TermQuery termQuery
= new TermQuery(t
);
326 classFilter
.setBoost(0);
327 classFilter
.add(termQuery
, BooleanClause
.Occur
.SHOULD
);
329 filteredQuery
.add(this.query
, BooleanClause
.Occur
.MUST
);
330 filteredQuery
.add(classFilter
, BooleanClause
.Occur
.MUST
);
332 fullQuery
= filteredQuery
;
334 fullQuery
= this.query
;
339 public void setQuery(Query query
) {
343 public Query
getQuery() {
347 public Query
getExpandedQuery() {
352 public SortField
[] getSortFields() {
356 public void setSortFields(SortField
[] sortFields
) {
357 this.sortFields
= sortFields
;
360 public void setHighlightFields(String
[] textFieldNamesAsArray
) {
361 this.highlightFields
= textFieldNamesAsArray
;
364 public String
[] getHighlightFields() {
365 return this.highlightFields
;
369 * may become obsolete with lucene 4.x when the TopGroups has a field for maxScore.
371 * @author a.kohlbecker
375 public class TopGroupsWithMaxScore
{
376 public TopGroups
<String
> topGroups
;
377 public float maxScore
= Float
.NaN
;
379 TopGroupsWithMaxScore(TopGroups
<String
> topGroups
, int offset
, int totalGroupCount
, float maxScore
){
380 this.maxScore
= maxScore
;
381 TopGroups
<String
> newTopGroups
;
383 GroupDocs
<String
>[] newGroupDocs
= new GroupDocs
[topGroups
.groups
.length
- offset
];
384 for(int i
= offset
; i
< topGroups
.groups
.length
; i
++){
385 newGroupDocs
[i
- offset
] = topGroups
.groups
[i
];
387 newTopGroups
= new TopGroups
<String
>(
389 topGroups
.withinGroupSort
,
390 topGroups
.totalHitCount
,
391 topGroups
.totalGroupedHitCount
,
394 newTopGroups
= topGroups
;
396 this.topGroups
= new TopGroups
<String
>(newTopGroups
, totalGroupCount
);