3 * Copyright (C) 2011 EDIT
4 * European Distributed Institute of Taxonomy
5 * http://www.e-taxonomy.eu
7 * The contents of this file are subject to the Mozilla Public License Version 1.1
8 * See LICENSE.TXT at the top of this package for the full license terms.
10 package eu
.etaxonomy
.cdm
.api
.service
.search
;
12 import java
.io
.IOException
;
13 import java
.util
.Collection
;
15 import org
.apache
.log4j
.Logger
;
16 import org
.apache
.lucene
.analysis
.Analyzer
;
17 import org
.apache
.lucene
.index
.IndexReader
;
18 import org
.apache
.lucene
.index
.Term
;
19 import org
.apache
.lucene
.queryParser
.ParseException
;
20 import org
.apache
.lucene
.queryParser
.QueryParser
;
21 import org
.apache
.lucene
.search
.BooleanClause
;
22 import org
.apache
.lucene
.search
.BooleanQuery
;
23 import org
.apache
.lucene
.search
.Filter
;
24 import org
.apache
.lucene
.search
.IndexSearcher
;
25 import org
.apache
.lucene
.search
.MultiCollector
;
26 import org
.apache
.lucene
.search
.Query
;
27 import org
.apache
.lucene
.search
.Sort
;
28 import org
.apache
.lucene
.search
.SortField
;
29 import org
.apache
.lucene
.search
.TermQuery
;
30 import org
.apache
.lucene
.search
.TopDocs
;
31 import org
.apache
.lucene
.search
.grouping
.GroupDocs
;
32 import org
.apache
.lucene
.search
.grouping
.SearchGroup
;
33 import org
.apache
.lucene
.search
.grouping
.TermAllGroupsCollector
;
34 import org
.apache
.lucene
.search
.grouping
.TermFirstPassGroupingCollector
;
35 import org
.apache
.lucene
.search
.grouping
.TermSecondPassGroupingCollector
;
36 import org
.apache
.lucene
.search
.grouping
.TopGroups
;
37 import org
.hibernate
.Session
;
38 import org
.hibernate
.search
.ProjectionConstants
;
39 import org
.hibernate
.search
.Search
;
40 import org
.hibernate
.search
.SearchFactory
;
42 import eu
.etaxonomy
.cdm
.config
.Configuration
;
43 import eu
.etaxonomy
.cdm
.model
.common
.CdmBase
;
44 import eu
.etaxonomy
.cdm
.model
.description
.DescriptionElementBase
;
45 import eu
.etaxonomy
.cdm
.model
.description
.TextData
;
46 import eu
.etaxonomy
.cdm
.model
.name
.NonViralName
;
47 import eu
.etaxonomy
.cdm
.model
.name
.TaxonNameBase
;
48 import eu
.etaxonomy
.cdm
.model
.taxon
.Taxon
;
49 import eu
.etaxonomy
.cdm
.model
.taxon
.TaxonBase
;
53 * @author Andreas Kohlbecker
57 public class LuceneSearch
{
59 protected String groupByField
= "id";
61 public final static String ID_FIELD
= "id";
63 public static final Logger logger
= Logger
.getLogger(LuceneSearch
.class);
65 protected Session session
;
67 protected IndexSearcher searcher
;
69 protected SortField
[] sortFields
;
71 private Class
<?
extends CdmBase
> directorySelectClass
;
73 private Filter filter
= null;
75 protected Class
<?
extends CdmBase
> getDirectorySelectClass() {
76 return pushAbstractBaseTypeDown(directorySelectClass
);
// Optional type used as additional filter criterion: expandQuery() adds a term on
// ProjectionConstants.OBJECT_CLASS with the name of this class to the query.
82 protected Class
<?
extends CdmBase
> clazz
;
85 public Class
<?
extends CdmBase
> getClazz() {
92 public Filter
getFilter() {
97 * @param filter the filter to set
99 public void setFilter(Filter filter
) {
100 this.filter
= filter
;
104 * Sets the Class to use as filter criterion, in case the supplied Class equals the
105 * <code>directorySelectClass</code> the Class is set to <code>null</code>
// Setter for the type which expandQuery() uses as additional filter criterion.
// NOTE(review): the statements inside and after the if-block are not visible in this view;
// according to the comment preceding this method a class equal to directorySelectClass
// is presumably replaced by null before it is stored - confirm.
108 public void setClazz(Class
<?
extends CdmBase
> clazz
) {
112 * we must not use the getter of directorySelectClass
113 * since we need the abstract base classes here!!!!
// the comparison uses the raw field: getDirectorySelectClass() would hand back the
// subclass substituted by pushAbstractBaseTypeDown() instead of the abstract base type
115 if(clazz
!= null && clazz
.equals(directorySelectClass
)){
122 * The MAX_HITS_ALLOWED value must be one less than Integer.MAX_VALUE
123 * otherwise PriorityQueue will produce an exception since it
124 * will always add 1 to the maxhits so Integer.MAX_VALUE
125 * would become Integer.MIN_VALUE
127 public final int MAX_HITS_ALLOWED
= 10000;
129 protected Query query
;
131 protected String
[] highlightFields
= new String
[0];
133 private int maxDocsPerGroup
= 10;
136 public int getMaxDocsPerGroup() {
137 return maxDocsPerGroup
;
140 public void setMaxDocsPerGroup(int maxDocsPerGroup
) {
141 this.maxDocsPerGroup
= maxDocsPerGroup
;
/**
 * Creates a search which groups the hits by the default group field.
 *
 * @param session
 *            the Hibernate session used to access the full text index
 * @param directorySelectClass
 *            the type which selects the index to search in
 */
public LuceneSearch(Session session, Class<? extends CdmBase> directorySelectClass) {
    this.directorySelectClass = directorySelectClass;
    this.session = session;
}
/**
 * Creates a search which groups the hits by the given field.
 *
 * @param session
 *            the Hibernate session used to access the full text index
 * @param groupByField
 *            the name of the index field to group the hits by
 * @param directorySelectClass
 *            the type which selects the index to search in
 */
public LuceneSearch(Session session, String groupByField, Class<? extends CdmBase> directorySelectClass) {
    this(session, directorySelectClass);
    this.groupByField = groupByField;
}
162 * TODO the abstract base class DescriptionElementBase can not be used, so
163 * we are using an arbitraty subclass to find the DirectoryProvider, future
164 * versions of hibernate search my allow using abstract base classes see
165 * {@link http://stackoverflow.com/questions/492184/how-do-you-find-all-subclasses-of-a-given-class-in-java}
167 * @param type must not be null
170 protected Class
<?
extends CdmBase
> pushAbstractBaseTypeDown(Class
<?
extends CdmBase
> type
) {
171 if (type
.equals(DescriptionElementBase
.class)) {
172 type
= TextData
.class;
174 if (type
.equals(TaxonBase
.class)) {
177 if (type
.equals(TaxonNameBase
.class)) {
178 type
= NonViralName
.class;
183 protected LuceneSearch() {
190 public IndexSearcher
getSearcher() {
191 if(searcher
== null){
192 searcher
= new IndexSearcher(getIndexReader());
193 searcher
.setDefaultFieldSortScoring(true, true);
201 public IndexReader
getIndexReader() {
202 SearchFactory searchFactory
= Search
.getFullTextSession(session
).getSearchFactory();
203 IndexReader reader
= searchFactory
.getIndexReaderAccessor().open(getDirectorySelectClass());
210 public IndexReader
getIndexReaderFor(Class
<?
extends CdmBase
> clazz
) {
211 SearchFactory searchFactory
= Search
.getFullTextSession(session
).getSearchFactory();
212 IndexReader reader
= searchFactory
.getIndexReaderAccessor().open(pushAbstractBaseTypeDown(clazz
));
219 public QueryParser
getQueryParser() {
220 Analyzer analyzer
= getAnalyzer();
221 QueryParser parser
= new QueryParser(Configuration
.luceneVersion
, "titleCache", analyzer
);
228 public Analyzer
getAnalyzer() {
229 SearchFactory searchFactory
= Search
.getFullTextSession(session
).getSearchFactory();
230 Analyzer analyzer
= searchFactory
.getAnalyzer(getDirectorySelectClass());
235 * @param luceneQueryString
236 * @param clazz the type as additional filter criterion
237 * @param pageSize if the page size is null or in an invalid range it will be set to MAX_HITS_ALLOWED
238 * @param pageNumber a 0-based index of the page to return, will default to 0 if null or negative.
240 * @throws ParseException
241 * @throws IOException
243 public TopGroupsWithMaxScore
executeSearch(String luceneQueryString
, Integer pageSize
, Integer pageNumber
) throws ParseException
, IOException
{
245 Query luceneQuery
= parse(luceneQueryString
);
246 this.query
= luceneQuery
;
248 return executeSearch(pageSize
, pageNumber
);
252 * @param luceneQueryString
254 * @throws ParseException
256 public Query
parse(String luceneQueryString
) throws ParseException
{
257 logger
.debug("luceneQueryString to be parsed: " + luceneQueryString
);
258 Query luceneQuery
= getQueryParser().parse(luceneQueryString
);
265 * @throws IOException
267 public TopDocs
executeSearch(int maxNoOfHits
) throws IOException
{
268 Query fullQuery
= expandQuery();
269 logger
.info("lucene query string to be parsed: " + fullQuery
.toString());
270 return getSearcher().search(fullQuery
, filter
, maxNoOfHits
);
274 * @param pageSize if the page size is null or in an invalid range it will be set to MAX_HITS_ALLOWED
275 * @param pageNumber a 0-based index of the page to return, will default to 0 if null or negative.
277 * @throws ParseException
278 * @throws IOException
280 public TopGroupsWithMaxScore
executeSearch(Integer pageSize
, Integer pageNumber
) throws ParseException
, IOException
{
283 if(pageNumber
== null || pageNumber
< 0){
286 if(pageSize
== null || pageSize
<= 0 || pageSize
> MAX_HITS_ALLOWED
){
287 pageSize
= MAX_HITS_ALLOWED
;
288 logger
.info("limiting pageSize to MAX_HITS_ALLOWED = " + MAX_HITS_ALLOWED
+ " items");
291 Query fullQuery
= expandQuery();
292 logger
.info("final query: " + fullQuery
.toString());
294 int offset
= pageNumber
* pageSize
;
295 int limit
= (pageNumber
+ 1) * pageSize
- 1 ;
296 logger
.debug("start: " + offset
+ "; limit:" + limit
);
299 Sort groupSort
= null;
300 Sort withinGroupSort
= Sort
.RELEVANCE
;
301 if(sortFields
!= null && sortFields
.length
> 0){
302 if(sortFields
[0] != SortField
.FIELD_SCORE
){
303 throw new RuntimeException("Fist sort field must be SortField.FIELD_SCORE");
305 groupSort
= new Sort(sortFields
);
307 groupSort
= Sort
.RELEVANCE
; // == SortField.FIELD_SCORE !!
310 // perform the search (needs two passes for grouping)
311 if(logger
.isDebugEnabled()){
312 logger
.debug("Grouping: sortFields=" + sortFields
+ ", groupByField=" + groupByField
+
313 ", groupSort=" + groupSort
+ ", withinGroupSort=" + withinGroupSort
+ ", limit=" + limit
+ ", maxDocsPerGroup="+ maxDocsPerGroup
);
316 TermFirstPassGroupingCollector firstPassCollector
= new TermFirstPassGroupingCollector(groupByField
, withinGroupSort
, limit
);
318 getSearcher().search(fullQuery
, filter
, firstPassCollector
);
319 Collection
<SearchGroup
<String
>> topGroups
= firstPassCollector
.getTopGroups(0, true); // no offset here since we need the first item for the max score
321 if (topGroups
== null) {
325 boolean getScores
= true;
326 boolean getMaxScores
= true;
327 boolean fillFields
= true;
328 TermAllGroupsCollector allGroupsCollector
= new TermAllGroupsCollector(groupByField
);
329 TermSecondPassGroupingCollector secondPassCollector
= new TermSecondPassGroupingCollector(
330 groupByField
, topGroups
, groupSort
, withinGroupSort
, maxDocsPerGroup
, getScores
, getMaxScores
, fillFields
332 getSearcher().search(fullQuery
, filter
, MultiCollector
.wrap(secondPassCollector
, allGroupsCollector
));
334 TopGroups
<String
> groupsResult
= secondPassCollector
.getTopGroups(0); // no offset here since we need the first item for the max score
336 // get max score from very first result
337 float maxScore
= groupsResult
.groups
[0].maxScore
;
338 if(logger
.isDebugEnabled()){
339 logger
.debug("TopGroups: maxScore=" + maxScore
+ ", offset=" + offset
+
340 ", totalGroupCount=" + allGroupsCollector
.getGroupCount() + ", totalGroupedHitCount=" + groupsResult
.totalGroupedHitCount
);
342 TopGroupsWithMaxScore topGroupsWithMaxScore
= new TopGroupsWithMaxScore(groupsResult
, offset
, allGroupsCollector
.getGroupCount(), maxScore
);
344 return topGroupsWithMaxScore
;
// Builds the query which is actually executed: either this.query alone or this.query
// combined with a restriction on the class name stored in the index.
// NOTE(review): the declaration of fullQuery, the condition selecting between the two
// assignments below and the return statement are not visible in this view; presumably
// the restricted query is used when clazz is set - confirm.
350 protected Query
expandQuery() {
353 BooleanQuery filteredQuery
= new BooleanQuery();
354 BooleanQuery classFilter
= new BooleanQuery();
// term matching the class name of clazz in the ProjectionConstants.OBJECT_CLASS field
356 Term t
= new Term(ProjectionConstants
.OBJECT_CLASS
, clazz
.getName());
357 TermQuery termQuery
= new TermQuery(t
);
// boost of 0 on the class restriction
// NOTE(review): presumably meant to keep the class match out of the score - confirm
359 classFilter
.setBoost(0);
360 classFilter
.add(termQuery
, BooleanClause
.Occur
.SHOULD
);
// both the original query and the class restriction are added as MUST clauses
362 filteredQuery
.add(this.query
, BooleanClause
.Occur
.MUST
);
363 filteredQuery
.add(classFilter
, BooleanClause
.Occur
.MUST
);
365 fullQuery
= filteredQuery
;
// alternative branch: the query is used as it is
367 fullQuery
= this.query
;
372 public void setQuery(Query query
) {
376 public Query
getQuery() {
380 public Query
getExpandedQuery() {
385 public SortField
[] getSortFields() {
389 public void setSortFields(SortField
[] sortFields
) {
390 this.sortFields
= sortFields
;
393 public void setHighlightFields(String
[] textFieldNamesAsArray
) {
394 this.highlightFields
= textFieldNamesAsArray
;
397 public String
[] getHighlightFields() {
398 return this.highlightFields
;
402 * may become obsolete with lucene 4.x when the TopGroups has a field for maxScore.
404 * @author a.kohlbecker
408 public class TopGroupsWithMaxScore
{
409 public TopGroups
<String
> topGroups
;
410 public float maxScore
= Float
.NaN
;
412 TopGroupsWithMaxScore(TopGroups
<String
> topGroups
, int offset
, int totalGroupCount
, float maxScore
){
413 this.maxScore
= maxScore
;
414 TopGroups
<String
> newTopGroups
;
416 GroupDocs
<String
>[] newGroupDocs
= new GroupDocs
[topGroups
.groups
.length
- offset
];
417 for(int i
= offset
; i
< topGroups
.groups
.length
; i
++){
418 newGroupDocs
[i
- offset
] = topGroups
.groups
[i
];
420 newTopGroups
= new TopGroups
<String
>(
422 topGroups
.withinGroupSort
,
423 topGroups
.totalHitCount
,
424 topGroups
.totalGroupedHitCount
,
427 newTopGroups
= topGroups
;
429 this.topGroups
= new TopGroups
<String
>(newTopGroups
, totalGroupCount
);