package eu.etaxonomy.cdm.api.service.search;
import java.io.IOException;
+import java.util.Collection;
import org.apache.log4j.Logger;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
-import org.apache.lucene.search.Hits;
+import org.apache.lucene.search.Filter;
import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.MultiCollector;
import org.apache.lucene.search.Query;
-import org.apache.lucene.search.ScoreDoc;
-import org.apache.lucene.search.Searcher;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
+import org.apache.lucene.search.grouping.GroupDocs;
+import org.apache.lucene.search.grouping.SearchGroup;
+import org.apache.lucene.search.grouping.TermAllGroupsCollector;
+import org.apache.lucene.search.grouping.TermFirstPassGroupingCollector;
+import org.apache.lucene.search.grouping.TermSecondPassGroupingCollector;
+import org.apache.lucene.search.grouping.TopGroups;
import org.hibernate.Session;
+import org.hibernate.search.ProjectionConstants;
import org.hibernate.search.Search;
import org.hibernate.search.SearchFactory;
-import org.hibernate.search.engine.DocumentBuilder;
-import org.hibernate.search.reader.ReaderProvider;
-import org.hibernate.search.store.DirectoryProvider;
+import eu.etaxonomy.cdm.config.Configuration;
import eu.etaxonomy.cdm.model.common.CdmBase;
import eu.etaxonomy.cdm.model.description.DescriptionElementBase;
import eu.etaxonomy.cdm.model.description.TextData;
+import eu.etaxonomy.cdm.model.name.NonViralName;
+import eu.etaxonomy.cdm.model.name.TaxonNameBase;
import eu.etaxonomy.cdm.model.taxon.Taxon;
import eu.etaxonomy.cdm.model.taxon.TaxonBase;
*/
public class LuceneSearch {
// Name of the index field by which hits are grouped; it is handed to the grouping
// collectors in executeSearch(Integer, Integer). Defaults to "id".
+ protected String groupByField = "id";
+
// Constant holding the field name "id", the same value as the default of groupByField.
+ public final static String ID_FIELD = "id";
+
public static final Logger logger = Logger.getLogger(LuceneSearch.class);
// Hibernate session from which the full text session and the index readers are obtained.
protected Session session;
// Created lazily by getSearcher().
- protected Searcher searcher;
+ protected IndexSearcher searcher;
// Sort fields used for sorting the groups; when null or empty the groups are sorted by relevance.
- private SortField[] sortFields;
+ protected SortField[] sortFields;
// Type used to select the index to read from, see getDirectorySelectClass().
private Class<? extends CdmBase> directorySelectClass;
// Optional Lucene filter which is passed along with the query on every search call.
+ private Filter filter = null;
+
/**
 * Provides the type which is used to select the index to read from.
 *
 * @return the <code>directorySelectClass</code> after it has been passed through
 *         <code>pushAbstractBaseTypeDown</code>
 */
protected Class<? extends CdmBase> getDirectorySelectClass() {
    final Class<? extends CdmBase> selectType = pushAbstractBaseTypeDown(directorySelectClass);
    return selectType;
}
/**
* classFilter
*/
- private Class<? extends CdmBase> clazz;
+ protected Class<? extends CdmBase> clazz;
/**
 * Provides the class which is used as additional filter criterion.
 *
 * @return the class filter, may be <code>null</code>
 */
public Class<? extends CdmBase> getClazz() {
    return this.clazz;
}
+ /**
+  * Provides the Lucene filter which is passed to the searcher together with the query.
+  *
+  * @return the filter, <code>null</code> if none has been set
+  */
+ public Filter getFilter() {
+     return this.filter;
+ }
+
+ /**
+ * Sets the Lucene filter which is passed to the searcher together with the query
+ * by the <code>executeSearch</code> methods.
+ *
+ * @param filter the filter to set, the field is <code>null</code> as long as
+ *        no filter has been set
+ */
+ public void setFilter(Filter filter) {
+ this.filter = filter;
+ }
+
/**
* Sets the Class to use as filter criterion, in case the supplied Class equals the
* <code>directorySelectClass</code> the Class is set to <code>null</code>
protected String[] highlightFields = new String[0];
+ private int maxDocsPerGroup = 10;
+
+
+ /**
+  * Provides the value which is handed as <code>maxDocsPerGroup</code> to the
+  * second pass grouping collector.
+  *
+  * @return the maximum number of documents per group, 10 unless changed
+  */
+ public int getMaxDocsPerGroup() {
+     return this.maxDocsPerGroup;
+ }
+
/**
 * Sets the value which is handed as <code>maxDocsPerGroup</code> to the
 * second pass grouping collector in <code>executeSearch(Integer, Integer)</code>.
 *
 * @param maxDocsPerGroup the new value, it is stored without validation
 */
+ public void setMaxDocsPerGroup(int maxDocsPerGroup) {
+ this.maxDocsPerGroup = maxDocsPerGroup;
+ }
/**
* @param session
this.directorySelectClass = directorySelectClass;
}
+ /**
+  * Creates a search whose results are grouped by the given index field.
+  *
+  * @param session the hibernate session used to access the full text session
+  * @param groupByField the name of the index field by which the hits are grouped
+  * @param directorySelectClass the type used to select the index to read from
+  */
+ public LuceneSearch(Session session, String groupByField, Class<? extends CdmBase> directorySelectClass) {
+     this.groupByField = groupByField;
+     this.directorySelectClass = directorySelectClass;
+     this.session = session;
+ }
+
/**
* TODO the abstract base class DescriptionElementBase can not be used, so
 * we are using an arbitrary subclass to find the DirectoryProvider, future
 * versions of hibernate search may allow using abstract base classes see
- * http
- * ://stackoverflow.com/questions/492184/how-do-you-find-all-subclasses-of
- * -a-given-class-in-java
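+ * <a href="http://stackoverflow.com/questions/492184/how-do-you-find-all-subclasses-of-a-given-class-in-java">http://stackoverflow.com/questions/492184/how-do-you-find-all-subclasses-of-a-given-class-in-java</a>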
*
* @param type must not be null
* @return
if (type.equals(TaxonBase.class)) {
type = Taxon.class;
}
+ if (type.equals(TaxonNameBase.class)) {
+ type = NonViralName.class;
+ }
return type;
}
/**
 * Provides the searcher of this search. The searcher is created on first
 * access on top of the reader supplied by <code>getIndexReader()</code> and
 * the same instance is returned by all subsequent calls.
 *
 * @return the lazily created searcher
 */
- public Searcher getSearcher() {
+ public IndexSearcher getSearcher() {
if(searcher == null){
// NOTE(review): nothing in the visible code closes this searcher or the reader it
// wraps - confirm that their life cycle is handled elsewhere
searcher = new IndexSearcher(getIndexReader());
// per the Lucene 3.x API the two flags turn on score tracking and max score
// tracking for searches which are sorted by fields
+ searcher.setDefaultFieldSortScoring(true, true);
}
return searcher;
}
*/
public IndexReader getIndexReader() {
// the search factory is taken from the full text session which wraps the hibernate session
SearchFactory searchFactory = Search.getFullTextSession(session).getSearchFactory();
// opens a reader for the type returned by getDirectorySelectClass()
// NOTE(review): readers obtained from the IndexReaderAccessor are presumably expected to be
// handed back through its close method; no such call is visible here - confirm
+ IndexReader reader = searchFactory.getIndexReaderAccessor().open(getDirectorySelectClass());
+ return reader;
+ }
- DirectoryProvider[] directoryProviders = searchFactory.getDirectoryProviders(getDirectorySelectClass());
- logger.info(directoryProviders[0].getDirectory().toString());
-
- ReaderProvider readerProvider = searchFactory.getReaderProvider();
- IndexReader reader = readerProvider.openReader(directoryProviders[0]);
+ /**
+ * Opens an index reader for the given type. The type is passed through
+ * <code>pushAbstractBaseTypeDown</code> before the reader is opened.
+ *
+ * @param clazz the type to open the reader for; note that this parameter
+ *        shadows the <code>clazz</code> field of this class
+ * @return the reader opened by the <code>IndexReaderAccessor</code>
+ */
+ public IndexReader getIndexReaderFor(Class<? extends CdmBase> clazz) {
+ SearchFactory searchFactory = Search.getFullTextSession(session).getSearchFactory();
// NOTE(review): no call which closes the opened reader is visible in this class - confirm
+ IndexReader reader = searchFactory.getIndexReaderAccessor().open(pushAbstractBaseTypeDown(clazz));
return reader;
}
*/
public QueryParser getQueryParser() {
// the parser uses the analyzer supplied by getAnalyzer()
Analyzer analyzer = getAnalyzer();
// "titleCache" is handed over as the field argument of the parser, which in the
// Lucene QueryParser API is the default field for terms without a field prefix
- QueryParser parser = new QueryParser("titleCache", analyzer);
+ QueryParser parser = new QueryParser(Configuration.luceneVersion, "titleCache", analyzer);
return parser;
}
* @throws ParseException
* @throws IOException
*/
- public TopDocs executeSearch(String luceneQueryString, Integer pageSize, Integer pageNumber) throws ParseException, IOException {
+ public TopGroupsWithMaxScore executeSearch(String luceneQueryString, Integer pageSize, Integer pageNumber) throws ParseException, IOException {
Query luceneQuery = parse(luceneQueryString);
this.query = luceneQuery;
}
/**
- * @param luceneQuery
- * @param clazz the type as additional filter criterion
+ * @param maxNoOfHits
+ * @return
+ * @throws IOException
+ */
+ public TopDocs executeSearch(int maxNoOfHits) throws IOException {
+     // plain, ungrouped search: the expanded query is run together with the
+     // optional filter and at most maxNoOfHits hits are requested
+     final Query expandedQuery = expandQuery();
+     logger.info("lucene query string to be parsed: " + expandedQuery.toString());
+     final IndexSearcher indexSearcher = getSearcher();
+     return indexSearcher.search(expandedQuery, filter, maxNoOfHits);
+ }
+ /**
* @param pageSize if the page size is null or in an invalid range it will be set to MAX_HITS_ALLOWED
* @param pageNumber a 0-based index of the page to return, will default to 0 if null or negative.
* @return
* @throws ParseException
* @throws IOException
*/
- public TopDocs executeSearch(Integer pageSize, Integer pageNumber) throws ParseException, IOException {
+ public TopGroupsWithMaxScore executeSearch(Integer pageSize, Integer pageNumber) throws ParseException, IOException {
// Runs the expanded query as a two pass grouping search: the first pass determines the top
// groups, the second pass collects the documents of these groups and, through a second
// collector, the total number of groups. Returns null when the first pass yields no groups.
if(pageNumber == null || pageNumber < 0){
}
Query fullQuery = expandQuery();
-
logger.info("final query: " + fullQuery.toString());
- int start = pageNumber * pageSize;
+ int offset = pageNumber * pageSize;
// NOTE(review): the "- 1" makes limit one short of (pageNumber + 1) * pageSize. limit is
// handed to the first pass collector below, apparently as the number of top groups to
// collect, so the last group of the requested page seems to get lost and pageSize == 1 on
// page 0 results in limit == 0 - confirm and fix.
int limit = (pageNumber + 1) * pageSize - 1 ;
+ logger.debug("start: " + offset + "; limit:" + limit);
- logger.debug("start: " + start + "; limit:" + limit);
-
- TopDocs topDocs;
+ // sorting
+ Sort groupSort = null;
+ Sort withinGroupSort = Sort.RELEVANCE;
if(sortFields != null && sortFields.length > 0){
- Sort sort = new Sort(sortFields);
- topDocs = getSearcher().search(fullQuery, null, limit, sort);
// NOTE(review): this is an identity comparison, a SortField which is equal to but not the
// same instance as SortField.FIELD_SCORE is rejected.
+ if(sortFields[0] != SortField.FIELD_SCORE){
+ throw new RuntimeException("Fist sort field must be SortField.FIELD_SCORE");
+ }
+ groupSort = new Sort(sortFields);
} else {
- topDocs = getSearcher().search(fullQuery, null, limit);
+ groupSort = Sort.RELEVANCE; // == SortField.FIELD_SCORE !!
}
+ // perform the search (needs two passes for grouping)
// NOTE(review): concatenating the sortFields array prints the default Object.toString()
// of the array, not its elements.
+ if(logger.isDebugEnabled()){
+ logger.debug("Grouping: sortFields=" + sortFields + ", groupByField=" + groupByField +
+ ", groupSort=" + groupSort + ", withinGroupSort=" + withinGroupSort + ", limit=" + limit + ", maxDocsPerGroup="+ maxDocsPerGroup);
+ }
+ // - first pass
// NOTE(review): withinGroupSort is passed here whereas the second pass below receives
// groupSort as well as withinGroupSort - verify that the first pass is not meant to use groupSort.
+ TermFirstPassGroupingCollector firstPassCollector = new TermFirstPassGroupingCollector(groupByField, withinGroupSort, limit);
- //TODO when switched to Lucene 3.x which is included in hibernate 4.x
- // use TopDocCollector.topDocs(int start, int howMany);
- // since this method might be more memory save than our own implementation
- //
- // ALSO READ http://dev.e-taxonomy.eu/trac/ticket/3118 !!!
- //
-// TopDocs topDocs = hitCollector.topDocs();
- ScoreDoc[] scoreDocs = topDocs.scoreDocs;
-
- int docsAvailableInPage = Math.min(scoreDocs.length - start, pageSize);
- logger.debug("docsAvailableInPage:" + docsAvailableInPage);
+ getSearcher().search(fullQuery, filter , firstPassCollector);
+ Collection<SearchGroup<String>> topGroups = firstPassCollector.getTopGroups(0, true); // no offset here since we need the first item for the max score
- ScoreDoc[] pagedDocs = new ScoreDoc[docsAvailableInPage];
- for(int i = 0; i < docsAvailableInPage; i++){
- pagedDocs[i] = scoreDocs[start + i];
// callers have to be prepared for a null result when the first pass did not yield any group
+ if (topGroups == null) {
+ return null;
+ }
+ // - second pass
+ boolean getScores = true;
+ boolean getMaxScores = true;
+ boolean fillFields = true;
// the all groups collector runs alongside the second pass, its group count is reported below
+ TermAllGroupsCollector allGroupsCollector = new TermAllGroupsCollector(groupByField);
+ TermSecondPassGroupingCollector secondPassCollector = new TermSecondPassGroupingCollector(
+ groupByField, topGroups, groupSort, withinGroupSort, maxDocsPerGroup , getScores, getMaxScores, fillFields
+ );
+ getSearcher().search(fullQuery, filter, MultiCollector.wrap(secondPassCollector, allGroupsCollector));
+
+ TopGroups<String> groupsResult = secondPassCollector.getTopGroups(0); // no offset here since we need the first item for the max score
+
+ // get max score from very first result
// assumes groupsResult.groups is never empty once topGroups is not null - TODO confirm
+ float maxScore = groupsResult.groups[0].maxScore;
+ if(logger.isDebugEnabled()){
+ logger.debug("TopGroups: maxScore=" + maxScore + ", offset=" + offset +
+ ", totalGroupCount=" + allGroupsCollector.getGroupCount() + ", totalGroupedHitCount=" + groupsResult.totalGroupedHitCount);
}
- TopDocs pagedTopDocs = new TopDocs(topDocs.totalHits, pagedDocs, topDocs.getMaxScore());
- //
- /////////////////////////////////////////////
// the offset is applied by the TopGroupsWithMaxScore constructor, not by the collectors
+ TopGroupsWithMaxScore topGroupsWithMaxScore = new TopGroupsWithMaxScore(groupsResult, offset, allGroupsCollector.getGroupCount(), maxScore);
- return pagedTopDocs;
+ return topGroupsWithMaxScore;
}
/**
BooleanQuery filteredQuery = new BooleanQuery();
BooleanQuery classFilter = new BooleanQuery();
- Term t = new Term(DocumentBuilder.CLASS_FIELDNAME, clazz.getName());
+ Term t = new Term(ProjectionConstants.OBJECT_CLASS, clazz.getName());
TermQuery termQuery = new TermQuery(t);
classFilter.setBoost(0);
/**
 * Sets the array held in the <code>highlightFields</code> field.
 *
 * @param textFieldNamesAsArray the array to use, it is stored as is without
 *        being copied
 */
public void setHighlightFields(String[] textFieldNamesAsArray) {
this.highlightFields = textFieldNamesAsArray;
-
}
/**
 * Provides the array held in the <code>highlightFields</code> field.
 *
 * @return the internal array itself, not a copy
 */
public String[] getHighlightFields() {
    return highlightFields;
}
+ /**
+ * may become obsolete with lucene 4.x when the TopGroups has a field for maxScore.
+ *
+ * @author a.kohlbecker
+ * @date Oct 4, 2012
+ *
+ */
+ public class TopGroupsWithMaxScore{
+ public TopGroups<String> topGroups;
+ public float maxScore = Float.NaN;
+
+ TopGroupsWithMaxScore(TopGroups<String> topGroups, int offset, int totalGroupCount, float maxScore){
+ this.maxScore = maxScore;
+ TopGroups<String> newTopGroups;
+ if(offset > 0){
+ GroupDocs<String>[] newGroupDocs = new GroupDocs[topGroups.groups.length - offset];
+ for(int i = offset; i < topGroups.groups.length; i++){
+ newGroupDocs[i - offset] = topGroups.groups[i];
+ }
+ newTopGroups = new TopGroups<String>(
+ topGroups.groupSort,
+ topGroups.withinGroupSort,
+ topGroups.totalHitCount,
+ topGroups.totalGroupedHitCount,
+ newGroupDocs);
+ } else {
+ newTopGroups = topGroups;
+ }
+ this.topGroups = new TopGroups<String>(newTopGroups, totalGroupCount);
+ }
+
+ }
+
}