package eu.etaxonomy.cdm.api.service.search;
import java.io.IOException;
+import java.util.Arrays;
+import java.util.Collection;
import org.apache.log4j.Logger;
import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.index.Term;
-import org.apache.lucene.queryParser.ParseException;
-import org.apache.lucene.queryParser.QueryParser;
-import org.apache.lucene.search.BooleanClause;
+import org.apache.lucene.queryparser.classic.ParseException;
+import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BooleanQuery;
-import org.apache.lucene.search.Hits;
+import org.apache.lucene.search.BooleanQuery.Builder;
+import org.apache.lucene.search.Filter;
import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.MultiCollector;
import org.apache.lucene.search.Query;
-import org.apache.lucene.search.ScoreDoc;
-import org.apache.lucene.search.Searcher;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
-import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
-import org.hibernate.Session;
-import org.hibernate.search.Search;
-import org.hibernate.search.SearchFactory;
-import org.hibernate.search.engine.DocumentBuilder;
-import org.hibernate.search.reader.ReaderProvider;
-import org.hibernate.search.store.DirectoryProvider;
+import org.apache.lucene.search.grouping.GroupDocs;
+import org.apache.lucene.search.grouping.SearchGroup;
+import org.apache.lucene.search.grouping.TopGroups;
+import org.apache.lucene.search.grouping.term.TermAllGroupsCollector;
+import org.apache.lucene.search.grouping.term.TermFirstPassGroupingCollector;
+import org.apache.lucene.search.grouping.term.TermSecondPassGroupingCollector;
+import org.apache.lucene.util.BytesRef;
import eu.etaxonomy.cdm.model.common.CdmBase;
import eu.etaxonomy.cdm.model.description.DescriptionElementBase;
import eu.etaxonomy.cdm.model.description.TextData;
+import eu.etaxonomy.cdm.model.name.NonViralName;
+import eu.etaxonomy.cdm.model.name.TaxonNameBase;
import eu.etaxonomy.cdm.model.taxon.Taxon;
import eu.etaxonomy.cdm.model.taxon.TaxonBase;
*/
public class LuceneSearch {
+ protected String groupByField = "id"; // index field the hits are grouped by in the paged executeSearch
+
+ public final static String ID_FIELD = "id";
+
public static final Logger logger = Logger.getLogger(LuceneSearch.class);
- protected Session session;
+ protected ILuceneIndexToolProvider toolProvider; // source of the index reader, analyzer and query parser
- protected Searcher searcher;
+ protected IndexSearcher searcher; // created on first use by getSearcher()
- private SortField[] sortFields;
+ protected SortField[] sortFields; // when non-empty, used as the group sort instead of relevance
private Class<? extends CdmBase> directorySelectClass;
+ private Filter filter = null; // handed to the searcher.search calls of this class; null by default
+
protected Class<? extends CdmBase> getDirectorySelectClass() {
return pushAbstractBaseTypeDown(directorySelectClass);
}
/**
* classFilter
+ * <p>
+ * Optional CDM type restriction; when it is not <code>null</code>,
+ * <code>expandQuery()</code> adds it to the query.
*/
- private Class<? extends CdmBase> clazz;
+ protected Class<? extends CdmBase> cdmTypeRestriction;
- public Class<? extends CdmBase> getClazz() {
- return clazz;
+ public Class<? extends CdmBase> getCdmTypRestriction() { // NOTE(review): "Typ" looks like a typo for "Type"; renaming would change the public API
+ return cdmTypeRestriction;
+ }
+
+ /**
+ * Returns the Lucene filter that is handed to the search calls of this instance.
+ *
+ * @return the filter, or <code>null</code> if none has been set
+ */
+ public Filter getFilter() {
+ return filter;
+ }
+
+ /**
+ * Sets the Lucene filter that is handed to the search calls of this instance.
+ *
+ * @param filter the filter to set; the field is <code>null</code> until this method is called
+ */
+ public void setFilter(Filter filter) {
+ this.filter = filter;
}
/**
* <code>directorySelectClass</code> the Class is set to <code>null</code>
* @param clazz
*/
- public void setClazz(Class<? extends CdmBase> clazz) {
+ public void setCdmTypRestriction(Class<? extends CdmBase> clazz) {
/*
* NOTE:
if(clazz != null && clazz.equals(directorySelectClass)){
clazz = null;
}
- this.clazz = clazz;
+ this.cdmTypeRestriction = clazz;
}
/**
*/
public final int MAX_HITS_ALLOWED = 10000;
- protected Query query;
+ protected BooleanQuery query;
protected String[] highlightFields = new String[0];
+ private int maxDocsPerGroup = 10; // passed to the second pass grouping collector as the number of documents kept per group
+
+
+ public int getMaxDocsPerGroup() { // per-group document limit used by the paged executeSearch
+ return maxDocsPerGroup;
+ }
+
+ public void setMaxDocsPerGroup(int maxDocsPerGroup) { // the value is read at search time, so it applies to subsequent searches
+ this.maxDocsPerGroup = maxDocsPerGroup;
+ }
/**
* @param session
+ * (no longer a parameter, replaced by <code>toolProvider</code>)
+ * @param toolProvider provides the index reader, analyzer and query parser used by this search
+ * @param directorySelectClass the type passed to the tool provider to select
+ * the index reader, analyzer and query parser
*/
- public LuceneSearch(Session session, Class<? extends CdmBase> directorySelectClass) {
- this.session = session;
+ public LuceneSearch(ILuceneIndexToolProvider toolProvider, Class<? extends CdmBase> directorySelectClass) {
+ this.toolProvider = toolProvider;
this.directorySelectClass = directorySelectClass;
}
+ /**
+ * Like {@link #LuceneSearch(ILuceneIndexToolProvider, Class)} but with an explicit
+ * grouping field instead of the default <code>"id"</code>.
+ *
+ * @param toolProvider provides the index reader, analyzer and query parser used by this search
+ * @param groupByField name of the index field the hits are grouped by
+ * @param directorySelectClass the type passed to the tool provider to select
+ * the index reader, analyzer and query parser
+ */
+ public LuceneSearch(ILuceneIndexToolProvider toolProvider, String groupByField, Class<? extends CdmBase> directorySelectClass) {
+ this.toolProvider = toolProvider;
+ this.directorySelectClass = directorySelectClass;
+ this.groupByField = groupByField;
+ }
+
/**
* TODO the abstract base class DescriptionElementBase can not be used, so
- * we are using an arbitraty subclass to find the DirectoryProvider, future
+ * we are using an arbitrary subclass to find the DirectoryProvider, future
* versions of hibernate search my allow using abstract base classes see
- * http
- * ://stackoverflow.com/questions/492184/how-do-you-find-all-subclasses-of
- * -a-given-class-in-java
+ * <a href="http://stackoverflow.com/questions/492184/how-do-you-find-all-subclasses-of-a-given-class-in-java">http://stackoverflow.com/questions/492184</a>
+ * <p>
+ * Mapping applied: DescriptionElementBase to TextData, TaxonBase to Taxon,
+ * TaxonNameBase to NonViralName.
*
* @param type must not be null
* @return
+ * the concrete replacement type, or <code>type</code> itself when it is
+ * none of the mapped base types
*/
- protected Class<? extends CdmBase> pushAbstractBaseTypeDown(Class<? extends CdmBase> type) {
+ private Class<? extends CdmBase> pushAbstractBaseTypeDown(Class<? extends CdmBase> type) {
+ Class<? extends CdmBase> returnType = type;
if (type.equals(DescriptionElementBase.class)) {
- type = TextData.class;
+ returnType = TextData.class;
}
if (type.equals(TaxonBase.class)) {
- type = Taxon.class;
+ returnType = Taxon.class;
}
- return type;
+ if (type.equals(TaxonNameBase.class)) {
+ returnType = NonViralName.class;
+ }
+ return returnType;
}
protected LuceneSearch() {
/**
+ * Creates the searcher on first use and returns the same instance on later calls.
+ *
* @return
+ * the searcher on top of the index reader which the tool provider
+ * supplies for <code>directorySelectClass</code>
*/
- public Searcher getSearcher() {
+ public IndexSearcher getSearcher() {
if(searcher == null){
- searcher = new IndexSearcher(getIndexReader());
+ searcher = new IndexSearcher(toolProvider.getIndexReaderFor(directorySelectClass)); // NOTE(review): passes the raw directorySelectClass, the removed code went through getDirectorySelectClass() - confirm the tool provider resolves abstract base types itself
+// searcher.setDefaultFieldSortScoring(true, true);
}
return searcher;
}
/**
- * @return
- */
- public IndexReader getIndexReader() {
- SearchFactory searchFactory = Search.getFullTextSession(session).getSearchFactory();
-
- DirectoryProvider[] directoryProviders = searchFactory.getDirectoryProviders(getDirectorySelectClass());
- logger.info(directoryProviders[0].getDirectory().toString());
-
- ReaderProvider readerProvider = searchFactory.getReaderProvider();
- IndexReader reader = readerProvider.openReader(directoryProviders[0]);
- return reader;
- }
-
- /**
- * @return
- */
- public QueryParser getQueryParser() {
- Analyzer analyzer = getAnalyzer();
- QueryParser parser = new QueryParser("titleCache", analyzer);
- return parser;
- }
-
- /**
- * @return
+ * Convenience method which delegates the call to the available
+ * {@link ILuceneIndexToolProvider#getAnalyzerFor(Class)} method.
+ *
+ * @return the Analyzer suitable for the <code>directorySelectClass</code>
+ * of the LuceneSearch
*/
public Analyzer getAnalyzer() {
- SearchFactory searchFactory = Search.getFullTextSession(session).getSearchFactory();
- Analyzer analyzer = searchFactory.getAnalyzer(getDirectorySelectClass());
- return analyzer;
+ return toolProvider.getAnalyzerFor(directorySelectClass); // NOTE(review): raw directorySelectClass, the removed code used getDirectorySelectClass() - confirm
}
/**
+ * Parses the query string, stores the result via {@link #setQuery(Query)} and
+ * delegates to {@link #executeSearch(Integer, Integer)}. The type restriction is
+ * not a parameter of this method; the delegate expands the query with the
+ * <code>cdmTypeRestriction</code> field.
+ *
* @param luceneQueryString
- * @param clazz the type as additional filter criterion
* @param pageSize if the page size is null or in an invalid range it will be set to MAX_HITS_ALLOWED
* @param pageNumber a 0-based index of the page to return, will default to 0 if null or negative.
* @return
+ * the result of {@link #executeSearch(Integer, Integer)}
* @throws ParseException
* @throws IOException
*/
- public TopDocs executeSearch(String luceneQueryString, Integer pageSize, Integer pageNumber) throws ParseException, IOException {
+ public TopGroupsWithMaxScore executeSearch(String luceneQueryString, Integer pageSize, Integer pageNumber) throws ParseException, IOException {
Query luceneQuery = parse(luceneQueryString);
- this.query = luceneQuery;
-
+ setQuery(luceneQuery);
return executeSearch(pageSize, pageNumber);
}
*/
public Query parse(String luceneQueryString) throws ParseException {
logger.debug("luceneQueryString to be parsed: " + luceneQueryString);
- Query luceneQuery = getQueryParser().parse(luceneQueryString);
+ Query luceneQuery = toolProvider.getQueryParserFor(directorySelectClass).parse(luceneQueryString);
return luceneQuery;
}
/**
- * @param luceneQuery
- * @param clazz the type as additional filter criterion
+ * Runs a plain search without grouping collectors, using the expanded query
+ * and the configured filter.
+ *
+ * @param maxNoOfHits the hit limit passed to the searcher
+ * @return the <code>TopDocs</code> returned by the searcher
+ * @throws IOException propagated from the index search
+ */
+ public TopDocs executeSearch(int maxNoOfHits) throws IOException {
+ Query fullQuery = expandQuery();
+ logger.info("lucene query string to be parsed: " + fullQuery.toString()); // logs the already expanded query; nothing is parsed at this point
+ return getSearcher().search(fullQuery, filter, maxNoOfHits);
+
+ }
+ /**
+ * Runs the expanded query as a grouped search in two passes: the first pass
+ * collects the top groups of <code>groupByField</code>, the second pass collects
+ * up to <code>maxDocsPerGroup</code> documents for each of these groups while a
+ * third collector counts all groups. The groups in front of the page offset are
+ * cut off by the <code>TopGroupsWithMaxScore</code> constructor.
+ * <p>
+ * NOTE(review): <code>limit</code> is computed as
+ * <code>(pageNumber + 1) * pageSize - 1</code>, which is one group less than needed
+ * to fill the requested page (and 0 for the first page with page size 1) - confirm.
+ *
* @param pageSize if the page size is null or in an invalid range it will be set to MAX_HITS_ALLOWED
* @param pageNumber a 0-based index of the page to return, will default to 0 if null or negative.
* @return
+ * the groups of the requested page together with the max score, or
+ * <code>null</code> if the first pass returns <code>null</code> as top groups
* @throws ParseException
* @throws IOException
*/
- public TopDocs executeSearch(Integer pageSize, Integer pageNumber) throws ParseException, IOException {
+ public TopGroupsWithMaxScore executeSearch(Integer pageSize, Integer pageNumber) throws ParseException, IOException {
if(pageNumber == null || pageNumber < 0){
}
Query fullQuery = expandQuery();
-
logger.info("final query: " + fullQuery.toString());
- int start = pageNumber * pageSize;
+ int offset = pageNumber * pageSize;
int limit = (pageNumber + 1) * pageSize - 1 ;
+ logger.debug("start: " + offset + "; limit:" + limit);
- logger.debug("start: " + start + "; limit:" + limit);
-
- TopDocs topDocs;
+ // sorting
+ Sort groupSort = null;
+ Sort withinGroupSort = Sort.RELEVANCE;
if(sortFields != null && sortFields.length > 0){
- Sort sort = new Sort(sortFields);
- topDocs = getSearcher().search(fullQuery, null, limit, sort);
+ groupSort = new Sort(sortFields);
} else {
- topDocs = getSearcher().search(fullQuery, null, limit);
+ groupSort = Sort.RELEVANCE; // == SortField.FIELD_SCORE !!
}
+ // perform the search (needs two passes for grouping)
+ if(logger.isDebugEnabled()){
+ logger.debug("Grouping: sortFields=" + Arrays.toString(sortFields) + ", groupByField=" + groupByField +
+ ", groupSort=" + groupSort + ", withinGroupSort=" + withinGroupSort + ", limit=" + limit + ", maxDocsPerGroup="+ maxDocsPerGroup);
+ }
+ // - first pass
+ TermFirstPassGroupingCollector firstPassCollector = new TermFirstPassGroupingCollector(
+ groupByField, groupSort, limit); // NOTE(review): limit is the number of top groups requested, see the off-by-one note in the method comment
- //TODO when switched to Lucene 3.x which is included in hibernate 4.x
- // use TopDocCollector.topDocs(int start, int howMany);
- // since this method might be more memory save than our own implementation
- //
- // ALSO READ http://dev.e-taxonomy.eu/trac/ticket/3118 !!!
- //
-// TopDocs topDocs = hitCollector.topDocs();
- ScoreDoc[] scoreDocs = topDocs.scoreDocs;
-
- int docsAvailableInPage = Math.min(scoreDocs.length - start, pageSize);
- logger.debug("docsAvailableInPage:" + docsAvailableInPage);
+ getSearcher().search(fullQuery, filter , firstPassCollector);
+ Collection<SearchGroup<BytesRef>> topGroups = firstPassCollector.getTopGroups(0, true); // no offset here since we need the first item for the max score
- ScoreDoc[] pagedDocs = new ScoreDoc[docsAvailableInPage];
- for(int i = 0; i < docsAvailableInPage; i++){
- pagedDocs[i] = scoreDocs[start + i];
+ if (topGroups == null) {
+ return null;
+ }
+ // - flags for second pass
+ boolean getScores = false;
+ boolean getMaxScores = true;
+ if(groupSort.getSort()[0] != SortField.FIELD_SCORE){
+ getMaxScores = false;
+ // see inner class TopGroupsWithMaxScore
+ logger.error("Fist sort field must be SortField.FIELD_SCORE otherwise the max score value will not be correct! MaxScore calculation will be skipped");
+ }
+ boolean fillFields = true;
+ TermAllGroupsCollector allGroupsCollector = new TermAllGroupsCollector(groupByField);
+ TermSecondPassGroupingCollector secondPassCollector = new TermSecondPassGroupingCollector(
+ groupByField, topGroups, groupSort, withinGroupSort, maxDocsPerGroup , getScores,
+ getMaxScores, fillFields
+ );
+ getSearcher().search(fullQuery, filter, MultiCollector.wrap(secondPassCollector, allGroupsCollector));
+
+ TopGroups<BytesRef> groupsResult = secondPassCollector.getTopGroups(0); // no offset here since we need the first item for the max score
+
+ // get max score from very first result
+ float maxScore = groupsResult.groups[0].maxScore; // NOTE(review): assumes groupsResult is non-null and holds at least one group at this point - confirm
+ if(logger.isDebugEnabled()){
+ logger.debug("TopGroups: maxScore=" + maxScore + ", offset=" + offset +
+ ", totalGroupCount=" + allGroupsCollector.getGroupCount() +
+ ", totalGroupedHitCount=" + groupsResult.totalGroupedHitCount);
}
- TopDocs pagedTopDocs = new TopDocs(topDocs.totalHits, pagedDocs, topDocs.getMaxScore());
- //
- /////////////////////////////////////////////
+ TopGroupsWithMaxScore topGroupsWithMaxScore = new TopGroupsWithMaxScore(groupsResult,
+ offset, allGroupsCollector.getGroupCount(), maxScore);
- return pagedTopDocs;
+ return topGroupsWithMaxScore;
}
/**
- * @param clazz
+ * expands the query by adding a type restriction if the
+ * <code>cdmTypeRestriction</code> is not <code>NULL</code>
*/
protected Query expandQuery() {
- Query fullQuery;
- if(clazz != null){
- BooleanQuery filteredQuery = new BooleanQuery();
- BooleanQuery classFilter = new BooleanQuery();
-
- Term t = new Term(DocumentBuilder.CLASS_FIELDNAME, clazz.getName());
- TermQuery termQuery = new TermQuery(t);
-
- classFilter.setBoost(0);
- classFilter.add(termQuery, BooleanClause.Occur.SHOULD);
-
- filteredQuery.add(this.query, BooleanClause.Occur.MUST);
- filteredQuery.add(classFilter, BooleanClause.Occur.MUST);
-
- fullQuery = filteredQuery;
+ BooleanQuery fullQuery;
+ if(cdmTypeRestriction != null){
+ fullQuery = QueryFactory.addTypeRestriction(query, cdmTypeRestriction);
} else {
fullQuery = this.query;
}
}
public void setQuery(Query query) {
- this.query = query;
+ if( query instanceof BooleanQuery) { // already of the type the query field is declared with
+ this.query = (BooleanQuery)query;
+ } else { // any other query becomes the single MUST clause of a new BooleanQuery
+ Builder builder = new Builder();
+ this.query = builder.add(query, Occur.MUST).build();
+ }
}
- public Query getQuery() {
+ public BooleanQuery getQuery() { // setQuery() wraps other query types, so the stored query is a BooleanQuery (null while unset)
return query;
}
public void setHighlightFields(String[] textFieldNamesAsArray) {
this.highlightFields = textFieldNamesAsArray;
-
}
public String[] getHighlightFields() {
return this.highlightFields;
}
+ /**
+ * may become obsolete with lucene 4.x when the TopGroups has a field for maxScore.
+ *
+ * @author a.kohlbecker
+ * @date Oct 4, 2012
+ *
+ */
+ public class TopGroupsWithMaxScore{
+ public TopGroups<BytesRef> topGroups;
+ public float maxScore = Float.NaN;
+
+ TopGroupsWithMaxScore(TopGroups<BytesRef> topGroups, int offset, int totalGroupCount, float maxScore){
+ this.maxScore = maxScore;
+ TopGroups<BytesRef> newTopGroups;
+ if(offset > 0){
+ GroupDocs<BytesRef>[] newGroupDocs = new GroupDocs[topGroups.groups.length - offset];
+ for(int i = offset; i < topGroups.groups.length; i++){
+ newGroupDocs[i - offset] = topGroups.groups[i];
+ }
+ newTopGroups = new TopGroups<BytesRef>(
+ topGroups.groupSort,
+ topGroups.withinGroupSort,
+ topGroups.totalHitCount,
+ topGroups.totalGroupedHitCount,
+ newGroupDocs,
+ maxScore);
+ } else {
+ newTopGroups = topGroups;
+ }
+ this.topGroups = new TopGroups<BytesRef>(newTopGroups, totalGroupCount);
+ }
+
+ }
+
}