X-Git-Url: https://dev.e-taxonomy.eu/gitweb/cdmlib.git/blobdiff_plain/fd93c8b355df3320be4afacdb9460f370d36e149..f6b282fc9660b250f81879ec6b255517f13ec229:/cdmlib-services/src/main/java/eu/etaxonomy/cdm/api/service/search/LuceneSearch.java diff --git a/cdmlib-services/src/main/java/eu/etaxonomy/cdm/api/service/search/LuceneSearch.java b/cdmlib-services/src/main/java/eu/etaxonomy/cdm/api/service/search/LuceneSearch.java index 6a93c0086b..57d338a00b 100644 --- a/cdmlib-services/src/main/java/eu/etaxonomy/cdm/api/service/search/LuceneSearch.java +++ b/cdmlib-services/src/main/java/eu/etaxonomy/cdm/api/service/search/LuceneSearch.java @@ -10,34 +10,35 @@ package eu.etaxonomy.cdm.api.service.search; import java.io.IOException; +import java.util.Arrays; +import java.util.Collection; import org.apache.log4j.Logger; import org.apache.lucene.analysis.Analyzer; -import org.apache.lucene.index.IndexReader; -import org.apache.lucene.index.Term; -import org.apache.lucene.queryParser.ParseException; -import org.apache.lucene.queryParser.QueryParser; -import org.apache.lucene.search.BooleanClause; +import org.apache.lucene.queryparser.classic.ParseException; +import org.apache.lucene.search.BooleanClause.Occur; import org.apache.lucene.search.BooleanQuery; -import org.apache.lucene.search.Hits; +import org.apache.lucene.search.BooleanQuery.Builder; +import org.apache.lucene.search.Filter; import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.MultiCollector; import org.apache.lucene.search.Query; -import org.apache.lucene.search.ScoreDoc; -import org.apache.lucene.search.Searcher; import org.apache.lucene.search.Sort; import org.apache.lucene.search.SortField; -import org.apache.lucene.search.TermQuery; import org.apache.lucene.search.TopDocs; -import org.hibernate.Session; -import org.hibernate.search.Search; -import org.hibernate.search.SearchFactory; -import org.hibernate.search.engine.DocumentBuilder; -import org.hibernate.search.reader.ReaderProvider; -import org.hibernate.search.store.DirectoryProvider; +import org.apache.lucene.search.grouping.GroupDocs; +import org.apache.lucene.search.grouping.SearchGroup; +import org.apache.lucene.search.grouping.TopGroups; +import org.apache.lucene.search.grouping.term.TermAllGroupsCollector; +import org.apache.lucene.search.grouping.term.TermFirstPassGroupingCollector; +import org.apache.lucene.search.grouping.term.TermSecondPassGroupingCollector; +import org.apache.lucene.util.BytesRef; import eu.etaxonomy.cdm.model.common.CdmBase; import eu.etaxonomy.cdm.model.description.DescriptionElementBase; import eu.etaxonomy.cdm.model.description.TextData; +import eu.etaxonomy.cdm.model.name.NonViralName; +import eu.etaxonomy.cdm.model.name.TaxonNameBase; import eu.etaxonomy.cdm.model.taxon.Taxon; import eu.etaxonomy.cdm.model.taxon.TaxonBase; @@ -49,16 +50,22 @@ import eu.etaxonomy.cdm.model.taxon.TaxonBase; */ public class LuceneSearch { + protected String groupByField = "id"; + + public final static String ID_FIELD = "id"; + public static final Logger logger = Logger.getLogger(LuceneSearch.class); - protected Session session; + protected ILuceneIndexToolProvider toolProvider; - protected Searcher searcher; + protected IndexSearcher searcher; - private SortField[] sortFields; + protected SortField[] sortFields; private Class directorySelectClass; + private Filter filter = null; + protected Class getDirectorySelectClass() { return pushAbstractBaseTypeDown(directorySelectClass); } @@ -66,11 +73,25 @@ public class LuceneSearch { /** * classFilter */ - private Class clazz; + protected Class cdmTypeRestriction; - public Class getClazz() { - return clazz; + public Class getCdmTypRestriction() { + return cdmTypeRestriction; + } + + /** + * @return the filter + */ + public Filter getFilter() { + return filter; + } + + /** + * @param filter the filter to set + */ + public void setFilter(Filter filter) { + this.filter = filter; } /** @@ -78,7 +99,7 @@ public class LuceneSearch { * directorySelectClass the Class is set to null * @param clazz */ - public void setClazz(Class clazz) { + public void setCdmTypRestriction(Class clazz) { /* * NOTE: @@ -88,7 +109,7 @@ public class LuceneSearch { if(clazz != null && clazz.equals(directorySelectClass)){ clazz = null; } - this.clazz = clazz; + this.cdmTypeRestriction = clazz; } /** @@ -99,38 +120,59 @@ public class LuceneSearch { */ public final int MAX_HITS_ALLOWED = 10000; - protected Query query; + protected BooleanQuery query; protected String[] highlightFields = new String[0]; + private int maxDocsPerGroup = 10; + + + public int getMaxDocsPerGroup() { + return maxDocsPerGroup; + } + + public void setMaxDocsPerGroup(int maxDocsPerGroup) { + this.maxDocsPerGroup = maxDocsPerGroup; + } /** * @param session */ - public LuceneSearch(Session session, Class directorySelectClass) { - this.session = session; + public LuceneSearch(ILuceneIndexToolProvider toolProvider, Class directorySelectClass) { + this.toolProvider = toolProvider; this.directorySelectClass = directorySelectClass; } + /** + * @param session + */ + public LuceneSearch(ILuceneIndexToolProvider toolProvider, String groupByField, Class directorySelectClass) { + this.toolProvider = toolProvider; + this.directorySelectClass = directorySelectClass; + this.groupByField = groupByField; + } + /** * TODO the abstract base class DescriptionElementBase can not be used, so - * we are using an arbitraty subclass to find the DirectoryProvider, future + * we are using an arbitrary subclass to find the DirectoryProvider, future * versions of hibernate search my allow using abstract base classes see - * http - * ://stackoverflow.com/questions/492184/how-do-you-find-all-subclasses-of - * -a-given-class-in-java + * {@link http://stackoverflow.com/questions/492184/how-do-you-find-all-subclasses-of-a-given-class-in-java} * * @param type must not be null * @return */ - protected Class pushAbstractBaseTypeDown(Class type) { + private Class pushAbstractBaseTypeDown(Class type) { + Class returnType = type; if (type.equals(DescriptionElementBase.class)) { - type = TextData.class; + returnType = TextData.class; } if (type.equals(TaxonBase.class)) { - type = Taxon.class; + returnType = Taxon.class; } - return type; + if (type.equals(TaxonNameBase.class)) { + returnType = NonViralName.class; + } + return returnType; } protected LuceneSearch() { @@ -140,59 +182,38 @@ public class LuceneSearch { /** * @return */ - public Searcher getSearcher() { + public IndexSearcher getSearcher() { if(searcher == null){ - searcher = new IndexSearcher(getIndexReader()); + searcher = new IndexSearcher(toolProvider.getIndexReaderFor(directorySelectClass)); +// searcher.setDefaultFieldSortScoring(true, true); } return searcher; } /** - * @return - */ - public IndexReader getIndexReader() { - SearchFactory searchFactory = Search.getFullTextSession(session).getSearchFactory(); - - DirectoryProvider[] directoryProviders = searchFactory.getDirectoryProviders(getDirectorySelectClass()); - logger.info(directoryProviders[0].getDirectory().toString()); - - ReaderProvider readerProvider = searchFactory.getReaderProvider(); - IndexReader reader = readerProvider.openReader(directoryProviders[0]); - return reader; - } - - /** - * @return - */ - public QueryParser getQueryParser() { - Analyzer analyzer = getAnalyzer(); - QueryParser parser = new QueryParser("titleCache", analyzer); - return parser; - } - - /** - * @return + * Convenience method which delegated the call to the available + * {@link ILuceneIndexToolProvider#getAnalyzerFor(Class)} method. + * + * @return the Analyzer suitable for the directorySelectClass + * of the LuceneSearch */ public Analyzer getAnalyzer() { - SearchFactory searchFactory = Search.getFullTextSession(session).getSearchFactory(); - Analyzer analyzer = searchFactory.getAnalyzer(getDirectorySelectClass()); - return analyzer; + return toolProvider.getAnalyzerFor(directorySelectClass); } /** * @param luceneQueryString - * @param clazz the type as additional filter criterion + * @param cdmTypeRestriction the type as additional filter criterion * @param pageSize if the page size is null or in an invalid range it will be set to MAX_HITS_ALLOWED * @param pageNumber a 0-based index of the page to return, will default to 0 if null or negative. * @return * @throws ParseException * @throws IOException */ - public TopDocs executeSearch(String luceneQueryString, Integer pageSize, Integer pageNumber) throws ParseException, IOException { + public TopGroupsWithMaxScore executeSearch(String luceneQueryString, Integer pageSize, Integer pageNumber) throws ParseException, IOException { Query luceneQuery = parse(luceneQueryString); - this.query = luceneQuery; - + setQuery(luceneQuery); return executeSearch(pageSize, pageNumber); } @@ -203,20 +224,29 @@ public class LuceneSearch { */ public Query parse(String luceneQueryString) throws ParseException { logger.debug("luceneQueryString to be parsed: " + luceneQueryString); - Query luceneQuery = getQueryParser().parse(luceneQueryString); + Query luceneQuery = toolProvider.getQueryParserFor(directorySelectClass).parse(luceneQueryString); return luceneQuery; } /** - * @param luceneQuery - * @param clazz the type as additional filter criterion + * @param maxNoOfHits + * @return + * @throws IOException + */ + public TopDocs executeSearch(int maxNoOfHits) throws IOException { + Query fullQuery = expandQuery(); + logger.info("lucene query string to be parsed: " + fullQuery.toString()); + return getSearcher().search(fullQuery, filter, maxNoOfHits); + + } + /** * @param pageSize if the page size is null or in an invalid range it will be set to MAX_HITS_ALLOWED * @param pageNumber a 0-based index of the page to return, will default to 0 if null or negative. * @return * @throws ParseException * @throws IOException */ - public TopDocs executeSearch(Integer pageSize, Integer pageNumber) throws ParseException, IOException { + public TopGroupsWithMaxScore executeSearch(Integer pageSize, Integer pageNumber) throws ParseException, IOException { if(pageNumber == null || pageNumber < 0){ @@ -228,65 +258,75 @@ public class LuceneSearch { } Query fullQuery = expandQuery(); - logger.info("final query: " + fullQuery.toString()); - int start = pageNumber * pageSize; + int offset = pageNumber * pageSize; int limit = (pageNumber + 1) * pageSize - 1 ; + logger.debug("start: " + offset + "; limit:" + limit); - logger.debug("start: " + start + "; limit:" + limit); - - TopDocs topDocs; + // sorting + Sort groupSort = null; + Sort withinGroupSort = Sort.RELEVANCE; if(sortFields != null && sortFields.length > 0){ - Sort sort = new Sort(sortFields); - topDocs = getSearcher().search(fullQuery, null, limit, sort); + groupSort = new Sort(sortFields); } else { - topDocs = getSearcher().search(fullQuery, null, limit); + groupSort = Sort.RELEVANCE; // == SortField.FIELD_SCORE !! } + // perform the search (needs two passes for grouping) + if(logger.isDebugEnabled()){ + logger.debug("Grouping: sortFields=" + Arrays.toString(sortFields) + ", groupByField=" + groupByField + + ", groupSort=" + groupSort + ", withinGroupSort=" + withinGroupSort + ", limit=" + limit + ", maxDocsPerGroup="+ maxDocsPerGroup); + } + // - first pass + TermFirstPassGroupingCollector firstPassCollector = new TermFirstPassGroupingCollector( + groupByField, groupSort, limit); - //TODO when switched to Lucene 3.x which is included in hibernate 4.x - // use TopDocCollector.topDocs(int start, int howMany); - // since this method might be more memory save than our own implementation - // - // ALSO READ http://dev.e-taxonomy.eu/trac/ticket/3118 !!! - // -// TopDocs topDocs = hitCollector.topDocs(); - ScoreDoc[] scoreDocs = topDocs.scoreDocs; - - int docsAvailableInPage = Math.min(scoreDocs.length - start, pageSize); - logger.debug("docsAvailableInPage:" + docsAvailableInPage); + getSearcher().search(fullQuery, filter , firstPassCollector); + Collection> topGroups = firstPassCollector.getTopGroups(0, true); // no offset here since we need the first item for the max score - ScoreDoc[] pagedDocs = new ScoreDoc[docsAvailableInPage]; - for(int i = 0; i < docsAvailableInPage; i++){ - pagedDocs[i] = scoreDocs[start + i]; + if (topGroups == null) { + return null; + } + // - flags for second pass + boolean getScores = false; + boolean getMaxScores = true; + if(groupSort.getSort()[0] != SortField.FIELD_SCORE){ + getMaxScores = false; + // see inner class TopGroupsWithMaxScore + logger.error("Fist sort field must be SortField.FIELD_SCORE otherwise the max score value will not be correct! MaxScore calculation will be skipped"); + } + boolean fillFields = true; + TermAllGroupsCollector allGroupsCollector = new TermAllGroupsCollector(groupByField); + TermSecondPassGroupingCollector secondPassCollector = new TermSecondPassGroupingCollector( + groupByField, topGroups, groupSort, withinGroupSort, maxDocsPerGroup , getScores, + getMaxScores, fillFields + ); + getSearcher().search(fullQuery, filter, MultiCollector.wrap(secondPassCollector, allGroupsCollector)); + + TopGroups groupsResult = secondPassCollector.getTopGroups(0); // no offset here since we need the first item for the max score + + // get max score from very first result + float maxScore = groupsResult.groups[0].maxScore; + if(logger.isDebugEnabled()){ + logger.debug("TopGroups: maxScore=" + maxScore + ", offset=" + offset + + ", totalGroupCount=" + allGroupsCollector.getGroupCount() + + ", totalGroupedHitCount=" + groupsResult.totalGroupedHitCount); } - TopDocs pagedTopDocs = new TopDocs(topDocs.totalHits, pagedDocs, topDocs.getMaxScore()); - // - ///////////////////////////////////////////// + TopGroupsWithMaxScore topGroupsWithMaxScore = new TopGroupsWithMaxScore(groupsResult, + offset, allGroupsCollector.getGroupCount(), maxScore); - return pagedTopDocs; + return topGroupsWithMaxScore; } /** - * @param clazz + * expands the query by adding a type restriction if the + * cdmTypeRestriction is not NULL */ protected Query expandQuery() { - Query fullQuery; - if(clazz != null){ - BooleanQuery filteredQuery = new BooleanQuery(); - BooleanQuery classFilter = new BooleanQuery(); - - Term t = new Term(DocumentBuilder.CLASS_FIELDNAME, clazz.getName()); - TermQuery termQuery = new TermQuery(t); - - classFilter.setBoost(0); - classFilter.add(termQuery, BooleanClause.Occur.SHOULD); - - filteredQuery.add(this.query, BooleanClause.Occur.MUST); - filteredQuery.add(classFilter, BooleanClause.Occur.MUST); - - fullQuery = filteredQuery; + BooleanQuery fullQuery; + if(cdmTypeRestriction != null){ + fullQuery = QueryFactory.addTypeRestriction(query, cdmTypeRestriction); } else { fullQuery = this.query; } @@ -294,10 +334,15 @@ public class LuceneSearch { } public void setQuery(Query query) { - this.query = query; + if( query instanceof BooleanQuery) { + this.query = (BooleanQuery)query; + } else { + Builder builder = new Builder(); + this.query = builder.add(query, Occur.MUST).build(); + } } - public Query getQuery() { + public BooleanQuery getQuery() { return query; } @@ -316,11 +361,44 @@ public class LuceneSearch { public void setHighlightFields(String[] textFieldNamesAsArray) { this.highlightFields = textFieldNamesAsArray; - } public String[] getHighlightFields() { return this.highlightFields; } + /** + * may become obsolete with lucene 4.x when the TopGroups has a field for maxScore. + * + * @author a.kohlbecker + * @date Oct 4, 2012 + * + */ + public class TopGroupsWithMaxScore{ + public TopGroups topGroups; + public float maxScore = Float.NaN; + + TopGroupsWithMaxScore(TopGroups topGroups, int offset, int totalGroupCount, float maxScore){ + this.maxScore = maxScore; + TopGroups newTopGroups; + if(offset > 0){ + GroupDocs[] newGroupDocs = new GroupDocs[topGroups.groups.length - offset]; + for(int i = offset; i < topGroups.groups.length; i++){ + newGroupDocs[i - offset] = topGroups.groups[i]; + } + newTopGroups = new TopGroups( + topGroups.groupSort, + topGroups.withinGroupSort, + topGroups.totalHitCount, + topGroups.totalGroupedHitCount, + newGroupDocs, + maxScore); + } else { + newTopGroups = topGroups; + } + this.topGroups = new TopGroups(newTopGroups, totalGroupCount); + } + + } + }