X-Git-Url: https://dev.e-taxonomy.eu/gitweb/cdmlib.git/blobdiff_plain/fd93c8b355df3320be4afacdb9460f370d36e149..652fe14a64be7473b50577594bc38c3173b72a53:/cdmlib-services/src/main/java/eu/etaxonomy/cdm/api/service/search/LuceneSearch.java diff --git a/cdmlib-services/src/main/java/eu/etaxonomy/cdm/api/service/search/LuceneSearch.java b/cdmlib-services/src/main/java/eu/etaxonomy/cdm/api/service/search/LuceneSearch.java index 6a93c0086b..a083460972 100644 --- a/cdmlib-services/src/main/java/eu/etaxonomy/cdm/api/service/search/LuceneSearch.java +++ b/cdmlib-services/src/main/java/eu/etaxonomy/cdm/api/service/search/LuceneSearch.java @@ -10,6 +10,7 @@ package eu.etaxonomy.cdm.api.service.search; import java.io.IOException; +import java.util.Collection; import org.apache.log4j.Logger; import org.apache.lucene.analysis.Analyzer; @@ -19,25 +20,31 @@ import org.apache.lucene.queryParser.ParseException; import org.apache.lucene.queryParser.QueryParser; import org.apache.lucene.search.BooleanClause; import org.apache.lucene.search.BooleanQuery; -import org.apache.lucene.search.Hits; +import org.apache.lucene.search.Filter; import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.MultiCollector; import org.apache.lucene.search.Query; -import org.apache.lucene.search.ScoreDoc; -import org.apache.lucene.search.Searcher; import org.apache.lucene.search.Sort; import org.apache.lucene.search.SortField; import org.apache.lucene.search.TermQuery; import org.apache.lucene.search.TopDocs; +import org.apache.lucene.search.grouping.GroupDocs; +import org.apache.lucene.search.grouping.SearchGroup; +import org.apache.lucene.search.grouping.TermAllGroupsCollector; +import org.apache.lucene.search.grouping.TermFirstPassGroupingCollector; +import org.apache.lucene.search.grouping.TermSecondPassGroupingCollector; +import org.apache.lucene.search.grouping.TopGroups; import org.hibernate.Session; +import org.hibernate.search.ProjectionConstants; import org.hibernate.search.Search; import org.hibernate.search.SearchFactory; -import org.hibernate.search.engine.DocumentBuilder; -import org.hibernate.search.reader.ReaderProvider; -import org.hibernate.search.store.DirectoryProvider; +import eu.etaxonomy.cdm.config.Configuration; import eu.etaxonomy.cdm.model.common.CdmBase; import eu.etaxonomy.cdm.model.description.DescriptionElementBase; import eu.etaxonomy.cdm.model.description.TextData; +import eu.etaxonomy.cdm.model.name.NonViralName; +import eu.etaxonomy.cdm.model.name.TaxonNameBase; import eu.etaxonomy.cdm.model.taxon.Taxon; import eu.etaxonomy.cdm.model.taxon.TaxonBase; @@ -49,16 +56,22 @@ import eu.etaxonomy.cdm.model.taxon.TaxonBase; */ public class LuceneSearch { + protected String groupByField = "id"; + + public final static String ID_FIELD = "id"; + public static final Logger logger = Logger.getLogger(LuceneSearch.class); protected Session session; - protected Searcher searcher; + protected IndexSearcher searcher; - private SortField[] sortFields; + protected SortField[] sortFields; private Class directorySelectClass; + private Filter filter = null; + protected Class getDirectorySelectClass() { return pushAbstractBaseTypeDown(directorySelectClass); } @@ -66,13 +79,27 @@ public class LuceneSearch { /** * classFilter */ - private Class clazz; + protected Class clazz; public Class getClazz() { return clazz; } + /** + * @return the filter + */ + public Filter getFilter() { + return filter; + } + + /** + * @param filter the filter to set + */ + public void setFilter(Filter filter) { + this.filter = filter; + } + /** * Sets the Class to use as filter criterion, in case the supplied Class equals the * directorySelectClass the Class is set to null @@ -103,6 +130,16 @@ public class LuceneSearch { protected String[] highlightFields = new String[0]; + private int maxDocsPerGroup = 10; + + + public int getMaxDocsPerGroup() { + return maxDocsPerGroup; + } + + public void setMaxDocsPerGroup(int maxDocsPerGroup) { + this.maxDocsPerGroup = maxDocsPerGroup; + } /** * @param session @@ -112,13 +149,20 @@ public class LuceneSearch { this.directorySelectClass = directorySelectClass; } + /** + * @param session + */ + public LuceneSearch(Session session, String groupByField, Class directorySelectClass) { + this.session = session; + this.directorySelectClass = directorySelectClass; + this.groupByField = groupByField; + } + /** * TODO the abstract base class DescriptionElementBase can not be used, so * we are using an arbitraty subclass to find the DirectoryProvider, future * versions of hibernate search my allow using abstract base classes see - * http - * ://stackoverflow.com/questions/492184/how-do-you-find-all-subclasses-of - * -a-given-class-in-java + * {@link http://stackoverflow.com/questions/492184/how-do-you-find-all-subclasses-of-a-given-class-in-java} * * @param type must not be null * @return @@ -130,6 +174,9 @@ public class LuceneSearch { if (type.equals(TaxonBase.class)) { type = Taxon.class; } + if (type.equals(TaxonNameBase.class)) { + type = NonViralName.class; + } return type; } @@ -140,9 +187,10 @@ public class LuceneSearch { /** * @return */ - public Searcher getSearcher() { + public IndexSearcher getSearcher() { if(searcher == null){ searcher = new IndexSearcher(getIndexReader()); + searcher.setDefaultFieldSortScoring(true, true); } return searcher; } @@ -152,12 +200,16 @@ public class LuceneSearch { */ public IndexReader getIndexReader() { SearchFactory searchFactory = Search.getFullTextSession(session).getSearchFactory(); + IndexReader reader = searchFactory.getIndexReaderAccessor().open(getDirectorySelectClass()); + return reader; + } - DirectoryProvider[] directoryProviders = searchFactory.getDirectoryProviders(getDirectorySelectClass()); - logger.info(directoryProviders[0].getDirectory().toString()); - - ReaderProvider readerProvider = searchFactory.getReaderProvider(); - IndexReader reader = readerProvider.openReader(directoryProviders[0]); + /** + * @return + */ + public IndexReader getIndexReaderFor(Class clazz) { + SearchFactory searchFactory = Search.getFullTextSession(session).getSearchFactory(); + IndexReader reader = searchFactory.getIndexReaderAccessor().open(pushAbstractBaseTypeDown(clazz)); return reader; } @@ -166,7 +218,7 @@ public class LuceneSearch { */ public QueryParser getQueryParser() { Analyzer analyzer = getAnalyzer(); - QueryParser parser = new QueryParser("titleCache", analyzer); + QueryParser parser = new QueryParser(Configuration.luceneVersion, "titleCache", analyzer); return parser; } @@ -188,7 +240,7 @@ public class LuceneSearch { * @throws ParseException * @throws IOException */ - public TopDocs executeSearch(String luceneQueryString, Integer pageSize, Integer pageNumber) throws ParseException, IOException { + public TopGroupsWithMaxScore executeSearch(String luceneQueryString, Integer pageSize, Integer pageNumber) throws ParseException, IOException { Query luceneQuery = parse(luceneQueryString); this.query = luceneQuery; @@ -208,15 +260,24 @@ public class LuceneSearch { } /** - * @param luceneQuery - * @param clazz the type as additional filter criterion + * @param maxNoOfHits + * @return + * @throws IOException + */ + public TopDocs executeSearch(int maxNoOfHits) throws IOException { + Query fullQuery = expandQuery(); + logger.info("lucene query string to be parsed: " + fullQuery.toString()); + return getSearcher().search(fullQuery, filter, maxNoOfHits); + + } + /** * @param pageSize if the page size is null or in an invalid range it will be set to MAX_HITS_ALLOWED * @param pageNumber a 0-based index of the page to return, will default to 0 if null or negative. * @return * @throws ParseException * @throws IOException */ - public TopDocs executeSearch(Integer pageSize, Integer pageNumber) throws ParseException, IOException { + public TopGroupsWithMaxScore executeSearch(Integer pageSize, Integer pageNumber) throws ParseException, IOException { if(pageNumber == null || pageNumber < 0){ @@ -228,44 +289,59 @@ public class LuceneSearch { } Query fullQuery = expandQuery(); - logger.info("final query: " + fullQuery.toString()); - int start = pageNumber * pageSize; + int offset = pageNumber * pageSize; int limit = (pageNumber + 1) * pageSize - 1 ; + logger.debug("start: " + offset + "; limit:" + limit); - logger.debug("start: " + start + "; limit:" + limit); - - TopDocs topDocs; + // sorting + Sort groupSort = null; + Sort withinGroupSort = Sort.RELEVANCE; if(sortFields != null && sortFields.length > 0){ - Sort sort = new Sort(sortFields); - topDocs = getSearcher().search(fullQuery, null, limit, sort); + if(sortFields[0] != SortField.FIELD_SCORE){ + throw new RuntimeException("Fist sort field must be SortField.FIELD_SCORE"); + } + groupSort = new Sort(sortFields); } else { - topDocs = getSearcher().search(fullQuery, null, limit); + groupSort = Sort.RELEVANCE; // == SortField.FIELD_SCORE !! } + // perform the search (needs two passes for grouping) + if(logger.isDebugEnabled()){ + logger.debug("Grouping: sortFields=" + sortFields + ", groupByField=" + groupByField + + ", groupSort=" + groupSort + ", withinGroupSort=" + withinGroupSort + ", limit=" + limit + ", maxDocsPerGroup="+ maxDocsPerGroup); + } + // - first pass + TermFirstPassGroupingCollector firstPassCollector = new TermFirstPassGroupingCollector(groupByField, withinGroupSort, limit); - //TODO when switched to Lucene 3.x which is included in hibernate 4.x - // use TopDocCollector.topDocs(int start, int howMany); - // since this method might be more memory save than our own implementation - // - // ALSO READ http://dev.e-taxonomy.eu/trac/ticket/3118 !!! - // -// TopDocs topDocs = hitCollector.topDocs(); - ScoreDoc[] scoreDocs = topDocs.scoreDocs; - - int docsAvailableInPage = Math.min(scoreDocs.length - start, pageSize); - logger.debug("docsAvailableInPage:" + docsAvailableInPage); + getSearcher().search(fullQuery, filter , firstPassCollector); + Collection> topGroups = firstPassCollector.getTopGroups(0, true); // no offset here since we need the first item for the max score - ScoreDoc[] pagedDocs = new ScoreDoc[docsAvailableInPage]; - for(int i = 0; i < docsAvailableInPage; i++){ - pagedDocs[i] = scoreDocs[start + i]; + if (topGroups == null) { + return null; + } + // - second pass + boolean getScores = true; + boolean getMaxScores = true; + boolean fillFields = true; + TermAllGroupsCollector allGroupsCollector = new TermAllGroupsCollector(groupByField); + TermSecondPassGroupingCollector secondPassCollector = new TermSecondPassGroupingCollector( + groupByField, topGroups, groupSort, withinGroupSort, maxDocsPerGroup , getScores, getMaxScores, fillFields + ); + getSearcher().search(fullQuery, filter, MultiCollector.wrap(secondPassCollector, allGroupsCollector)); + + TopGroups groupsResult = secondPassCollector.getTopGroups(0); // no offset here since we need the first item for the max score + + // get max score from very first result + float maxScore = groupsResult.groups[0].maxScore; + if(logger.isDebugEnabled()){ + logger.debug("TopGroups: maxScore=" + maxScore + ", offset=" + offset + + ", totalGroupCount=" + allGroupsCollector.getGroupCount() + ", totalGroupedHitCount=" + groupsResult.totalGroupedHitCount); } - TopDocs pagedTopDocs = new TopDocs(topDocs.totalHits, pagedDocs, topDocs.getMaxScore()); - // - ///////////////////////////////////////////// + TopGroupsWithMaxScore topGroupsWithMaxScore = new TopGroupsWithMaxScore(groupsResult, offset, allGroupsCollector.getGroupCount(), maxScore); - return pagedTopDocs; + return topGroupsWithMaxScore; } /** @@ -277,7 +353,7 @@ public class LuceneSearch { BooleanQuery filteredQuery = new BooleanQuery(); BooleanQuery classFilter = new BooleanQuery(); - Term t = new Term(DocumentBuilder.CLASS_FIELDNAME, clazz.getName()); + Term t = new Term(ProjectionConstants.OBJECT_CLASS, clazz.getName()); TermQuery termQuery = new TermQuery(t); classFilter.setBoost(0); @@ -316,11 +392,43 @@ public class LuceneSearch { public void setHighlightFields(String[] textFieldNamesAsArray) { this.highlightFields = textFieldNamesAsArray; - } public String[] getHighlightFields() { return this.highlightFields; } + /** + * may become obsolete with lucene 4.x when the TopGroups has a field for maxScore. + * + * @author a.kohlbecker + * @date Oct 4, 2012 + * + */ + public class TopGroupsWithMaxScore{ + public TopGroups topGroups; + public float maxScore = Float.NaN; + + TopGroupsWithMaxScore(TopGroups topGroups, int offset, int totalGroupCount, float maxScore){ + this.maxScore = maxScore; + TopGroups newTopGroups; + if(offset > 0){ + GroupDocs[] newGroupDocs = new GroupDocs[topGroups.groups.length - offset]; + for(int i = offset; i < topGroups.groups.length; i++){ + newGroupDocs[i - offset] = topGroups.groups[i]; + } + newTopGroups = new TopGroups( + topGroups.groupSort, + topGroups.withinGroupSort, + topGroups.totalHitCount, + topGroups.totalGroupedHitCount, + newGroupDocs); + } else { + newTopGroups = topGroups; + } + this.topGroups = new TopGroups(newTopGroups, totalGroupCount); + } + + } + }