first implementation: existing tests all Ok, grouping does not yet really work, index...
[cdmlib.git] / cdmlib-services / src / main / java / eu / etaxonomy / cdm / api / service / search / SearchResultBuilder.java
1 // $Id$
2 /**
3 * Copyright (C) 2012 EDIT
4 * European Distributed Institute of Taxonomy
5 * http://www.e-taxonomy.eu
6 *
7 * The contents of this file are subject to the Mozilla Public License Version 1.1
8 * See LICENSE.TXT at the top of this package for the full license terms.
9 */
10 package eu.etaxonomy.cdm.api.service.search;
11
12 import java.io.IOException;
13 import java.util.ArrayList;
14 import java.util.Arrays;
15 import java.util.HashMap;
16 import java.util.List;
17 import java.util.Map;
18
19 import org.apache.commons.lang.ArrayUtils;
20 import org.apache.commons.lang.StringUtils;
21 import org.apache.log4j.Logger;
22 import org.apache.lucene.document.Document;
23 import org.apache.lucene.index.CorruptIndexException;
24 import org.apache.lucene.search.MultiTermQuery;
25 import org.apache.lucene.search.Query;
26 import org.apache.lucene.search.ScoreDoc;
27 import org.apache.lucene.search.TopDocs;
28 import org.apache.lucene.search.WildcardQuery;
29 import org.apache.lucene.search.grouping.GroupDocs;
30 import org.apache.lucene.search.grouping.TopGroups;
31 import org.hibernate.search.engine.DocumentBuilder;
32
33 import eu.etaxonomy.cdm.model.CdmBaseType;
34 import eu.etaxonomy.cdm.model.common.CdmBase;
35 import eu.etaxonomy.cdm.persistence.dao.common.ICdmEntityDao;
36
37 /**
38 * @author Andreas Kohlbecker
39 * @date Jan 6, 2012
40 *
41 */
42 public class SearchResultBuilder implements ISearchResultBuilder {
43
44 public static final Logger logger = Logger.getLogger(SearchResultBuilder.class);
45
46 /* (non-Javadoc)
47 * @see eu.etaxonomy.cdm.api.service.search.ISearchResultBuilder#createResultSetFromIds(eu.etaxonomy.cdm.search.LuceneSearch, org.apache.lucene.search.TopDocs, eu.etaxonomy.cdm.persistence.dao.common.ICdmEntityDao, java.lang.String)
48 */
49 private Query query;
50 /**
51 * fragmentNumber - max number of sentence fragments to return
52 */
53 private int fragmentNumber = 5;
54 /**
55 * fragmentSize - the max number of characters for each fragment
56 */
57 private int fragmentSize = 100;
58 private LuceneSearch luceneSearch;
59
60 /**
61 * Use this constructor if you do not wish to retrieve highlighted terms found in the best sections of a text.
62 * @param luceneSearch
63 */
64 public SearchResultBuilder(LuceneSearch luceneSearch){
65 this.luceneSearch = luceneSearch;
66 }
67
68 /**
69 * @param luceneSearch
70 * @param query the Query will be used to highlight matching fragments if the <code>highlightFields</code> property is supplied to
71 * {@link #createResultSet(TopDocs, String[], ICdmEntityDao, String, List)}
72 */
73 public SearchResultBuilder(LuceneSearch luceneSearch, Query query){
74 this.luceneSearch = luceneSearch;
75 this.query = query;
76 }
77
78 /**
79 * {@inheritDoc}
80 *
81 * <h3>NOTE:</h3> All {@link MultiTermQuery} like {@link WildcardQuery} are
82 * constant score by default since Lucene 2.9, you can change that back to
83 * scoring mode: <code>WildcardQuery.setRewriteMethod(MultiTermQuery.SCORING_BOOLEAN_QUERY_REWRITE)</code>
84 * This slows down the query immense or throws TooManyClauses exceptions if
85 * too many terms match the wildcard.
86 */
87 public <T extends CdmBase> List<SearchResult<T>> createResultSet(TopGroups topGroupsResultSet,
88 String[] highlightFields, ICdmEntityDao<T> dao, Map<CdmBaseType, String> idFields, List<String> propertyPaths) throws CorruptIndexException, IOException {
89
90 List<SearchResult<T>> searchResults = new ArrayList<SearchResult<T>>();
91
92 if(topGroupsResultSet == null){
93 return searchResults;
94 }
95
96 SearchResultHighligther highlighter = null;
97 if(highlightFields != null && highlightFields.length > 0){
98 highlighter = new SearchResultHighligther();
99 }
100
101 for (GroupDocs groupDoc : topGroupsResultSet.groups) {
102
103 String cdmEntityId = null;
104 SearchResult<T> searchResult = new SearchResult<T>();
105 for(ScoreDoc scoreDoc : groupDoc.scoreDocs) {
106 //FIXME should we group on taxon id ?????
107 Document document = luceneSearch.getSearcher().doc(scoreDoc.doc);
108 searchResult.addDoc(document);
109
110 if(cdmEntityId == null){
111 // IMPORTANT: here we assume that all documents refer to the same cdm entity
112 cdmEntityId = findId(idFields, document);
113 }
114 }
115
116 // set score values
117 if(isNumber(groupDoc.maxScore)){
118 searchResult.setScore(groupDoc.maxScore);
119 }
120 //FIXME get max score
121 // if(isNumber(topGroupsResultSet.getMaxScore())){
122 // searchResult.setMaxScore(topGroupsResultSet.getMaxScore());
123 // }
124
125 //TODO use findByUuid(List<UUID> uuids, List<Criterion> criteria, List<String> propertyPaths)
126 // instead or even better a similar findById(List<Integer> ids) however this is not yet implemented
127 if(cdmEntityId != null){
128 T entity = dao.load(Integer.valueOf(cdmEntityId), propertyPaths);
129 searchResult.setEntity(entity);
130 }
131
132 // add highlight fragments
133 if(highlighter != null){
134 Map<String, String[]> fieldFragmentMap = null;
135 for(Document doc: searchResult.getDocs()){
136 fieldFragmentMap = merge(fieldFragmentMap, highlighter.getFragmentsWithHighlightedTerms(luceneSearch.getAnalyzer(), query, highlightFields, doc, fragmentNumber, fragmentSize));
137 }
138 searchResult.setFieldHighlightMap(fieldFragmentMap);
139 }
140
141 // finally add the final result to the list
142 searchResults.add(searchResult);
143 }
144
145 return searchResults;
146 }
147
148 /**
149 * @param base
150 * @param add
151 * @return
152 */
153 private Map<String, String[]> merge(Map<String, String[]> base, Map<String, String[]> add) {
154 if(base == null){
155 return add;
156 } else {
157 for(String key : add.keySet()) {
158 if (base.containsKey(key)){
159 base.put(key, (String[]) ArrayUtils.addAll(base.get(key), add.get(key)));
160 } else {
161 base.put(key, add.get(key));
162 }
163 }
164 return base;
165 }
166 }
167
168 /**
169 * find the entity id
170 *
171 * @param idFields
172 * @param doc
173 * @return
174 */
175 private String findId(Map<CdmBaseType,String> idFieldMap, Document doc) {
176
177 String docClassName = doc.getValues(DocumentBuilder.CLASS_FIELDNAME)[0];
178
179 String id = null;
180 for(CdmBaseType baseType : idFieldMap.keySet()){
181 if(baseType.getSubClassNames().contains(docClassName)){
182 String[] idStrings = doc.getValues(idFieldMap.get(baseType));
183 if(idStrings.length > 0 && StringUtils.isNotBlank(idStrings[0])){
184 id = idStrings[0];
185 break;
186 }
187 }
188 }
189 if(id == null){
190 throw new RuntimeException("No id field name given for " + docClassName);
191 }
192 return id;
193 }
194
195 /**
196 * @param number
197 * @return
198 */
199 private boolean isNumber(Float number) {
200 return !Double.isNaN(number) && !Double.isInfinite(number);
201 }
202
203 }