#2961 (test for highlighting multiple words as phrase FAILS)
[cdmlib.git] / cdmlib-services / src / main / java / eu / etaxonomy / cdm / api / service / search / SearchResultHighligther.java
1 // $Id$
2 /**
3 * Copyright (C) 2012 EDIT
4 * European Distributed Institute of Taxonomy
5 * http://www.e-taxonomy.eu
6 *
7 * The contents of this file are subject to the Mozilla Public License Version 1.1
8 * See LICENSE.TXT at the top of this package for the full license terms.
9 */
10 package eu.etaxonomy.cdm.api.service.search;
11
12
13 import java.io.IOException;
14 import java.util.HashMap;
15 import java.util.Map;
16
17 import org.apache.commons.lang.StringUtils;
18 import org.apache.log4j.Logger;
19 import org.apache.lucene.analysis.Analyzer;
20 import org.apache.lucene.analysis.CachingTokenFilter;
21 import org.apache.lucene.analysis.TokenStream;
22 import org.apache.lucene.document.Document;
23 import org.apache.lucene.index.CorruptIndexException;
24 import org.apache.lucene.index.TermPositionVector;
25 import org.apache.lucene.search.Query;
26 import org.apache.lucene.search.Searcher;
27 import org.apache.lucene.search.highlight.Fragmenter;
28 import org.apache.lucene.search.highlight.Highlighter;
29 import org.apache.lucene.search.highlight.QueryScorer;
30 import org.apache.lucene.search.highlight.Scorer;
31 import org.apache.lucene.search.highlight.SimpleFragmenter;
32 import org.apache.lucene.search.highlight.SimpleSpanFragmenter;
33 import org.apache.lucene.search.highlight.SpanScorer;
34 import org.apache.lucene.search.highlight.TokenSources;
35
36 /**
37 * This SearchResultHighligther is using the QueryScorer by default even if the SpanScorer is meant to be the new default scorer in Lucene,
38 * see https://issues.apache.org/jira/browse/LUCENE-1685 and https://issues.apache.org/jira/browse/LUCENE-2013.
39 * The SpanScorer was causing problems with phrase queries (see https://dev.e-taxonomy.eu/trac/ticket/2961)
40 * whereas the QueryScorer was returning good results.
41 * <p>
42 * This SearchResultHighligther can be switched to use the SpanScorer: {@link #setUseSpanScorer(boolean)}
43 * <p>
44 * Based on work of Nicholas Hrycan
45 * see http://code.google.com/p/hrycan-blog/source/browse/trunk/lucene-highlight/src/com/hrycan/search/HighlighterUtil.java
46 *
47 *
48 * @author Andreas Kohlbecker
49 *
50 */
51 public class SearchResultHighligther {
52
53 public static final Logger logger = Logger.getLogger(SearchResultHighligther.class);
54
55 private boolean useSpanScorer = true;
56
57 public boolean isUseSpanScorer() {
58 return useSpanScorer;
59 }
60
61 public void setUseSpanScorer(boolean useSpanScorer) {
62 this.useSpanScorer = useSpanScorer;
63 }
64
65 public Map<String,String[]> getFragmentsWithHighlightedTerms(Analyzer analyzer, Query query, String[] fieldNames, Document doc, int fragmentNumber, int fragmentSize){
66
67 Map<String,String[]> fieldHighlightMap = new HashMap<String, String[]>();
68 String[] values;
69 String fieldContents;
70 String[] fragments;
71
72 try {
73 for(String fieldName : fieldNames){
74 values = doc.getValues(fieldName);
75 if(values.length == 0){
76 continue;
77 }
78 fieldContents = StringUtils.join(values, ' ');
79 fragments = getFragmentsWithHighlightedTerms(analyzer, query, fieldName, fieldContents, fragmentNumber, fragmentSize);
80 fieldHighlightMap.put(fieldName, fragments);
81 }
82 } catch (CorruptIndexException e) {
83 logger.error("Error on retrieving highlighted fragments", e);
84 e.printStackTrace();
85 } catch (IOException e) {
86 logger.error("Error on retrieving highlighted fragments", e);
87 }
88
89 return fieldHighlightMap;
90 }
91
92 /**
93 * Generates contextual fragments. Assumes term vectors not stored in the index.
94 * @param analyzer - analyzer used for both indexing and searching
95 * @param query - query object created from user's input
96 * @param fieldName - name of the field in the lucene doc containing the text to be fragmented
97 * @param fieldContents - contents of fieldName
98 * @param fragmentNumber - max number of sentence fragments to return
99 * @param fragmentSize - the max number of characters for each fragment
100 * @return
101 * @throws IOException
102 */
103 public String[] getFragmentsWithHighlightedTerms(Analyzer analyzer, Query query,
104 String fieldName, String fieldContents, int fragmentNumber, int fragmentSize) throws IOException {
105
106 TokenStream stream = TokenSources.getTokenStream(fieldName, fieldContents, analyzer);
107 String[] fragments = getFragmentsWithHighlightedTerms(stream, query, fieldName, fieldContents, fragmentNumber, fragmentSize);
108
109 return fragments;
110 }
111
112
113 /**
114 * Generates contextual fragments.
115 * @param termPosVector - Term Position Vector for fieldName
116 * @param query - query object created from user's input
117 * @param fieldName - name of the field containing the text to be fragmented
118 * @param fieldContents - contents of fieldName
119 * @param fragmentNumber - max number of sentence fragments to return
120 * @param fragmentSize - the max number of characters for each fragment
121 * @return
122 * @return
123 * @throws IOException
124 */
125 public String[] getFragmentsWithHighlightedTerms(TermPositionVector termPosVector, Query query,
126 String fieldName, String fieldContents, int fragmentNumber, int fragmentSize) throws IOException {
127
128 TokenStream stream = TokenSources.getTokenStream(termPosVector);
129 String[] fragments = getFragmentsWithHighlightedTerms(stream, query, fieldName, fieldContents, fragmentNumber, fragmentSize);
130
131 return fragments;
132 }
133
134 /**
135 * @param stream
136 * @param query - query object created from user's input
137 * @param fieldName - name of the field containing the text to be fragmented
138 * @param fieldContents - contents of fieldName
139 * @param fragmentNumber - max number of sentence fragments to return
140 * @param fragmentSize - the max number of characters for each fragment
141 * @return
142 * @throws IOException
143 */
144 private String[] getFragmentsWithHighlightedTerms(TokenStream stream, Query query, String fieldName, String fieldContents, int fragmentNumber,
145 int fragmentSize) throws IOException {
146
147 Fragmenter fragmenter;
148 Scorer scorer;
149 if(useSpanScorer){
150 scorer = new QueryScorer(query, fieldName);
151 fragmenter = new SimpleFragmenter(fragmentSize);
152 } else {
153 scorer = new SpanScorer(query, fieldName, new CachingTokenFilter(stream));
154 fragmenter = new SimpleSpanFragmenter((SpanScorer)scorer, fragmentSize);
155 }
156
157 Highlighter highlighter = new Highlighter(scorer);
158 highlighter.setTextFragmenter(fragmenter);
159 highlighter.setMaxDocCharsToAnalyze(Integer.MAX_VALUE);
160
161 String[] fragments = highlighter.getBestFragments(stream, fieldContents, fragmentNumber);
162 return fragments;
163 }
164
165 }
166