fix #6354 non-phrase search with wildcard using the standard QueryParser
[cdmlib.git] cdmlib-services/src/main/java/eu/etaxonomy/cdm/api/service/search/SearchResultHighligther.java
/**
 * Copyright (C) 2012 EDIT
 * European Distributed Institute of Taxonomy
 * http://www.e-taxonomy.eu
 *
 * The contents of this file are subject to the Mozilla Public License Version 1.1
 * See LICENSE.TXT at the top of this package for the full license terms.
 */
package eu.etaxonomy.cdm.api.service.search;

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

import org.apache.commons.lang.StringUtils;
import org.apache.log4j.Logger;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.Terms;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.highlight.Fragmenter;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.InvalidTokenOffsetsException;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.Scorer;
import org.apache.lucene.search.highlight.SimpleFragmenter;
import org.apache.lucene.search.highlight.TokenStreamFromTermVector;

/**
 * Generates highlighted fragments of document field contents that match a
 * given query, using the Lucene {@link Highlighter}.
 *
 * @author Andreas Kohlbecker
 */
public class SearchResultHighligther {

    public static final Logger logger = Logger.getLogger(SearchResultHighligther.class);

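    /**
     * Generates highlighted fragments for each of the given fields of the
     * supplied lucene document. Fields without values are skipped.
     * <p>
     * A minimal usage sketch; the {@code indexReader}, the {@code analyzer} and the
     * field name {@code "titleCache"} are illustrative assumptions, not fixed by this class:
     * <pre>{@code
     * IndexSearcher searcher = new IndexSearcher(indexReader);
     * Query query = new QueryParser("titleCache", analyzer).parse("Abies*"); // non-phrase wildcard search (#6354)
     * TopDocs topDocs = searcher.search(query, 10);
     * Document doc = searcher.doc(topDocs.scoreDocs[0].doc);
     * Map<String, String[]> highlights = new SearchResultHighligther().getFragmentsWithHighlightedTerms(
     *         analyzer, query, new String[] {"titleCache"}, doc, 3, 100);
     * }</pre>
     *
     * @param analyzer - analyzer used for both indexing and searching
     * @param query - query object created from user's input
     * @param fieldNames - names of the fields in the lucene doc containing the text to be fragmented
     * @param doc - the lucene document holding the field contents
     * @param fragmentNumber - max number of sentence fragments to return per field
     * @param fragmentSize - the max number of characters for each fragment
     * @return a map from field name to the highlighted fragments of that field
     */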
    public Map<String,String[]> getFragmentsWithHighlightedTerms(Analyzer analyzer, Query query, String[] fieldNames, Document doc, int fragmentNumber, int fragmentSize){

        Map<String,String[]> fieldHighlightMap = new HashMap<String, String[]>();
        String[] values;
        String fieldContents;
        String[] fragments;

        try {
            for(String fieldName : fieldNames){
                values = doc.getValues(fieldName);
                if(values.length == 0){
                    continue;
                }
                fieldContents = StringUtils.join(values, ' ');
                fragments = getFragmentsWithHighlightedTerms(analyzer, query, fieldName, fieldContents, fragmentNumber, fragmentSize);
                fieldHighlightMap.put(fieldName, fragments);
            }
        } catch (CorruptIndexException e) {
            logger.error("Error on retrieving highlighted fragments", e);
        } catch (IOException e) {
            logger.error("Error on retrieving highlighted fragments", e);
        }

        return fieldHighlightMap;
    }

    /**
     * Generates contextual fragments. Assumes term vectors are not stored in the
     * index, so the field contents are re-analyzed with the given analyzer.
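     * <p>
     * A minimal sketch; {@code analyzer}, {@code query} and the arguments
     * {@code "titleCache"} and {@code storedFieldText} are illustrative assumptions:
     * <pre>{@code
     * String[] fragments = new SearchResultHighligther().getFragmentsWithHighlightedTerms(
     *         analyzer, query, "titleCache", storedFieldText, 3, 100);
     * }</pre>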
     * @param analyzer - analyzer used for both indexing and searching
     * @param query - query object created from user's input
     * @param fieldName - name of the field in the lucene doc containing the text to be fragmented
     * @param fieldContents - contents of fieldName
     * @param fragmentNumber - max number of sentence fragments to return
     * @param fragmentSize - the max number of characters for each fragment
     * @return the highlighted fragments
     * @throws IOException
     */
    public String[] getFragmentsWithHighlightedTerms(Analyzer analyzer, Query query,
            String fieldName, String fieldContents, int fragmentNumber, int fragmentSize) throws IOException {

        TokenStream stream = analyzer.tokenStream(fieldName, fieldContents);
        String[] fragments = getFragmentsWithHighlightedTerms(stream, query, fieldName, fieldContents, fragmentNumber, fragmentSize);

        return fragments;
    }

    /**
     * Generates contextual fragments from stored term vectors.
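     * <p>
     * A minimal sketch, assuming term vectors with offsets were stored for the field;
     * {@code indexReader}, {@code docID} and the other arguments are illustrative:
     * <pre>{@code
     * Terms terms = indexReader.getTermVector(docID, "titleCache");
     * if (terms != null) {
     *     String[] fragments = new SearchResultHighligther().getFragmentsWithHighlightedTerms(
     *             terms, query, "titleCache", storedFieldText, 3, 100);
     * }
     * }</pre>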
     * @param terms - Terms obtained from the index reader by e.g.: <code>Terms terms = ir.getTermVector(docID, "text");</code>
     * @param query - query object created from user's input
     * @param fieldName - name of the field containing the text to be fragmented
     * @param fieldContents - contents of fieldName
     * @param fragmentNumber - max number of sentence fragments to return
     * @param fragmentSize - the max number of characters for each fragment
     * @return the highlighted fragments
     * @throws IOException
     */
    public String[] getFragmentsWithHighlightedTerms(Terms terms, Query query,
            String fieldName, String fieldContents, int fragmentNumber, int fragmentSize) throws IOException {

        // ---- begin block copied from the deprecated method
        //      org.apache.lucene.search.highlight.TokenSources.getTokenStream(Terms tpv)
        if (!terms.hasOffsets()) {
            // TokenStreamFromTermVector can handle a lack of offsets if there are
            // positions, but highlighters require offsets, so we insist here.
            throw new IllegalArgumentException("Highlighting requires offsets from the TokenStream.");
        }

        TokenStream stream = new TokenStreamFromTermVector(terms, -1);
        // ---- end of copied block

        String[] fragments = getFragmentsWithHighlightedTerms(stream, query, fieldName, fieldContents, fragmentNumber, fragmentSize);

        return fragments;
    }

    /**
     * Generates contextual fragments by scoring the token stream against the
     * query and cutting the field contents into fragments of the given size.
     * @param stream - token stream of the field contents
     * @param query - query object created from user's input
     * @param fieldName - name of the field containing the text to be fragmented
     * @param fieldContents - contents of fieldName
     * @param fragmentNumber - max number of sentence fragments to return
     * @param fragmentSize - the max number of characters for each fragment
     * @return the highlighted fragments, or <code>null</code> if the token offsets were invalid
     * @throws IOException
     */
    private String[] getFragmentsWithHighlightedTerms(TokenStream stream, Query query, String fieldName, String fieldContents, int fragmentNumber,
            int fragmentSize) throws IOException {

        Scorer scorer = new QueryScorer(query, fieldName);
        Fragmenter fragmenter = new SimpleFragmenter(fragmentSize);
        Highlighter highlighter = new Highlighter(scorer);

        highlighter.setTextFragmenter(fragmenter);
        highlighter.setMaxDocCharsToAnalyze(Integer.MAX_VALUE);

        String[] fragments = null;
        try {
            fragments = highlighter.getBestFragments(stream, fieldContents, fragmentNumber);
        } catch (InvalidTokenOffsetsException e) {
            // should never happen, the stream is derived from the field contents themselves
            logger.error("InvalidTokenOffsetsException", e);
        }
        return fragments;
    }

}