cleanup
[cdmlib.git] / cdmlib-services / src / main / java / eu / etaxonomy / cdm / api / service / search / SearchResultHighligther.java
1 /**
2 * Copyright (C) 2012 EDIT
3 * European Distributed Institute of Taxonomy
4 * http://www.e-taxonomy.eu
5 *
6 * The contents of this file are subject to the Mozilla Public License Version 1.1
7 * See LICENSE.TXT at the top of this package for the full license terms.
8 */
9 package eu.etaxonomy.cdm.api.service.search;
10
11
12 import java.io.IOException;
13 import java.util.HashMap;
14 import java.util.Map;
15
16 import org.apache.commons.lang.StringUtils;
17 import org.apache.logging.log4j.LogManager;
18 import org.apache.logging.log4j.Logger;
19 import org.apache.lucene.analysis.Analyzer;
20 import org.apache.lucene.analysis.TokenStream;
21 import org.apache.lucene.document.Document;
22 import org.apache.lucene.index.CorruptIndexException;
23 import org.apache.lucene.index.Terms;
24 import org.apache.lucene.search.Query;
25 import org.apache.lucene.search.highlight.Fragmenter;
26 import org.apache.lucene.search.highlight.Highlighter;
27 import org.apache.lucene.search.highlight.InvalidTokenOffsetsException;
28 import org.apache.lucene.search.highlight.QueryScorer;
29 import org.apache.lucene.search.highlight.Scorer;
30 import org.apache.lucene.search.highlight.SimpleFragmenter;
31 import org.apache.lucene.search.highlight.TokenStreamFromTermVector;
32
33 /**
34 * @author Andreas Kohlbecker
35 */
36 public class SearchResultHighligther {
37
38 private static final Logger logger = LogManager.getLogger();
39
40 public Map<String,String[]> getFragmentsWithHighlightedTerms(Analyzer analyzer, Query query, String[] fieldNames, Document doc, int fragmentNumber, int fragmentSize){
41
42 Map<String,String[]> fieldHighlightMap = new HashMap<String, String[]>();
43 String[] values;
44 String fieldContents;
45 String[] fragments;
46
47 try {
48 for(String fieldName : fieldNames){
49 values = doc.getValues(fieldName);
50 if(values.length == 0){
51 continue;
52 }
53 fieldContents = StringUtils.join(values, ' ');
54 fragments = getFragmentsWithHighlightedTerms(analyzer, query, fieldName, fieldContents, fragmentNumber, fragmentSize);
55 fieldHighlightMap.put(fieldName, fragments);
56 }
57 } catch (CorruptIndexException e) {
58 logger.error("Error on retrieving highlighted fragments", e);
59 e.printStackTrace();
60 } catch (IOException e) {
61 logger.error("Error on retrieving highlighted fragments", e);
62 }
63
64 return fieldHighlightMap;
65 }
66
67 /**
68 * Generates contextual fragments. Assumes term vectors not stored in the index.
69 * @param analyzer - analyzer used for both indexing and searching
70 * @param query - query object created from user's input
71 * @param fieldName - name of the field in the lucene doc containing the text to be fragmented
72 * @param fieldContents - contents of fieldName
73 * @param fragmentNumber - max number of sentence fragments to return
74 * @param fragmentSize - the max number of characters for each fragment
75 * @return
76 * @throws IOException
77 */
78 public String[] getFragmentsWithHighlightedTerms(Analyzer analyzer, Query query,
79 String fieldName, String fieldContents, int fragmentNumber, int fragmentSize) throws IOException {
80
81 TokenStream stream = analyzer.tokenStream(fieldName, fieldContents);
82 String[] fragments = getFragmentsWithHighlightedTerms(stream, query, fieldName, fieldContents, fragmentNumber, fragmentSize);
83
84 return fragments;
85 }
86
87
88 /**
89 * Generates contextual fragments.
90 * @param terms - Terms obtained from the index reader by e.g.: <code>Terms terms = ir.getTermVector(docID, "text");</code>
91 * @param query - query object created from user's input
92 * @param fieldName - name of the field containing the text to be fragmented
93 * @param fieldContents - contents of fieldName
94 * @param fragmentNumber - max number of sentence fragments to return
95 * @param fragmentSize - the max number of characters for each fragment
96 * @return
97 * @return
98 * @throws IOException
99 */
100
101 public String[] getFragmentsWithHighlightedTerms(Terms terms, Query query,
102 String fieldName, String fieldContents, int fragmentNumber, int fragmentSize) throws IOException {
103
104 // ---- snipped
105 // from within deprecated method org.apache.lucene.search.highlight.TokenSources.getTokenStream(Terms tpv)
106 if (!terms.hasOffsets()) {
107 throw new IllegalArgumentException("Highlighting requires offsets from the TokenStream.");
108 //TokenStreamFromTermVector can handle a lack of offsets if there are positions. But
109 // highlighters require offsets, so we insist here.
110 }
111
112 TokenStream stream = new TokenStreamFromTermVector(terms, -1);
113 // --- snap END
114
115 String[] fragments = getFragmentsWithHighlightedTerms(stream, query, fieldName, fieldContents, fragmentNumber, fragmentSize);
116
117 return fragments;
118 }
119
120
121 /**
122 * @param stream
123 * @param query - query object created from user's input
124 * @param fieldName - name of the field containing the text to be fragmented
125 * @param fieldContents - contents of fieldName
126 * @param fragmentNumber - max number of sentence fragments to return
127 * @param fragmentSize - the max number of characters for each fragment
128 * @return
129 * @throws IOException
130 */
131 private String[] getFragmentsWithHighlightedTerms(TokenStream stream, Query query, String fieldName, String fieldContents, int fragmentNumber,
132 int fragmentSize) throws IOException {
133
134
135 Scorer scorer = new QueryScorer(query, fieldName);
136 Fragmenter fragmenter = new SimpleFragmenter(fragmentSize);
137 Highlighter highlighter = new Highlighter(scorer);
138
139 highlighter.setTextFragmenter(fragmenter);
140 highlighter.setMaxDocCharsToAnalyze(Integer.MAX_VALUE);
141
142 String[] fragments = null;
143 try {
144 fragments = highlighter.getBestFragments(stream, fieldContents, fragmentNumber);
145 } catch (InvalidTokenOffsetsException e) {
146 //should never happen
147 logger.error("InvalidTokenOffsetsException", e);
148 }
149 return fragments;
150 }
151
152 }
153