/**
 * Copyright (C) 2012 EDIT
 * European Distributed Institute of Taxonomy
 * http://www.e-taxonomy.eu
 *
 * The contents of this file are subject to the Mozilla Public License Version 1.1
 * See LICENSE.TXT at the top of this package for the full license terms.
 */
10 package eu
.etaxonomy
.cdm
.api
.service
.search
;
13 import java
.io
.IOException
;
14 import java
.util
.HashMap
;
17 import org
.apache
.commons
.lang
.StringUtils
;
18 import org
.apache
.log4j
.Logger
;
19 import org
.apache
.lucene
.analysis
.Analyzer
;
20 import org
.apache
.lucene
.analysis
.CachingTokenFilter
;
21 import org
.apache
.lucene
.analysis
.TokenStream
;
22 import org
.apache
.lucene
.document
.Document
;
23 import org
.apache
.lucene
.index
.CorruptIndexException
;
24 import org
.apache
.lucene
.index
.TermPositionVector
;
25 import org
.apache
.lucene
.search
.Query
;
26 import org
.apache
.lucene
.search
.Searcher
;
27 import org
.apache
.lucene
.search
.highlight
.Fragmenter
;
28 import org
.apache
.lucene
.search
.highlight
.Highlighter
;
29 import org
.apache
.lucene
.search
.highlight
.QueryScorer
;
30 import org
.apache
.lucene
.search
.highlight
.Scorer
;
31 import org
.apache
.lucene
.search
.highlight
.SimpleFragmenter
;
32 import org
.apache
.lucene
.search
.highlight
.SimpleSpanFragmenter
;
33 import org
.apache
.lucene
.search
.highlight
.SpanScorer
;
34 import org
.apache
.lucene
.search
.highlight
.TokenSources
;
/**
 * This SearchResultHighligther uses the QueryScorer by default, even though the SpanScorer is meant to be
 * the new default scorer in Lucene,
 * see https://issues.apache.org/jira/browse/LUCENE-1685 and https://issues.apache.org/jira/browse/LUCENE-2013.
 * The SpanScorer was causing problems with phrase queries (see https://dev.e-taxonomy.eu/trac/ticket/2961),
 * whereas the QueryScorer was returning good results.
 * <p>
 * This SearchResultHighligther can be switched to use the SpanScorer: {@link #setUseSpanScorer(boolean)}
 * <p>
 * Based on work of Nicholas Hrycan,
 * see http://code.google.com/p/hrycan-blog/source/browse/trunk/lucene-highlight/src/com/hrycan/search/HighlighterUtil.java
 *
 * @author Andreas Kohlbecker
 */
51 public class SearchResultHighligther
{
53 public static final Logger logger
= Logger
.getLogger(SearchResultHighligther
.class);
55 private boolean useSpanScorer
= true;
57 public boolean isUseSpanScorer() {
61 public void setUseSpanScorer(boolean useSpanScorer
) {
62 this.useSpanScorer
= useSpanScorer
;
65 public Map
<String
,String
[]> getFragmentsWithHighlightedTerms(Analyzer analyzer
, Query query
, String
[] fieldNames
, Document doc
, int fragmentNumber
, int fragmentSize
){
67 Map
<String
,String
[]> fieldHighlightMap
= new HashMap
<String
, String
[]>();
73 for(String fieldName
: fieldNames
){
74 values
= doc
.getValues(fieldName
);
75 if(values
.length
== 0){
78 fieldContents
= StringUtils
.join(values
, ' ');
79 fragments
= getFragmentsWithHighlightedTerms(analyzer
, query
, fieldName
, fieldContents
, fragmentNumber
, fragmentSize
);
80 fieldHighlightMap
.put(fieldName
, fragments
);
82 } catch (CorruptIndexException e
) {
83 logger
.error("Error on retrieving highlighted fragments", e
);
85 } catch (IOException e
) {
86 logger
.error("Error on retrieving highlighted fragments", e
);
89 return fieldHighlightMap
;
93 * Generates contextual fragments. Assumes term vectors not stored in the index.
94 * @param analyzer - analyzer used for both indexing and searching
95 * @param query - query object created from user's input
96 * @param fieldName - name of the field in the lucene doc containing the text to be fragmented
97 * @param fieldContents - contents of fieldName
98 * @param fragmentNumber - max number of sentence fragments to return
99 * @param fragmentSize - the max number of characters for each fragment
101 * @throws IOException
103 public String
[] getFragmentsWithHighlightedTerms(Analyzer analyzer
, Query query
,
104 String fieldName
, String fieldContents
, int fragmentNumber
, int fragmentSize
) throws IOException
{
106 TokenStream stream
= TokenSources
.getTokenStream(fieldName
, fieldContents
, analyzer
);
107 String
[] fragments
= getFragmentsWithHighlightedTerms(stream
, query
, fieldName
, fieldContents
, fragmentNumber
, fragmentSize
);
114 * Generates contextual fragments.
115 * @param termPosVector - Term Position Vector for fieldName
116 * @param query - query object created from user's input
117 * @param fieldName - name of the field containing the text to be fragmented
118 * @param fieldContents - contents of fieldName
119 * @param fragmentNumber - max number of sentence fragments to return
120 * @param fragmentSize - the max number of characters for each fragment
123 * @throws IOException
125 public String
[] getFragmentsWithHighlightedTerms(TermPositionVector termPosVector
, Query query
,
126 String fieldName
, String fieldContents
, int fragmentNumber
, int fragmentSize
) throws IOException
{
128 TokenStream stream
= TokenSources
.getTokenStream(termPosVector
);
129 String
[] fragments
= getFragmentsWithHighlightedTerms(stream
, query
, fieldName
, fieldContents
, fragmentNumber
, fragmentSize
);
136 * @param query - query object created from user's input
137 * @param fieldName - name of the field containing the text to be fragmented
138 * @param fieldContents - contents of fieldName
139 * @param fragmentNumber - max number of sentence fragments to return
140 * @param fragmentSize - the max number of characters for each fragment
142 * @throws IOException
144 private String
[] getFragmentsWithHighlightedTerms(TokenStream stream
, Query query
, String fieldName
, String fieldContents
, int fragmentNumber
,
145 int fragmentSize
) throws IOException
{
147 Fragmenter fragmenter
;
150 scorer
= new QueryScorer(query
, fieldName
);
151 fragmenter
= new SimpleFragmenter(fragmentSize
);
153 scorer
= new SpanScorer(query
, fieldName
, new CachingTokenFilter(stream
));
154 fragmenter
= new SimpleSpanFragmenter((SpanScorer
)scorer
, fragmentSize
);
157 Highlighter highlighter
= new Highlighter(scorer
);
158 highlighter
.setTextFragmenter(fragmenter
);
159 highlighter
.setMaxDocCharsToAnalyze(Integer
.MAX_VALUE
);
161 String
[] fragments
= highlighter
.getBestFragments(stream
, fieldContents
, fragmentNumber
);