/**
 * Copyright (C) 2012 EDIT
 * European Distributed Institute of Taxonomy
 * http://www.e-taxonomy.eu
 *
 * The contents of this file are subject to the Mozilla Public License Version 1.1
 * See LICENSE.TXT at the top of this package for the full license terms.
 */
9 package eu
.etaxonomy
.cdm
.api
.service
.search
;
12 import java
.io
.IOException
;
13 import java
.util
.HashMap
;
16 import org
.apache
.commons
.lang
.StringUtils
;
17 import org
.apache
.logging
.log4j
.LogManager
;
18 import org
.apache
.logging
.log4j
.Logger
;
19 import org
.apache
.lucene
.analysis
.Analyzer
;
20 import org
.apache
.lucene
.analysis
.TokenStream
;
21 import org
.apache
.lucene
.document
.Document
;
22 import org
.apache
.lucene
.index
.CorruptIndexException
;
23 import org
.apache
.lucene
.index
.Terms
;
24 import org
.apache
.lucene
.search
.Query
;
25 import org
.apache
.lucene
.search
.highlight
.Fragmenter
;
26 import org
.apache
.lucene
.search
.highlight
.Highlighter
;
27 import org
.apache
.lucene
.search
.highlight
.InvalidTokenOffsetsException
;
28 import org
.apache
.lucene
.search
.highlight
.QueryScorer
;
29 import org
.apache
.lucene
.search
.highlight
.Scorer
;
30 import org
.apache
.lucene
.search
.highlight
.SimpleFragmenter
;
31 import org
.apache
.lucene
.search
.highlight
.TokenStreamFromTermVector
;
/**
 * Highlights the terms of a lucene query in the text of matching documents.
 *
 * @author Andreas Kohlbecker
 */
36 public class SearchResultHighligther
{
38 private static final Logger logger
= LogManager
.getLogger();
40 public Map
<String
,String
[]> getFragmentsWithHighlightedTerms(Analyzer analyzer
, Query query
, String
[] fieldNames
, Document doc
, int fragmentNumber
, int fragmentSize
){
42 Map
<String
,String
[]> fieldHighlightMap
= new HashMap
<String
, String
[]>();
48 for(String fieldName
: fieldNames
){
49 values
= doc
.getValues(fieldName
);
50 if(values
.length
== 0){
53 fieldContents
= StringUtils
.join(values
, ' ');
54 fragments
= getFragmentsWithHighlightedTerms(analyzer
, query
, fieldName
, fieldContents
, fragmentNumber
, fragmentSize
);
55 fieldHighlightMap
.put(fieldName
, fragments
);
57 } catch (CorruptIndexException e
) {
58 logger
.error("Error on retrieving highlighted fragments", e
);
60 } catch (IOException e
) {
61 logger
.error("Error on retrieving highlighted fragments", e
);
64 return fieldHighlightMap
;
68 * Generates contextual fragments. Assumes term vectors not stored in the index.
69 * @param analyzer - analyzer used for both indexing and searching
70 * @param query - query object created from user's input
71 * @param fieldName - name of the field in the lucene doc containing the text to be fragmented
72 * @param fieldContents - contents of fieldName
73 * @param fragmentNumber - max number of sentence fragments to return
74 * @param fragmentSize - the max number of characters for each fragment
78 public String
[] getFragmentsWithHighlightedTerms(Analyzer analyzer
, Query query
,
79 String fieldName
, String fieldContents
, int fragmentNumber
, int fragmentSize
) throws IOException
{
81 TokenStream stream
= analyzer
.tokenStream(fieldName
, fieldContents
);
82 String
[] fragments
= getFragmentsWithHighlightedTerms(stream
, query
, fieldName
, fieldContents
, fragmentNumber
, fragmentSize
);
89 * Generates contextual fragments.
90 * @param terms - Terms obtained from the index reader by e.g.: <code>Terms terms = ir.getTermVector(docID, "text");</code>
91 * @param query - query object created from user's input
92 * @param fieldName - name of the field containing the text to be fragmented
93 * @param fieldContents - contents of fieldName
94 * @param fragmentNumber - max number of sentence fragments to return
95 * @param fragmentSize - the max number of characters for each fragment
101 public String
[] getFragmentsWithHighlightedTerms(Terms terms
, Query query
,
102 String fieldName
, String fieldContents
, int fragmentNumber
, int fragmentSize
) throws IOException
{
105 // from within deprecated method org.apache.lucene.search.highlight.TokenSources.getTokenStream(Terms tpv)
106 if (!terms
.hasOffsets()) {
107 throw new IllegalArgumentException("Highlighting requires offsets from the TokenStream.");
108 //TokenStreamFromTermVector can handle a lack of offsets if there are positions. But
109 // highlighters require offsets, so we insist here.
112 TokenStream stream
= new TokenStreamFromTermVector(terms
, -1);
115 String
[] fragments
= getFragmentsWithHighlightedTerms(stream
, query
, fieldName
, fieldContents
, fragmentNumber
, fragmentSize
);
/**
 * Generates contextual fragments from the given token stream.
 *
 * @param query          query object created from user's input
 * @param fieldName      name of the field containing the text to be fragmented
 * @param fieldContents  contents of fieldName
 * @param fragmentNumber max number of sentence fragments to return
 * @param fragmentSize   the max number of characters for each fragment
 * @throws IOException on low-level IO errors
 */
131 private String
[] getFragmentsWithHighlightedTerms(TokenStream stream
, Query query
, String fieldName
, String fieldContents
, int fragmentNumber
,
132 int fragmentSize
) throws IOException
{
135 Scorer scorer
= new QueryScorer(query
, fieldName
);
136 Fragmenter fragmenter
= new SimpleFragmenter(fragmentSize
);
137 Highlighter highlighter
= new Highlighter(scorer
);
139 highlighter
.setTextFragmenter(fragmenter
);
140 highlighter
.setMaxDocCharsToAnalyze(Integer
.MAX_VALUE
);
142 String
[] fragments
= null;
144 fragments
= highlighter
.getBestFragments(stream
, fieldContents
, fragmentNumber
);
145 } catch (InvalidTokenOffsetsException e
) {
146 //should never happen
147 logger
.error("InvalidTokenOffsetsException", e
);