cdmlib-services/src/main/java/eu/etaxonomy/cdm/api/service/search/SearchResultHighligther.java

   1 // $Id$
   2 /**
   3 * Copyright (C) 2012 EDIT
   4 * European Distributed Institute of Taxonomy
   5 * http://www.e-taxonomy.eu
   6 *
   7 * The contents of this file are subject to the Mozilla Public License Version 1.1
   8 * See LICENSE.TXT at the top of this package for the full license terms.
   9 */
  10 package eu.etaxonomy.cdm.api.service.search;
  11
  12
  13 import java.io.IOException;
  14 import java.util.HashMap;
  15 import java.util.Map;
  16
  17 import org.apache.commons.lang.StringUtils;
  18 import org.apache.log4j.Logger;
  19 import org.apache.lucene.analysis.Analyzer;
  20 import org.apache.lucene.analysis.CachingTokenFilter;
  21 import org.apache.lucene.analysis.TokenStream;
  22 import org.apache.lucene.document.Document;
  23 import org.apache.lucene.index.CorruptIndexException;
  24 import org.apache.lucene.index.TermPositionVector;
  25 import org.apache.lucene.search.Query;
  26 import org.apache.lucene.search.Searcher;
  27 import org.apache.lucene.search.highlight.Fragmenter;
  28 import org.apache.lucene.search.highlight.Highlighter;
  29 import org.apache.lucene.search.highlight.QueryScorer;
  30 import org.apache.lucene.search.highlight.Scorer;
  31 import org.apache.lucene.search.highlight.SimpleFragmenter;
  32 import org.apache.lucene.search.highlight.SimpleSpanFragmenter;
  33 import org.apache.lucene.search.highlight.SpanScorer;
  34 import org.apache.lucene.search.highlight.TokenSources;
  35
  36 /**
  37  * This SearchResultHighligther is using the QueryScorer by default even if the SpanScorer is meant to be the new default scorer in Lucene,
  38  * see https://issues.apache.org/jira/browse/LUCENE-1685 and https://issues.apache.org/jira/browse/LUCENE-2013.
  39  * The SpanScorer was causing problems with phrase queries (see https://dev.e-taxonomy.eu/trac/ticket/2961)
  40  * whereas the QueryScorer was returning good results.
  41  * <p>
  42  * This SearchResultHighligther can be switched to use the SpanScorer: {@link #setUseSpanScorer(boolean)}
  43  * <p>
  44  * Based on work of Nicholas Hrycan
  45  * see http://code.google.com/p/hrycan-blog/source/browse/trunk/lucene-highlight/src/com/hrycan/search/HighlighterUtil.java
  46  *
  47  *
  48  * @author Andreas Kohlbecker
  49  *
  50  */
  51 public class SearchResultHighligther {
  52
  53     public static final Logger logger = Logger.getLogger(SearchResultHighligther.class);
  54
  55     private boolean useSpanScorer = true;
  56
  57     public boolean isUseSpanScorer() {
  58         return useSpanScorer;
  59     }
  60
  61     public void setUseSpanScorer(boolean useSpanScorer) {
  62         this.useSpanScorer = useSpanScorer;
  63     }
  64
  65     public Map<String,String[]> getFragmentsWithHighlightedTerms(Analyzer analyzer, Query query, String[] fieldNames,  Document doc,  int fragmentNumber, int fragmentSize){
  66
  67         Map<String,String[]> fieldHighlightMap = new HashMap<String, String[]>();
  68         String[] values;
  69         String fieldContents;
  70         String[] fragments;
  71
  72         try {
  73             for(String fieldName : fieldNames){
  74                 values = doc.getValues(fieldName);
  75                 if(values.length == 0){
  76                     continue;
  77                 }
  78                 fieldContents = StringUtils.join(values, ' ');
  79                 fragments = getFragmentsWithHighlightedTerms(analyzer, query, fieldName, fieldContents, fragmentNumber, fragmentSize);
  80                 fieldHighlightMap.put(fieldName, fragments);
  81             }
  82         } catch (CorruptIndexException e) {
  83             logger.error("Error on retrieving highlighted fragments", e);
  84             e.printStackTrace();
  85         } catch (IOException e) {
  86             logger.error("Error on retrieving highlighted fragments", e);
  87         }
  88
  89         return fieldHighlightMap;
  90     }
  91
  92     /**
  93      * Generates contextual fragments.  Assumes term vectors not stored in the index.
  94      * @param analyzer - analyzer used for both indexing and searching
  95      * @param query - query object created from user's input
  96      * @param fieldName - name of the field in the lucene doc containing the text to be fragmented
  97      * @param fieldContents - contents of fieldName
  98      * @param fragmentNumber - max number of sentence fragments to return
  99      * @param fragmentSize - the max number of characters for each fragment
 100      * @return
 101      * @throws IOException
 102      */
 103     public String[] getFragmentsWithHighlightedTerms(Analyzer analyzer, Query query,
 104                     String fieldName, String fieldContents, int fragmentNumber, int fragmentSize) throws IOException {
 105
 106             TokenStream stream = TokenSources.getTokenStream(fieldName, fieldContents, analyzer);
 107             String[] fragments = getFragmentsWithHighlightedTerms(stream, query, fieldName, fieldContents, fragmentNumber, fragmentSize);
 108
 109             return fragments;
 110     }
 111
 112
 113     /**
 114      * Generates contextual fragments.
 115      * @param termPosVector - Term Position Vector for fieldName
 116      * @param query - query object created from user's input
 117      * @param fieldName - name of the field containing the text to be fragmented
 118      * @param fieldContents - contents of fieldName
 119      * @param fragmentNumber - max number of sentence fragments to return
 120      * @param fragmentSize - the max number of characters for each fragment
 121      * @return
 122      * @return
 123      * @throws IOException
 124      */
 125     public String[] getFragmentsWithHighlightedTerms(TermPositionVector termPosVector, Query query,
 126                     String fieldName, String fieldContents, int fragmentNumber, int fragmentSize) throws IOException  {
 127
 128             TokenStream stream = TokenSources.getTokenStream(termPosVector);
 129             String[] fragments = getFragmentsWithHighlightedTerms(stream, query, fieldName, fieldContents, fragmentNumber, fragmentSize);
 130
 131             return fragments;
 132     }
 133
 134     /**
 135      * @param stream
 136      * @param query - query object created from user's input
 137      * @param fieldName - name of the field containing the text to be fragmented
 138      * @param fieldContents - contents of fieldName
 139      * @param fragmentNumber - max number of sentence fragments to return
 140      * @param fragmentSize - the max number of characters for each fragment
 141      * @return
 142      * @throws IOException
 143      */
 144     private String[] getFragmentsWithHighlightedTerms(TokenStream stream, Query query, String fieldName, String fieldContents, int fragmentNumber,
 145             int fragmentSize) throws IOException {
 146
 147         Fragmenter fragmenter;
 148         Scorer scorer;
 149         if(useSpanScorer){
 150             scorer = new QueryScorer(query, fieldName);
 151             fragmenter = new SimpleFragmenter(fragmentSize);
 152         } else {
 153             scorer = new SpanScorer(query, fieldName, new CachingTokenFilter(stream));
 154             fragmenter = new SimpleSpanFragmenter((SpanScorer)scorer, fragmentSize);
 155         }
 156
 157         Highlighter highlighter = new Highlighter(scorer);
 158         highlighter.setTextFragmenter(fragmenter);
 159         highlighter.setMaxDocCharsToAnalyze(Integer.MAX_VALUE);
 160
 161         String[] fragments = highlighter.getBestFragments(stream, fieldContents, fragmentNumber);
 162         return fragments;
 163     }
 164
 165 }
 166