cdmlib-services/src/main/java/eu/etaxonomy/cdm/api/service/search/SearchResultHighligther.java

   1 /**
   2 * Copyright (C) 2012 EDIT
   3 * European Distributed Institute of Taxonomy
   4 * http://www.e-taxonomy.eu
   5 *
   6 * The contents of this file are subject to the Mozilla Public License Version 1.1
   7 * See LICENSE.TXT at the top of this package for the full license terms.
   8 */
   9 package eu.etaxonomy.cdm.api.service.search;
  10
  11
  12 import java.io.IOException;
  13 import java.util.HashMap;
  14 import java.util.Map;
  15
  16 import org.apache.commons.lang.StringUtils;
  17 import org.apache.logging.log4j.LogManager;
  18 import org.apache.logging.log4j.Logger;
  19 import org.apache.lucene.analysis.Analyzer;
  20 import org.apache.lucene.analysis.TokenStream;
  21 import org.apache.lucene.document.Document;
  22 import org.apache.lucene.index.CorruptIndexException;
  23 import org.apache.lucene.index.Terms;
  24 import org.apache.lucene.search.Query;
  25 import org.apache.lucene.search.highlight.Fragmenter;
  26 import org.apache.lucene.search.highlight.Highlighter;
  27 import org.apache.lucene.search.highlight.InvalidTokenOffsetsException;
  28 import org.apache.lucene.search.highlight.QueryScorer;
  29 import org.apache.lucene.search.highlight.Scorer;
  30 import org.apache.lucene.search.highlight.SimpleFragmenter;
  31 import org.apache.lucene.search.highlight.TokenStreamFromTermVector;
  32
  33 /**
  34  * @author Andreas Kohlbecker
  35  */
  36 public class SearchResultHighligther {
  37
  38     private static final Logger logger = LogManager.getLogger();
  39
  40     public Map<String,String[]> getFragmentsWithHighlightedTerms(Analyzer analyzer, Query query, String[] fieldNames,  Document doc,  int fragmentNumber, int fragmentSize){
  41
  42         Map<String,String[]> fieldHighlightMap = new HashMap<String, String[]>();
  43         String[] values;
  44         String fieldContents;
  45         String[] fragments;
  46
  47         try {
  48             for(String fieldName : fieldNames){
  49                 values = doc.getValues(fieldName);
  50                 if(values.length == 0){
  51                     continue;
  52                 }
  53                 fieldContents = StringUtils.join(values, ' ');
  54                 fragments = getFragmentsWithHighlightedTerms(analyzer, query, fieldName, fieldContents, fragmentNumber, fragmentSize);
  55                 fieldHighlightMap.put(fieldName, fragments);
  56             }
  57         } catch (CorruptIndexException e) {
  58             logger.error("Error on retrieving highlighted fragments", e);
  59             e.printStackTrace();
  60         } catch (IOException e) {
  61             logger.error("Error on retrieving highlighted fragments", e);
  62         }
  63
  64         return fieldHighlightMap;
  65     }
  66
  67     /**
  68      * Generates contextual fragments.  Assumes term vectors not stored in the index.
  69      * @param analyzer - analyzer used for both indexing and searching
  70      * @param query - query object created from user's input
  71      * @param fieldName - name of the field in the lucene doc containing the text to be fragmented
  72      * @param fieldContents - contents of fieldName
  73      * @param fragmentNumber - max number of sentence fragments to return
  74      * @param fragmentSize - the max number of characters for each fragment
  75      * @return
  76      * @throws IOException
  77      */
  78     public String[] getFragmentsWithHighlightedTerms(Analyzer analyzer, Query query,
  79                     String fieldName, String fieldContents, int fragmentNumber, int fragmentSize) throws IOException {
  80
  81             TokenStream stream = analyzer.tokenStream(fieldName, fieldContents);
  82             String[] fragments = getFragmentsWithHighlightedTerms(stream, query, fieldName, fieldContents, fragmentNumber, fragmentSize);
  83
  84             return fragments;
  85     }
  86
  87
  88     /**
  89      * Generates contextual fragments.
  90      * @param terms - Terms obtained from the index reader by e.g.: <code>Terms terms = ir.getTermVector(docID, "text");</code>
  91      * @param query - query object created from user's input
  92      * @param fieldName - name of the field containing the text to be fragmented
  93      * @param fieldContents - contents of fieldName
  94      * @param fragmentNumber - max number of sentence fragments to return
  95      * @param fragmentSize - the max number of characters for each fragment
  96      * @return
  97      * @return
  98      * @throws IOException
  99      */
 100
 101     public String[] getFragmentsWithHighlightedTerms(Terms terms, Query query,
 102                     String fieldName, String fieldContents, int fragmentNumber, int fragmentSize) throws IOException  {
 103
 104             // ---- snipped
 105            // from within deprecated method org.apache.lucene.search.highlight.TokenSources.getTokenStream(Terms tpv)
 106             if (!terms.hasOffsets()) {
 107                 throw new IllegalArgumentException("Highlighting requires offsets from the TokenStream.");
 108                 //TokenStreamFromTermVector can handle a lack of offsets if there are positions. But
 109                 // highlighters require offsets, so we insist here.
 110             }
 111
 112             TokenStream stream = new TokenStreamFromTermVector(terms, -1);
 113             // --- snap END
 114
 115             String[] fragments = getFragmentsWithHighlightedTerms(stream, query, fieldName, fieldContents, fragmentNumber, fragmentSize);
 116
 117             return fragments;
 118     }
 119
 120
 121     /**
 122      * @param stream
 123      * @param query - query object created from user's input
 124      * @param fieldName - name of the field containing the text to be fragmented
 125      * @param fieldContents - contents of fieldName
 126      * @param fragmentNumber - max number of sentence fragments to return
 127      * @param fragmentSize - the max number of characters for each fragment
 128      * @return
 129      * @throws IOException
 130      */
 131     private String[] getFragmentsWithHighlightedTerms(TokenStream stream, Query query, String fieldName, String fieldContents, int fragmentNumber,
 132             int fragmentSize) throws IOException {
 133
 134
 135         Scorer scorer = new QueryScorer(query, fieldName);
 136         Fragmenter fragmenter = new SimpleFragmenter(fragmentSize);
 137         Highlighter highlighter = new Highlighter(scorer);
 138
 139         highlighter.setTextFragmenter(fragmenter);
 140         highlighter.setMaxDocCharsToAnalyze(Integer.MAX_VALUE);
 141
 142         String[] fragments = null;
 143         try {
 144             fragments = highlighter.getBestFragments(stream, fieldContents, fragmentNumber);
 145         } catch (InvalidTokenOffsetsException e) {
 146             //should never happen
 147             logger.error("InvalidTokenOffsetsException", e);
 148         }
 149         return fragments;
 150     }
 151
 152 }
 153