fix #6354 non-phrase search with wildcard using the standard QueryParser
[cdmlib.git] cdmlib-services/src/main/java/eu/etaxonomy/cdm/api/service/search/SearchResultHighligther.java
/**
 * Copyright (C) 2012 EDIT
 * European Distributed Institute of Taxonomy
 * http://www.e-taxonomy.eu
 *
 * The contents of this file are subject to the Mozilla Public License Version 1.1
 * See LICENSE.TXT at the top of this package for the full license terms.
 */
package eu.etaxonomy.cdm.api.service.search;

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

import org.apache.commons.lang.StringUtils;
import org.apache.log4j.Logger;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.Terms;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.highlight.Fragmenter;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.InvalidTokenOffsetsException;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.Scorer;
import org.apache.lucene.search.highlight.SimpleFragmenter;
import org.apache.lucene.search.highlight.TokenStreamFromTermVector;

/**
 * Generates highlighted fragments of document field contents that match a
 * given query, using the Lucene {@link Highlighter}.
 *
 * @author Andreas Kohlbecker
 */
public class SearchResultHighligther {

    public static final Logger logger = Logger.getLogger(SearchResultHighligther.class);

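    /**
     * Generates highlighted fragments for each of the given fields of the
     * supplied lucene document. Fields without values are skipped.
     * <p>
     * A minimal usage sketch; the {@code indexReader}, the {@code analyzer} and the
     * field name {@code "titleCache"} are illustrative assumptions, not fixed by this class:
     * <pre>{@code
     * IndexSearcher searcher = new IndexSearcher(indexReader);
     * Query query = new QueryParser("titleCache", analyzer).parse("Abies*"); // non-phrase wildcard search (#6354)
     * TopDocs topDocs = searcher.search(query, 10);
     * Document doc = searcher.doc(topDocs.scoreDocs[0].doc);
     * Map<String, String[]> highlights = new SearchResultHighligther().getFragmentsWithHighlightedTerms(
     *         analyzer, query, new String[] {"titleCache"}, doc, 3, 100);
     * }</pre>
     *
     * @param analyzer - analyzer used for both indexing and searching
     * @param query - query object created from user's input
     * @param fieldNames - names of the fields in the lucene doc containing the text to be fragmented
     * @param doc - the lucene document holding the field contents
     * @param fragmentNumber - max number of sentence fragments to return per field
     * @param fragmentSize - the max number of characters for each fragment
     * @return a map from field name to the highlighted fragments of that field
     */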
    public Map<String,String[]> getFragmentsWithHighlightedTerms(Analyzer analyzer, Query query, String[] fieldNames, Document doc, int fragmentNumber, int fragmentSize){

        Map<String,String[]> fieldHighlightMap = new HashMap<String, String[]>();
        String[] values;
        String fieldContents;
        String[] fragments;

        try {
            for(String fieldName : fieldNames){
                values = doc.getValues(fieldName);
                if(values.length == 0){
                    continue;
                }
                fieldContents = StringUtils.join(values, ' ');
                fragments = getFragmentsWithHighlightedTerms(analyzer, query, fieldName, fieldContents, fragmentNumber, fragmentSize);
                fieldHighlightMap.put(fieldName, fragments);
            }
        } catch (CorruptIndexException e) {
            logger.error("Error on retrieving highlighted fragments", e);
        } catch (IOException e) {
            logger.error("Error on retrieving highlighted fragments", e);
        }

        return fieldHighlightMap;
    }

    /**
     * Generates contextual fragments. Assumes term vectors are not stored in the
     * index, so the field contents are re-analyzed with the given analyzer.
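     * <p>
     * A minimal sketch; {@code analyzer}, {@code query} and the arguments
     * {@code "titleCache"} and {@code storedFieldText} are illustrative assumptions:
     * <pre>{@code
     * String[] fragments = new SearchResultHighligther().getFragmentsWithHighlightedTerms(
     *         analyzer, query, "titleCache", storedFieldText, 3, 100);
     * }</pre>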
     * @param analyzer - analyzer used for both indexing and searching
     * @param query - query object created from user's input
     * @param fieldName - name of the field in the lucene doc containing the text to be fragmented
     * @param fieldContents - contents of fieldName
     * @param fragmentNumber - max number of sentence fragments to return
     * @param fragmentSize - the max number of characters for each fragment
     * @return the highlighted fragments
     * @throws IOException
     */
    public String[] getFragmentsWithHighlightedTerms(Analyzer analyzer, Query query,
            String fieldName, String fieldContents, int fragmentNumber, int fragmentSize) throws IOException {

        TokenStream stream = analyzer.tokenStream(fieldName, fieldContents);
        String[] fragments = getFragmentsWithHighlightedTerms(stream, query, fieldName, fieldContents, fragmentNumber, fragmentSize);

        return fragments;
    }

    /**
     * Generates contextual fragments from stored term vectors.
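     * <p>
     * A minimal sketch, assuming term vectors with offsets were stored for the field;
     * {@code indexReader}, {@code docID} and the other arguments are illustrative:
     * <pre>{@code
     * Terms terms = indexReader.getTermVector(docID, "titleCache");
     * if (terms != null) {
     *     String[] fragments = new SearchResultHighligther().getFragmentsWithHighlightedTerms(
     *             terms, query, "titleCache", storedFieldText, 3, 100);
     * }
     * }</pre>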
     * @param terms - Terms obtained from the index reader by e.g.: <code>Terms terms = ir.getTermVector(docID, "text");</code>
     * @param query - query object created from user's input
     * @param fieldName - name of the field containing the text to be fragmented
     * @param fieldContents - contents of fieldName
     * @param fragmentNumber - max number of sentence fragments to return
     * @param fragmentSize - the max number of characters for each fragment
     * @return the highlighted fragments
     * @throws IOException
     */
    public String[] getFragmentsWithHighlightedTerms(Terms terms, Query query,
            String fieldName, String fieldContents, int fragmentNumber, int fragmentSize) throws IOException {

        // ---- begin block copied from the deprecated method
        //      org.apache.lucene.search.highlight.TokenSources.getTokenStream(Terms tpv)
        if (!terms.hasOffsets()) {
            // TokenStreamFromTermVector can handle a lack of offsets if there are
            // positions, but highlighters require offsets, so we insist here.
            throw new IllegalArgumentException("Highlighting requires offsets from the TokenStream.");
        }

        TokenStream stream = new TokenStreamFromTermVector(terms, -1);
        // ---- end of copied block

        String[] fragments = getFragmentsWithHighlightedTerms(stream, query, fieldName, fieldContents, fragmentNumber, fragmentSize);

        return fragments;
    }

    /**
     * Generates contextual fragments by scoring the token stream against the
     * query and cutting the field contents into fragments of the given size.
     * @param stream - token stream of the field contents
     * @param query - query object created from user's input
     * @param fieldName - name of the field containing the text to be fragmented
     * @param fieldContents - contents of fieldName
     * @param fragmentNumber - max number of sentence fragments to return
     * @param fragmentSize - the max number of characters for each fragment
     * @return the highlighted fragments, or <code>null</code> if the token offsets were invalid
     * @throws IOException
     */
    private String[] getFragmentsWithHighlightedTerms(TokenStream stream, Query query, String fieldName, String fieldContents, int fragmentNumber,
            int fragmentSize) throws IOException {

        Scorer scorer = new QueryScorer(query, fieldName);
        Fragmenter fragmenter = new SimpleFragmenter(fragmentSize);
        Highlighter highlighter = new Highlighter(scorer);

        highlighter.setTextFragmenter(fragmenter);
        highlighter.setMaxDocCharsToAnalyze(Integer.MAX_VALUE);

        String[] fragments = null;
        try {
            fragments = highlighter.getBestFragments(stream, fieldContents, fragmentNumber);
        } catch (InvalidTokenOffsetsException e) {
            // should never happen, the stream is derived from the field contents themselves
            logger.error("InvalidTokenOffsetsException", e);
        }
        return fragments;
    }

}