Fix all compile errors from services to remote (some only preliminary)
[cdmlib.git] / cdmlib-services/src/main/java/eu/etaxonomy/cdm/api/service/search/SearchResultHighligther.java
// $Id$
/**
 * Copyright (C) 2012 EDIT
 * European Distributed Institute of Taxonomy
 * http://www.e-taxonomy.eu
 *
 * The contents of this file are subject to the Mozilla Public License Version 1.1
 * See LICENSE.TXT at the top of this package for the full license terms.
 */
package eu.etaxonomy.cdm.api.service.search;


import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

import org.apache.commons.lang.StringUtils;
import org.apache.log4j.Logger;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.highlight.Fragmenter;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.InvalidTokenOffsetsException;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.Scorer;
import org.apache.lucene.search.highlight.SimpleFragmenter;
import org.apache.lucene.search.highlight.TokenSources;

/**
 * Helper class for generating highlighted text fragments from lucene search results.
 *
 * @author Andreas Kohlbecker
 *
 */
public class SearchResultHighligther {

    public static final Logger logger = Logger.getLogger(SearchResultHighligther.class);

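    /**
     * Generates highlighted fragments for each of the given fields of the lucene document.
     * Fields without values are skipped and do not appear in the returned map.
     *
     * @param analyzer - analyzer used for both indexing and searching
     * @param query - query object created from user's input
     * @param fieldNames - names of the fields in the lucene doc containing the text to be fragmented
     * @param doc - the lucene document holding the field contents
     * @param fragmentNumber - max number of sentence fragments to return per field
     * @param fragmentSize - the max number of characters for each fragment
     * @return a map of field names to the corresponding highlighted fragments
     */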
    public Map<String,String[]> getFragmentsWithHighlightedTerms(Analyzer analyzer, Query query, String[] fieldNames, Document doc, int fragmentNumber, int fragmentSize){

        Map<String,String[]> fieldHighlightMap = new HashMap<String, String[]>();
        String[] values;
        String fieldContents;
        String[] fragments;

        try {
            for(String fieldName : fieldNames){
                values = doc.getValues(fieldName);
                if(values.length == 0){
                    continue;
                }
                // multi-valued fields are joined into a single string before fragmenting
                fieldContents = StringUtils.join(values, ' ');
                fragments = getFragmentsWithHighlightedTerms(analyzer, query, fieldName, fieldContents, fragmentNumber, fragmentSize);
                fieldHighlightMap.put(fieldName, fragments);
            }
        } catch (CorruptIndexException e) {
            logger.error("Error on retrieving highlighted fragments", e);
        } catch (IOException e) {
            logger.error("Error on retrieving highlighted fragments", e);
        }

        return fieldHighlightMap;
    }

    /**
     * Generates contextual fragments. Assumes that term vectors are not stored in the index.
     *
     * @param analyzer - analyzer used for both indexing and searching
     * @param query - query object created from user's input
     * @param fieldName - name of the field in the lucene doc containing the text to be fragmented
     * @param fieldContents - contents of fieldName
     * @param fragmentNumber - max number of sentence fragments to return
     * @param fragmentSize - the max number of characters for each fragment
     * @return the highlighted text fragments for the given field
     * @throws IOException
     */
    public String[] getFragmentsWithHighlightedTerms(Analyzer analyzer, Query query,
            String fieldName, String fieldContents, int fragmentNumber, int fragmentSize) throws IOException {

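        // no term vectors are assumed to be stored for this field, so the field contents
        // are re-analyzed with the supplied analyzer to obtain the token stream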
        TokenStream stream = TokenSources.getTokenStream(fieldName, fieldContents, analyzer);
        String[] fragments = getFragmentsWithHighlightedTerms(stream, query, fieldName, fieldContents, fragmentNumber, fragmentSize);

        return fragments;
    }

//    /**
//     * Generates contextual fragments.
//     *
//     * @param termPosVector - Term Position Vector for fieldName
//     * @param query - query object created from user's input
//     * @param fieldName - name of the field containing the text to be fragmented
//     * @param fieldContents - contents of fieldName
//     * @param fragmentNumber - max number of sentence fragments to return
//     * @param fragmentSize - the max number of characters for each fragment
//     * @return
//     * @throws IOException
//     */
//    public String[] getFragmentsWithHighlightedTerms(TermPositionVector termPosVector, Query query,
//            String fieldName, String fieldContents, int fragmentNumber, int fragmentSize) throws IOException {
//
//        TokenStream stream = TokenSources.getTokenStream(termPosVector);
//        String[] fragments = getFragmentsWithHighlightedTerms(stream, query, fieldName, fieldContents, fragmentNumber, fragmentSize);
//
//        return fragments;
//    }
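    // Note: the commented-out variant above reads its tokens from a stored term vector
    // (TermPositionVector) instead of re-analyzing the field contents.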

    /**
     * Generates the highlighted fragments from the given token stream.
     *
     * @param stream - token stream created from fieldContents
     * @param query - query object created from user's input
     * @param fieldName - name of the field containing the text to be fragmented
     * @param fieldContents - contents of fieldName
     * @param fragmentNumber - max number of sentence fragments to return
     * @param fragmentSize - the max number of characters for each fragment
     * @return the highlighted text fragments
     * @throws IOException
     */
    private String[] getFragmentsWithHighlightedTerms(TokenStream stream, Query query, String fieldName, String fieldContents, int fragmentNumber,
            int fragmentSize) throws IOException {

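        // the QueryScorer ranks fragments by how well they match the query terms for the given field;
        // the SimpleFragmenter breaks the text into fragments of roughly fragmentSize characters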
        Scorer scorer = new QueryScorer(query, fieldName);
        Fragmenter fragmenter = new SimpleFragmenter(fragmentSize);
        Highlighter highlighter = new Highlighter(scorer);

        highlighter.setTextFragmenter(fragmenter);
        highlighter.setMaxDocCharsToAnalyze(Integer.MAX_VALUE);

        String[] fragments = null;
        try {
            fragments = highlighter.getBestFragments(stream, fieldContents, fragmentNumber);
        } catch (InvalidTokenOffsetsException e) {
            // should never happen
            logger.error("InvalidTokenOffsetsException", e);
        }
        return fragments;
    }
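
    /*
     * Usage sketch (illustrative only; the analyzer, query, doc and field name below are
     * hypothetical placeholders): given a query and a lucene document, e.g. obtained from
     * an IndexSearcher hit, and the analyzer that was used at index time, the highlighted
     * fragments per field could be retrieved like this:
     *
     *   SearchResultHighligther highligther = new SearchResultHighligther();
     *   Map<String, String[]> fragmentsPerField = highligther.getFragmentsWithHighlightedTerms(
     *           analyzer, query, new String[]{"titleCache"}, doc, 3, 100);
     *   // fragmentsPerField.get("titleCache") now holds up to 3 fragments of max. 100 characters
     */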

}