cdmlib-services/src/main/java/eu/etaxonomy/cdm/api/service/NaturalLanguageGenerator.java

   1 package eu.etaxonomy.cdm.api.service;
   2
   3 import java.util.ArrayList;
   4 import java.util.HashSet;
   5 import java.util.Iterator;
   6 import java.util.List;
   7 import java.util.Map;
   8 import java.util.Set;
   9
  10 import org.apache.commons.lang.StringUtils;
  11 import org.apache.log4j.Logger;
  12 import org.springframework.stereotype.Component;
  13
  14 import eu.etaxonomy.cdm.model.description.CategoricalData;
  15 import eu.etaxonomy.cdm.model.description.DescriptionElementBase;
  16 import eu.etaxonomy.cdm.model.description.Feature;
  17 import eu.etaxonomy.cdm.model.description.FeatureNode;
  18 import eu.etaxonomy.cdm.model.description.FeatureTree;
  19 import eu.etaxonomy.cdm.model.description.QuantitativeData;
  20 import eu.etaxonomy.cdm.model.description.TaxonDescription;
  21 import eu.etaxonomy.cdm.model.description.TextData;
  22 import eu.etaxonomy.cdm.model.description.TextFormat;
  23 import eu.etaxonomy.cdm.model.common.Annotation;
  24 import eu.etaxonomy.cdm.model.common.AnnotationType;
  25 import eu.etaxonomy.cdm.model.common.Language;
  26
  27
  28 /**
  29  * Generator of natural language descriptions from TaxonDescriptions.
  30  *
  31  * @author m.venin
  32  * @created 13.04.2010
  33  * @version 1.0
  34  */
  35 @Component
  36 public class NaturalLanguageGenerator implements INaturalLanguageGenerator {
  37         @SuppressWarnings("unused")
  38         private static final Logger logger = Logger.getLogger(NaturalLanguageGenerator.class);
  39
  40         private String firstSeparator = ",";
  41         private String secondSeparator = ".";
  42         private List<Integer> levels = new ArrayList<Integer>();
  43
  44         private DescriptionBuilder<QuantitativeData> quantitativeDescriptionBuilder = new DefaultQuantitativeDescriptionBuilder();
  45         private DescriptionBuilder<CategoricalData> categoricalDescriptionBuilder = new DefaultCategoricalDescriptionBuilder();
  46
  47         private TextData previousTextData;
  48
  49         DeltaTextDataProcessor deltaTextDataProcessor = new DeltaTextDataProcessor();
  50
  51         private Map<String, INaturalLanguageTextDataProcessor> elementProcessors;
  52
  53         private Set<INaturalLanguageTextDataProcessor> applicableElementProcessors = new HashSet<INaturalLanguageTextDataProcessor>();
  54
  55         /**
  56          * Change the first separator used by generateSingleTextData. By default ",".
  57          *
  58          * @param separator
  59          */
  60         public void setFirstSeparator(String separator){
  61                 firstSeparator=separator;
  62         }
  63
  64         public String getFirstSeparator(){
  65                 return firstSeparator;
  66         }
  67
  68         /**
  69          * Change the second separator used by generateSingleTextData. By default ".".
  70          *
  71          * @param separator
  72          */
  73         public void setSecondSeparator(String separator){
  74                 secondSeparator=separator;
  75         }
  76
  77         public String getSecondSeparator(){
  78                 return secondSeparator;
  79         }
  80
  81         /**
  82          * @param quantitativeDescriptionBuilder
  83          */
  84         public void setQuantitativeDescriptionBuilder(DescriptionBuilder<QuantitativeData> quantitativeDescriptionBuilder){
  85                 this.quantitativeDescriptionBuilder = quantitativeDescriptionBuilder;
  86         }
  87
  88         /**
  89          * @param categoricalDescriptionBuilder
  90          */
  91         public void setCategoricalDescriptionBuilder(DescriptionBuilder<CategoricalData> categoricalDescriptionBuilder){
  92                 this.categoricalDescriptionBuilder = categoricalDescriptionBuilder;
  93         }
  94
  95         /**
  96          * @return the element processors of this generator
  97          */
  98         public Map<String, INaturalLanguageTextDataProcessor> getElementProcessors() {
  99                 return elementProcessors;
 100         }
 101
 102         /**
 103          * The keys of the elementProcessors map are regular expressions which are
 104          * being used to identify the those Descriptions to which the mapped
 105          * NaturalLanguageTextDataProcessor is applicable.
 106          *
 107          * @param elementProcessors
 108          */
 109         public void setElementProcessors(
 110                         Map<String, INaturalLanguageTextDataProcessor> elementProcessors) {
 111                 this.elementProcessors = elementProcessors;
 112         }
 113
 114         /**
 115          * Looks for technical annotations, if one matches a regular expression of the element processors
 116          * the associated processor is added to the applicable element processors which will then be applied
 117          * when generating the description.
 118          *
 119          * @param annotations the set of annotations of the description
 120          */
 121         private void initNaturalLanguageDescriptionElementProcessors(Set<Annotation> annotations) {
 122
 123                 if(annotations != null){
 124                         for(Annotation annotation : annotations){
 125                                 if(annotation.getAnnotationType().equals(AnnotationType.TECHNICAL())){
 126                                         if (elementProcessors!=null){
 127                                                 for(String regex : elementProcessors.keySet()){
 128                                                         if(annotation.getText().matches(regex)){
 129                                                                 applicableElementProcessors.add(elementProcessors.get(regex));
 130                                                         }
 131                                                 }
 132                                         }
 133                                 }
 134                         }
 135                 }
 136         }
 137
 138
 139         /**
 140          * Applies the list of applicable processors to a TextData.
 141          *
 142          * @param textData the TextData to be modified
 143          * @param previousTextData the TextData corresponding to the feature of the previous level in the tree
 144          */
 145         private void applyNaturalLanguageDescriptionElementProcessors(TextData textData, TextData previousTextData){
 146                 for(INaturalLanguageTextDataProcessor processor : applicableElementProcessors){
 147                         processor.process(textData, previousTextData);
 148                 }
 149         }
 150
 151
 152         /**
 153          * The most simple function to generate a description. The language used is the default one.
 154          *
 155          * @param featureTree the FeatureTree holding the order in which features and their states must be printed
 156          * @param description the TaxonDescription with all the data
 157          *
 158          * @return a list of TextData, each one being a basic element of the natural language description
 159          */
 160         public List<TextData> generateNaturalLanguageDescription(FeatureTree featureTree,TaxonDescription description) {
 161                 return generateNaturalLanguageDescription(featureTree,description,Language.DEFAULT());
 162         }
 163
 164
 165
 166         /**
 167          * Generate a description in a specified language.
 168          *
 169          * @param featureTree the FeatureTree holding the order in which features and their states must be printed
 170          * @param description the TaxonDescription with all the data
 171          * @param language the language in which the description has to be printed
 172          *
 173          * @return a list of TextData, each one being a basic element of the natural language description
 174          */
 175         public List<TextData> generateNaturalLanguageDescription(FeatureTree featureTree, TaxonDescription description, Language language) {
 176                 List<Language> languages = new ArrayList<Language>();
 177                 languages.add(language);
 178                 initNaturalLanguageDescriptionElementProcessors(description.getAnnotations());
 179                 return generatePreferredNaturalLanguageDescription(featureTree,description,languages);
 180         }
 181
 182         /**
 183          * Generate a description with a specified list of preferred languages.
 184          *
 185          * @param featureTree the FeatureTree holding the order in which features and their states must be printed
 186          * @param description the TaxonDescription with all the data
 187          * @param languages the ordered list of languages preferred for printing the description
 188          *
 189          * @return a list of TextData, each one being a basic element of the natural language description
 190          */
 191         public List<TextData> generatePreferredNaturalLanguageDescription(FeatureTree featureTree,TaxonDescription description, List<Language> languages) {
 192                 initNaturalLanguageDescriptionElementProcessors(description.getAnnotations());
 193                 return buildBranchesDescr(featureTree.getRootChildren(), featureTree.getRoot(), description, languages,0);
 194         }
 195
 196         /**
 197          * Generate a description as a single paragraph in a TextData.
 198          *
 199          * @param featureTree the FeatureTree holding the order in which features and their states must be printed
 200          * @param description the TaxonDescription with all the data
 201          *
 202          * @return a TextData in the default language.
 203          */
 204         public TextData generateSingleTextData(FeatureTree featureTree, TaxonDescription description) {
 205                 return generateSingleTextData(featureTree,description,Language.DEFAULT());
 206         }
 207
 208         /**
 209          * Generate a description as a single paragraph in a TextData.
 210          *
 211          * @param featureTree the FeatureTree holding the order in which features and their states must be printed
 212          * @param description the TaxonDescription with all the data
 213          * @param language the language in which the description has to be printed
 214          *
 215          * @return a TextData in the specified language.
 216          */
 217         public TextData generateSingleTextData(FeatureTree featureTree, TaxonDescription description, Language language) {
 218                 List<Language> languages = new ArrayList<Language>();
 219                 languages.add(language);
 220                 return generatePreferredSingleTextData(featureTree,description,languages);
 221         }
 222
 223         /**
 224          * Generate a description with a specified list of preferred languages.
 225          *
 226          * @param featureTree the FeatureTree holding the order in which features and their states must be printed
 227          * @param description the TaxonDescription with all the data
 228          * @param languages the ordered list of languages preferred for printing the description
 229          *
 230          * @return a TextData using the languages (in the given order of preference)
 231          */
 232         public TextData generatePreferredSingleTextData(FeatureTree featureTree, TaxonDescription description, List<Language> languages) {
 233                 levels.clear(); // before the start, the table containing the levels of each node must be cleared
 234                 // Note: this is not the most efficient way to keep track of the levels of the nodes but it allows some flexibility
 235                 List<TextData> texts = generatePreferredNaturalLanguageDescription(featureTree,description, languages);// first get the description as a raw list of TextData
 236
 237                 StringBuilder descriptionStringBuilder = new StringBuilder(); // the StringBuilder used to generate the description
 238                 int i = 0,j,level; // i is used to store the index of the TextData to use
 239                 boolean startSentence = false, firstOne = true;
 240
 241                 for (j=0 ; j<levels.size() ; j++){
 242                         level = levels.get(j);
 243                         if (level==-1){
 244                                 if ((j+1)<levels.size() && levels.get(j+1).equals(0)){ // if this node is the direct father of a leaf
 245                                         descriptionStringBuilder.append(secondSeparator + " ");
 246                                         startSentence=true;
 247                                         firstOne=false;
 248                                         String asString = texts.get(i).getText(Language.DEFAULT()).toString();
 249                                         if (asString.length()>1) descriptionStringBuilder.append(asString.substring(0,1).toUpperCase() + asString.substring(1));
 250                                 }
 251                                 i++;
 252                         }
 253                         else if (level==0) { // if this node is a leaf
 254                                 if (startSentence) descriptionStringBuilder.append(texts.get(i).getText(Language.DEFAULT()));
 255                                 else descriptionStringBuilder.append(firstSeparator + texts.get(i).getText(Language.DEFAULT()));
 256                                 startSentence=false;
 257                                 i++;
 258                         }
 259                         else {
 260                                 if (!firstOne && levels.get(j-1).equals(0)){ // if this node corresponds to the states linked to the previous leaf
 261                                         if (i<texts.size()) descriptionStringBuilder.append(texts.get(i).getText(Language.DEFAULT()));
 262                                         i++;
 263                                 }
 264                         }
 265                 }
 266                 descriptionStringBuilder.append(secondSeparator);
 267                 String returnString = descriptionStringBuilder.toString();
 268                 returnString = StringUtils.replace(returnString, "  ", " ");
 269                 returnString = StringUtils.removeStart(returnString, secondSeparator + " ");
 270                 return TextData.NewInstance(returnString,Language.DEFAULT(),TextFormat.NewInstance("", "Text", ""));
 271         }
 272
 273
 274
 275         /** recursive function that goes through a tree containing the order in which the description has to be generated,
 276          *  if an element of this tree matches one of the TaxonDescription, a DescriptionBuilder is called which returns a TextData with the corresponding description.
 277          *
 278          * @param children the children of the feature node considered
 279          * @param parent the feature node considered
 280          * @param description the TaxonDescription element for which we want a natural language output
 281          * @param language The language in which the description has to be written
 282          * @param floor integer to keep track of the level in the tree
 283          * @return a list of TextData elements containing the part of description corresponding to the feature node considered
 284          */
 285         private List<TextData> buildBranchesDescr(List<FeatureNode> children, FeatureNode parent, TaxonDescription description, List<Language> languages, int floor) {
 286                 List<TextData> listTextData = new ArrayList<TextData>();
 287                 floor++; // counter to know the current level in the tree
 288
 289                 if (!parent.isLeaf()){ // if this node is not a leaf, continue recursively (only the leaves of a FeatureTree contain states)
 290                         levels.add(new Integer(floor)); // the level of the different nodes in the tree are kept, thus it is easier to build a structured text out of the List<TextData>
 291                         Feature feature = parent.getFeature();
 292                         TextData featureName;
 293                         if (feature!=null && feature.getLabel()!=null){ // if a node is associated to a feature
 294                                 featureName = categoricalDescriptionBuilder.buildTextDataFeature(feature, languages);
 295                                 levels.add(new Integer(-1)); // it is indicated by a '-1' after its level
 296                                 listTextData.add(featureName); // the TextData representing the name of the feature is concatenated to the list
 297                         }
 298                         else featureName = new TextData(); // else an empty TextData is created (because we keep track of the features, it is useful to inform when the upper node has no feature attached)
 299
 300                         for (Iterator<FeatureNode> ifn = children.iterator() ; ifn.hasNext() ;){
 301                                 previousTextData = featureName; // this allows to keep track of the name of the feature one level up in the tree
 302                                 FeatureNode fn = ifn.next();
 303                                 listTextData.addAll(buildBranchesDescr(fn.getChildNodes(),fn,description, languages, floor));
 304                         }
 305                 }
 306                 else { //once a leaf is reached
 307                         Feature feature = parent.getFeature();
 308                         if (feature!=null && (feature.isSupportsQuantitativeData() || feature.isSupportsCategoricalData())) {
 309                                 Set<DescriptionElementBase> elements = description.getElements();
 310                                 for (Iterator<DescriptionElementBase> deb = elements.iterator() ; deb.hasNext() ;){ // iterates over all the descriptions enclosed in the TaxonDescription
 311                                         DescriptionElementBase descriptionElement = deb.next();
 312                                         if (descriptionElement.getFeature().equals(feature)){ // if one matches the corresponding feature associated to this leaf
 313                                                 if (descriptionElement instanceof CategoricalData || descriptionElement instanceof QuantitativeData){
 314                                                         TextData featureTextData;
 315                                                         TextData statesTextData;
 316                                                         if (descriptionElement instanceof CategoricalData) { // if this description is a CategoricalData, generate the according TextData
 317                                                                 CategoricalData categoricalData = (CategoricalData) descriptionElement;
 318                                                                 statesTextData = categoricalDescriptionBuilder.build(categoricalData, languages);
 319                                                                 featureTextData = categoricalDescriptionBuilder.buildTextDataFeature(feature, languages);
 320                                                         }
 321                                                         else { // if this description is a QuantitativeData, generate the according TextData
 322                                                                 QuantitativeData quantitativeData = (QuantitativeData) descriptionElement;
 323                                                                 statesTextData = quantitativeDescriptionBuilder.build(quantitativeData, languages);
 324                                                                 featureTextData = quantitativeDescriptionBuilder.buildTextDataFeature(feature, languages);
 325                                                         }
 326                                                         applyNaturalLanguageDescriptionElementProcessors(featureTextData, previousTextData);
 327                                                         levels.add(new Integer(0)); // 0 indicates a feature, which is a leaf of the tree
 328                                                         listTextData.add(featureTextData);
 329                                                         levels.add(new Integer(floor)); // this represents the level of the feature and means it is followed by a TextData containing the states of the feature
 330                                                         listTextData.add(statesTextData);
 331                                                 }
 332                                         }
 333                                 }
 334                         }
 335                 }
 336                 return listTextData;
 337         }
 338
 339 }