cdmlib-services/src/main/java/eu/etaxonomy/cdm/api/service/NaturalLanguageGenerator.java

   1 package eu.etaxonomy.cdm.api.service;
   2
   3 import java.util.ArrayList;
   4 import java.util.HashSet;
   5 import java.util.Iterator;
   6 import java.util.List;
   7 import java.util.Map;
   8 import java.util.Set;
   9
  10 import org.apache.commons.lang.StringUtils;
  11 import org.springframework.stereotype.Component;
  12
  13 import eu.etaxonomy.cdm.model.description.CategoricalData;
  14 import eu.etaxonomy.cdm.model.description.DescriptionElementBase;
  15 import eu.etaxonomy.cdm.model.description.Feature;
  16 import eu.etaxonomy.cdm.model.description.FeatureNode;
  17 import eu.etaxonomy.cdm.model.description.FeatureTree;
  18 import eu.etaxonomy.cdm.model.description.QuantitativeData;
  19 import eu.etaxonomy.cdm.model.description.TaxonDescription;
  20 import eu.etaxonomy.cdm.model.description.TextData;
  21 import eu.etaxonomy.cdm.model.description.TextFormat;
  22 import eu.etaxonomy.cdm.model.common.Annotation;
  23 import eu.etaxonomy.cdm.model.common.AnnotationType;
  24 import eu.etaxonomy.cdm.model.common.Language;
  25
  26
  27 /**
  28  * Generator of natural language descriptions from TaxonDescriptions.
  29  *
  30  * @author m.venin
  31  * @created 13.04.2010
  32  * @version 1.0
  33  */
  34 @Component
  35 public class NaturalLanguageGenerator implements INaturalLanguageGenerator {
  36
  37         private String firstSeparator = ",";
  38         private String secondSeparator = ".";
  39         private List<Integer> levels = new ArrayList<Integer>();
  40
  41         private DescriptionBuilder<QuantitativeData> quantitativeDescriptionBuilder = new DefaultQuantitativeDescriptionBuilder();
  42         private DescriptionBuilder<CategoricalData> categoricalDescriptionBuilder = new DefaultCategoricalDescriptionBuilder();
  43
  44         private TextData previousTextData;
  45
  46         private Map<String, INaturalLanguageTextDataProcessor> elementProcessors;
  47
  48         private Set<INaturalLanguageTextDataProcessor> applicableElementProcessors = new HashSet<INaturalLanguageTextDataProcessor>();
  49
  50         /**
  51          * Change the first separator used by generateSingleTextData. By default ",".
  52          *
  53          * @param separator
  54          */
  55         public void setFirstSeparator(String separator){
  56                 firstSeparator=separator;
  57         }
  58
  59         public String getFirstSeparator(){
  60                 return firstSeparator;
  61         }
  62
  63         /**
  64          * Change the second separator used by generateSingleTextData. By default ".".
  65          *
  66          * @param separator
  67          */
  68         public void setSecondSeparator(String separator){
  69                 secondSeparator=separator;
  70         }
  71
  72         public String getSecondSeparator(){
  73                 return secondSeparator;
  74         }
  75
  76         /**
  77          * @param quantitativeDescriptionBuilder
  78          */
  79         public void setQuantitativeDescriptionBuilder(DescriptionBuilder<QuantitativeData> quantitativeDescriptionBuilder){
  80                 this.quantitativeDescriptionBuilder = quantitativeDescriptionBuilder;
  81         }
  82
  83         /**
  84          * @param categoricalDescriptionBuilder
  85          */
  86         public void setCategoricalDescriptionBuilder(DescriptionBuilder<CategoricalData> categoricalDescriptionBuilder){
  87                 this.categoricalDescriptionBuilder = categoricalDescriptionBuilder;
  88         }
  89
  90         /**
  91          * @return the element processors of this generator
  92          */
  93         public Map<String, INaturalLanguageTextDataProcessor> getElementProcessors() {
  94                 return elementProcessors;
  95         }
  96
  97         /**
  98          * The keys of the elementProcessors map are regular expressions which are
  99          * being used to identify the those Descriptions to which the mapped
 100          * NaturalLanguageTextDataProcessor is applicable.
 101          *
 102          * @param elementProcessors
 103          */
 104         public void setElementProcessors(
 105                         Map<String, INaturalLanguageTextDataProcessor> elementProcessors) {
 106                 this.elementProcessors = elementProcessors;
 107         }
 108
 109         /**
 110          * Looks for technical annotations, if one matches a regular expression of the element processors
 111          * the associated processor is added to the applicable element processors which will then be applied
 112          * when generating the description.
 113          *
 114          * @param annotations the set of annotations of the description
 115          */
 116         private void initNaturalLanguageDescriptionElementProcessors(Set<Annotation> annotations) {
 117
 118                 if(annotations != null){
 119                         for(Annotation annotation : annotations){
 120                                 if(annotation.getAnnotationType().equals(AnnotationType.TECHNICAL())){
 121                                         if (elementProcessors!=null){
 122                                         for(String regex : elementProcessors.keySet()){
 123                                                 if(annotation.getText().matches(regex)){
 124                                                         applicableElementProcessors.add(elementProcessors.get(regex));
 125                                                 }
 126                                         }
 127                                 }
 128                                         }
 129                         }
 130                 }
 131         }
 132
 133
 134         /**
 135          * Applies the list of applicable processors to a TextData.
 136          *
 137          * @param textData the TextData to be modified
 138          * @param previousTextData the TextData corresponding to the feature of the previous level in the tree
 139          */
 140         private void applyNaturalLanguageDescriptionElementProcessors(TextData textData, TextData previousTextData){
 141                 for(INaturalLanguageTextDataProcessor processor : applicableElementProcessors){
 142                         processor.process(textData, previousTextData);
 143                 }
 144         }
 145
 146
 147         /**
 148          * The most simple function to generate a description. The language used is the default one.
 149          *
 150          * @param featureTree the FeatureTree holding the order in which features and their states must be printed
 151          * @param description the TaxonDescription with all the data
 152          *
 153          * @return a list of TextData, each one being a basic element of the natural language description
 154          */
 155         public List<TextData> generateNaturalLanguageDescription(FeatureTree featureTree,TaxonDescription description) {
 156                 return generateNaturalLanguageDescription(featureTree,description,Language.DEFAULT());
 157         }
 158
 159
 160
 161         /**
 162          * Generate a description in a specified language.
 163          *
 164          * @param featureTree the FeatureTree holding the order in which features and their states must be printed
 165          * @param description the TaxonDescription with all the data
 166          * @param language the language in which the description has to be printed
 167          *
 168          * @return a list of TextData, each one being a basic element of the natural language description
 169          */
 170         public List<TextData> generateNaturalLanguageDescription(FeatureTree featureTree, TaxonDescription description, Language language) {
 171                 List<Language> languages = new ArrayList<Language>();
 172                 languages.add(language);
 173                 initNaturalLanguageDescriptionElementProcessors(description.getAnnotations());
 174                 return generatePreferredNaturalLanguageDescription(featureTree,description,languages);
 175         }
 176
 177         /**
 178          * Generate a description with a specified list of preferred languages.
 179          *
 180          * @param featureTree the FeatureTree holding the order in which features and their states must be printed
 181          * @param description the TaxonDescription with all the data
 182          * @param languages the ordered list of languages preferred for printing the description
 183          *
 184          * @return a list of TextData, each one being a basic element of the natural language description
 185          */
 186         public List<TextData> generatePreferredNaturalLanguageDescription(FeatureTree featureTree,TaxonDescription description, List<Language> languages) {
 187                 initNaturalLanguageDescriptionElementProcessors(description.getAnnotations());
 188                 return buildBranchesDescr(featureTree.getRootChildren(), featureTree.getRoot(), description, languages,0);
 189         }
 190
 191         /**
 192          * Generate a description as a single paragraph in a TextData.
 193          *
 194          * @param featureTree the FeatureTree holding the order in which features and their states must be printed
 195          * @param description the TaxonDescription with all the data
 196          *
 197          * @return a TextData in the default language.
 198          */
 199         public TextData generateSingleTextData(FeatureTree featureTree, TaxonDescription description) {
 200                 return generateSingleTextData(featureTree,description,Language.DEFAULT());
 201         }
 202
 203         /**
 204          * Generate a description as a single paragraph in a TextData.
 205          *
 206          * @param featureTree the FeatureTree holding the order in which features and their states must be printed
 207          * @param description the TaxonDescription with all the data
 208          * @param language the language in which the description has to be printed
 209          *
 210          * @return a TextData in the specified language.
 211          */
 212         public TextData generateSingleTextData(FeatureTree featureTree, TaxonDescription description, Language language) {
 213                 List<Language> languages = new ArrayList<Language>();
 214                 languages.add(language);
 215                 return generatePreferredSingleTextData(featureTree,description,languages);
 216         }
 217
 218         /**
 219          * Generate a description with a specified list of preferred languages.
 220          *
 221          * @param featureTree the FeatureTree holding the order in which features and their states must be printed
 222          * @param description the TaxonDescription with all the data
 223          * @param languages the ordered list of languages preferred for printing the description
 224          *
 225          * @return a TextData using the languages (in the given order of preference)
 226          */
 227         public TextData generatePreferredSingleTextData(FeatureTree featureTree, TaxonDescription description, List<Language> languages) {
 228
 229                 initNaturalLanguageDescriptionElementProcessors(description.getAnnotations());
 230
 231                 List<TextData> texts = buildBranchesDescr(featureTree.getRootChildren(), featureTree.getRoot(), description, languages,0);
 232                 StringBuilder sb = new StringBuilder();
 233                 int i = 0,j;
 234                 boolean startSentence = false, firstOne = true;
 235                 Integer level;
 236                 levels.clear();
 237                 for (j=0 ; j<levels.size() ; j++){
 238                         level = levels.get(j);
 239                         if (level.equals(-1)){
 240                                 if ((j+1)<levels.size() && levels.get(j+1).equals(0)){
 241                                 if (!firstOne) sb.append(secondSeparator + " ");
 242                                 startSentence=true;
 243                                 firstOne=false;
 244                                 String asString = texts.get(i).getText(Language.DEFAULT()).toString();
 245                                 if (asString.length()>1) sb.append(asString.substring(0,1).toUpperCase() + asString.substring(1));
 246                         }
 247                                 i++;
 248                         }
 249                         else if (level.equals(0)) {
 250                                 if (startSentence) sb.append(texts.get(i).getText(Language.DEFAULT()));
 251                                 else sb.append(firstSeparator + texts.get(i).getText(Language.DEFAULT()));
 252                                 startSentence=false;
 253                                 i++;
 254                         }
 255                         else if (!level.equals(0) && !level.equals(-1)){
 256                                 if (!firstOne && levels.get(j-1).equals(0)){
 257                                         if (i<texts.size()) sb.append(texts.get(i).getText(Language.DEFAULT()));
 258                                         i++;
 259                                 }
 260                         }
 261                 }
 262                 sb.append(secondSeparator);
 263                 String returnString = sb.toString();
 264                 returnString = StringUtils.replace(returnString, "  ", " ");
 265                 returnString = StringUtils.removeStart(returnString, secondSeparator);
 266                 return TextData.NewInstance(StringUtils.replace(sb.toString(), "  ", " "),Language.DEFAULT(),TextFormat.NewInstance("", "Text", ""));
 267         }
 268
 269
 270
 271         /** recursive function that goes through a tree containing the order in which the description has to be generated,
 272          *  if an element of this tree matches one of the TaxonDescription, a DescriptionBuilder is called which returns a TextData with the corresponding description.
 273          *
 274          * @param children the children of the feature node considered
 275          * @param parent the feature node considered
 276          * @param description the TaxonDescription element for which we want a natural language output
 277          * @param language The language in which the description has to be written
 278          * @return a list of TextData elements containing the part of description corresponding to the feature node considered
 279          */
 280         private List<TextData> buildBranchesDescr(List<FeatureNode> children, FeatureNode parent, TaxonDescription description, List<Language> languages, int floor) {
 281                 List<TextData> listTextData = new ArrayList<TextData>();
 282                 floor++; // counter to know the current level in the tree
 283
 284                 if (!parent.isLeaf()){ // if this node is not a leaf, continue recursively (only the leaves of a FeatureTree contain states)
 285                         levels.add(new Integer(floor)); // the level of the different nodes in the tree are kept, thus it is easier to build a structured text out of the List<TextData>
 286                         Feature feature = parent.getFeature();
 287                         TextData featureName;
 288                         if (feature!=null && feature.getLabel()!=null){ // if a node is associated to a feature
 289                                 featureName = categoricalDescriptionBuilder.buildTextDataFeature(feature, languages);
 290                                 levels.add(new Integer(-1)); // it is indicated by a '-1' after its level
 291                                 listTextData.add(featureName); // the TextData representing the name of the feature is concatenated to the list
 292                         }
 293                         else featureName = new TextData(); // else an empty TextData is created (because we keep track of the features, it is useful to inform when the upper node has no feature attached)
 294
 295                         for (Iterator<FeatureNode> ifn = children.iterator() ; ifn.hasNext() ;){
 296                                 previousTextData = featureName; // this allows to keep track of the name of the feature one level up in the tree
 297                                 FeatureNode fn = ifn.next();
 298                                 listTextData.addAll(buildBranchesDescr(fn.getChildren(),fn,description, languages, floor));
 299                         }
 300                 }
 301                 else { //once a leaf is reached
 302                         Feature feature = parent.getFeature();
 303                         if (feature!=null && (feature.isSupportsQuantitativeData() || feature.isSupportsCategoricalData())) {
 304                                         Set<DescriptionElementBase> elements = description.getElements();
 305                                         for (Iterator<DescriptionElementBase> deb = elements.iterator() ; deb.hasNext() ;){ // iterates over all the descriptions enclosed in the TaxonDescription
 306                                                 DescriptionElementBase descriptionElement = deb.next();
 307                                                 if (descriptionElement.getFeature().equals(feature)){ // if one matches the corresponding feature associated to this leaf
 308                                                         if (descriptionElement instanceof CategoricalData || descriptionElement instanceof QuantitativeData){
 309                                                                 TextData featureTextData;
 310                                                                 TextData statesTextData;
 311                                                         if (descriptionElement instanceof CategoricalData) { // if this description is a CategoricalData, generate the according TextData
 312                                                                 CategoricalData categoricalData = (CategoricalData) descriptionElement;
 313                                                                 statesTextData = categoricalDescriptionBuilder.build(categoricalData, languages);
 314                                                                 featureTextData = categoricalDescriptionBuilder.buildTextDataFeature(feature, languages);
 315                                                         }
 316                                                         else { // if this description is a QuantitativeData, generate the according TextData
 317                                                                 QuantitativeData quantitativeData = (QuantitativeData) descriptionElement;
 318                                                                 statesTextData = quantitativeDescriptionBuilder.build(quantitativeData, languages);
 319                                                                 featureTextData = quantitativeDescriptionBuilder.buildTextDataFeature(feature, languages);
 320                                                         }
 321                                                         applyNaturalLanguageDescriptionElementProcessors(featureTextData, previousTextData);
 322                                                         levels.add(new Integer(0)); // 0 indicates a feature, which is a leaf of the tree
 323                                                         listTextData.add(featureTextData);
 324                                                         levels.add(new Integer(floor)); // this represents the level of the feature and means it is followed by a TextData containing the states of the feature
 325                                                         listTextData.add(statesTextData);
 326                                                         }
 327                                                 }
 328                                         }
 329                         }
 330                 }
 331                 return listTextData;
 332         }
 333
 334 }