taking over class from IdentificationKeyGenerator2:

[cdmlib.git] / cdmlib-services / src / main / java / eu / etaxonomy / cdm / api / service / NaturalLanguageGenerator.java
diff --git a/cdmlib-services/src/main/java/eu/etaxonomy/cdm/api/service/NaturalLanguageGenerator.java b/cdmlib-services/src/main/java/eu/etaxonomy/cdm/api/service/NaturalLanguageGenerator.java

index 3aa30dc5acc28bf6eab96479342324bf4859b695..38b99d1346ad68a00e0d9924353cb4837ba40a49 100644 (file)
--- a/cdmlib-services/src/main/java/eu/etaxonomy/cdm/api/service/NaturalLanguageGenerator.java
+++ b/cdmlib-services/src/main/java/eu/etaxonomy/cdm/api/service/NaturalLanguageGenerator.java
@@ -1,8 +1,10 @@
  package eu.etaxonomy.cdm.api.service;
  
  import java.util.ArrayList;
+import java.util.HashSet;
  import java.util.Iterator;
  import java.util.List;
+import java.util.Map;
  import java.util.Set;
  
  import org.apache.commons.lang.StringUtils;
@@ -16,188 +18,317 @@ import eu.etaxonomy.cdm.model.description.FeatureTree;
  import eu.etaxonomy.cdm.model.description.QuantitativeData;
  import eu.etaxonomy.cdm.model.description.TaxonDescription;
  import eu.etaxonomy.cdm.model.description.TextData;
+import eu.etaxonomy.cdm.model.description.TextFormat;
+import eu.etaxonomy.cdm.model.common.Annotation;
+import eu.etaxonomy.cdm.model.common.AnnotationType;
  import eu.etaxonomy.cdm.model.common.Language;
  
+
+/**
+ * Generator of natural language descriptions from TaxonDescriptions.
+ * 
+ * @author m.venin
+ * @created 13.04.2010
+ * @version 1.0
+ */
  @Component
  public class NaturalLanguageGenerator implements INaturalLanguageGenerator {
  
+       private String firstSeparator = ",";
+       private String secondSeparator = ".";
+       private List<Integer> levels = new ArrayList<Integer>();
+
         private DescriptionBuilder<QuantitativeData> quantitativeDescriptionBuilder = new DefaultQuantitativeDescriptionBuilder();
         private DescriptionBuilder<CategoricalData> categoricalDescriptionBuilder = new DefaultCategoricalDescriptionBuilder();
-       
-       private String previousFeatureName;
-       
+
+       private TextData previousTextData;
+
+       private Map<String, INaturalLanguageTextDataProcessor> elementProcessors;
+
+       private Set<INaturalLanguageTextDataProcessor> applicableElementProcessors = new HashSet<INaturalLanguageTextDataProcessor>();
+
         /**
+        * Change the first separator used by generateSingleTextData. By default ",".
          * 
+        * @param separator
          */
-       public List<TextData> generateNaturalLanguageDescription(FeatureTree featureTree,TaxonDescription description) {
-               List<Language> languages = new ArrayList<Language>();
-               languages.add(Language.DEFAULT());
-               return buildBranchesDescr(featureTree.getRootChildren(), featureTree.getRoot(), description, languages);
+       public void setFirstSeparator(String separator){
+               firstSeparator=separator;
         }
-       
-       
+
+       public String getFirstSeparator(){
+               return firstSeparator;
+       }
+
         /**
+        * Change the second separator used by generateSingleTextData. By default ".".
          * 
+        * @param separator
          */
-       public List<TextData> generatePreferredNaturalLanguageDescription(FeatureTree featureTree,TaxonDescription description, List<Language> languages) {
-               return buildBranchesDescr(featureTree.getRootChildren(), featureTree.getRoot(), description, languages);
+       public void setSecondSeparator(String separator){
+               secondSeparator=separator;
         }
-       
-       
-       public List<TextData> generateNaturalLanguageDescription(FeatureTree featureTree, TaxonDescription description, Language language) {
-               List<Language> languages = new ArrayList<Language>();
-               languages.add(language);
-               return buildBranchesDescr(featureTree.getRootChildren(), featureTree.getRoot(), description, languages);
+
+       public String getSecondSeparator(){
+               return secondSeparator;
         }
-       
-       /** recursive function that goes through a tree containing the order in which the description has to be generated,
-        *  if an element of this tree matches one of the TaxonDescription, a DescriptionBuilder is called which returns a TextData with the corresponding description.
+
+       /**
+        * @param quantitativeDescriptionBuilder
+        */
+       public void setQuantitativeDescriptionBuilder(DescriptionBuilder<QuantitativeData> quantitativeDescriptionBuilder){
+               this.quantitativeDescriptionBuilder = quantitativeDescriptionBuilder;
+       }
+
+       /**
+        * @param categoricalDescriptionBuilder
+        */
+       public void setCategoricalDescriptionBuilder(DescriptionBuilder<CategoricalData> categoricalDescriptionBuilder){
+               this.categoricalDescriptionBuilder = categoricalDescriptionBuilder;
+       }
+
+       /**
+        * @return the element processors of this generator
+        */
+       public Map<String, INaturalLanguageTextDataProcessor> getElementProcessors() {
+               return elementProcessors;
+       }
+
+       /**
+        * The keys of the elementProcessors map are regular expressions which are
+        * being used to identify those Descriptions to which the mapped
+        * NaturalLanguageTextDataProcessor is applicable.
          * 
-        * @param children
-        * @param parent
-        * @param description
-        * @param language The language in which the description has to be written
-        * @return
+        * @param elementProcessors
          */
-       private List<TextData> buildBranchesDescr(List<FeatureNode> children, FeatureNode parent, TaxonDescription description, List<Language> languages) {
-               List<TextData> listTextData = new ArrayList<TextData>(); ;
-               if (!parent.isLeaf()){ // if this node is not a leaf, continue recursively (only the leaves of a FeatureTree contain states)
-                       Feature fref = parent.getFeature();
-                       for (Iterator<FeatureNode> ifn = children.iterator() ; ifn.hasNext() ;){
-                               FeatureNode fn = ifn.next();
-                               listTextData.addAll(buildBranchesDescr(fn.getChildren(),fn,description, languages));
-                       }
-               }
-               else { //once a leaf is reached
-                       Feature fref = parent.getFeature();
-                       if (fref!=null) { // needs a better algorithm
-                               int k=0;
-                                       Set<DescriptionElementBase> elements = description.getElements();
-                                       for (Iterator<DescriptionElementBase> deb = elements.iterator() ; deb.hasNext() ;){ // iterates over all the descriptions enclosed in the TaxonDescription
-                                               DescriptionElementBase descriptionElement = deb.next();
-                                               TextData textData;
-                                               if (descriptionElement.getFeature().equals(fref)){ // if one matches the corresponding feature associated to this leaf
-                                                       if (descriptionElement instanceof CategoricalData) { // if this description is a CategoricalData, generate the according TextData
-                                                               CategoricalData categoricalData = (CategoricalData) descriptionElement;
-                                                               //textData = buildCategoricalDescr(categoricalData, language);
-                                                               textData = categoricalDescriptionBuilder.build(categoricalData, languages);
-                                                               //textData.putText(fref.getLabel(), Language.DEFAULT());
-                                                               TextData featureName = TextData.NewInstance(fref.getLabel(), Language.DEFAULT(), null);
-                                                               listTextData.add(featureName); // if you want to print the name of the feature (Should it be an option ?)
-                                                               listTextData.add(textData);
-                                                       }
-                                                       if (descriptionElement instanceof QuantitativeData) { // if this description is a QuantitativeData, generate the according TextData
-                                                               QuantitativeData quantitativeData = (QuantitativeData) descriptionElement;
-                                                               textData = quantitativeDescriptionBuilder.build(quantitativeData, languages);
-                                                               TextData featureName = TextData.NewInstance(fref.getLabel(), Language.DEFAULT(), null);
-                                                               listTextData.add(featureName); // if you want to print the name of the feature
-                                                               listTextData.add(textData);
+       public void setElementProcessors(
+                       Map<String, INaturalLanguageTextDataProcessor> elementProcessors) {
+               this.elementProcessors = elementProcessors;
+       }
+
+       /**
+        * Looks for technical annotations; if one matches a regular expression of the element processors,
+        * the associated processor is added to the applicable element processors, which will then be applied
+        * when generating the description.
+        * 
+        * @param annotations the set of annotations of the description
+        */
+       private void initNaturalLanguageDescriptionElementProcessors(Set<Annotation> annotations) {
+
+               if(annotations != null){
+                       for(Annotation annotation : annotations){
+                               if(annotation.getAnnotationType().equals(AnnotationType.TECHNICAL())){
+                                       if (elementProcessors!=null){
+                                               for(String regex : elementProcessors.keySet()){
+                                                       if(annotation.getText().matches(regex)){
+                                                               applicableElementProcessors.add(elementProcessors.get(regex));
                                                         }
                                                 }
                                         }
+                               }
                         }
                 }
-               return listTextData;
         }
  
+
+       /**
+        * Applies the list of applicable processors to a TextData.
+        * 
+        * @param textData the TextData to be modified
+        * @param previousTextData the TextData corresponding to the feature of the previous level in the tree
+        */
+       private void applyNaturalLanguageDescriptionElementProcessors(TextData textData, TextData previousTextData){
+               for(INaturalLanguageTextDataProcessor processor : applicableElementProcessors){
+                       processor.process(textData, previousTextData);
+               }
+       }
+
+
         /**
+        * The simplest function to generate a description. The language used is the default one.
          * 
+        * @param featureTree the FeatureTree holding the order in which features and their states must be printed
+        * @param description the TaxonDescription with all the data
          * 
-        * @param quantitativeDescriptionBuilder
+        * @return a list of TextData, each one being a basic element of the natural language description
          */
-       public void setQuantitativeDescriptionBuilder(DescriptionBuilder<QuantitativeData> quantitativeDescriptionBuilder){
-               this.quantitativeDescriptionBuilder = quantitativeDescriptionBuilder;
+       public List<TextData> generateNaturalLanguageDescription(FeatureTree featureTree,TaxonDescription description) {
+               return generateNaturalLanguageDescription(featureTree,description,Language.DEFAULT());
         }
-       
+
+
+
         /**
+        * Generate a description in a specified language.
          * 
+        * @param featureTree the FeatureTree holding the order in which features and their states must be printed
+        * @param description the TaxonDescription with all the data
+        * @param language the language in which the description has to be printed
          * 
-        * @param categoricalDescriptionBuilder
+        * @return a list of TextData, each one being a basic element of the natural language description
          */
-       public void setCategoricalDescriptionBuilder(DescriptionBuilder<CategoricalData> categoricalDescriptionBuilder){
-               this.categoricalDescriptionBuilder = categoricalDescriptionBuilder;
+       public List<TextData> generateNaturalLanguageDescription(FeatureTree featureTree, TaxonDescription description, Language language) {
+               List<Language> languages = new ArrayList<Language>();
+               languages.add(language);
+               initNaturalLanguageDescriptionElementProcessors(description.getAnnotations());
+               return generatePreferredNaturalLanguageDescription(featureTree,description,languages);
+       }
+
+       /**
+        * Generate a description with a specified list of preferred languages.
+        * 
+        * @param featureTree the FeatureTree holding the order in which features and their states must be printed
+        * @param description the TaxonDescription with all the data
+        * @param languages the ordered list of languages preferred for printing the description
+        * 
+        * @return a list of TextData, each one being a basic element of the natural language description
+        */
+       public List<TextData> generatePreferredNaturalLanguageDescription(FeatureTree featureTree,TaxonDescription description, List<Language> languages) {
+               initNaturalLanguageDescriptionElementProcessors(description.getAnnotations());
+               return buildBranchesDescr(featureTree.getRootChildren(), featureTree.getRoot(), description, languages,0);
+       }
+
+       /**
+        * Generate a description as a single paragraph in a TextData.
+        * 
+        * @param featureTree the FeatureTree holding the order in which features and their states must be printed
+        * @param description the TaxonDescription with all the data
+        * 
+        * @return a TextData in the default language.
+        */
+       public TextData generateSingleTextData(FeatureTree featureTree, TaxonDescription description) {
+               return generateSingleTextData(featureTree,description,Language.DEFAULT());
         }
-       
-       
-       public String generateStringNaturalLanguageDescription(FeatureTree featureTree, TaxonDescription description,   Language language) {
+
+       /**
+        * Generate a description as a single paragraph in a TextData.
+        * 
+        * @param featureTree the FeatureTree holding the order in which features and their states must be printed
+        * @param description the TaxonDescription with all the data
+        * @param language the language in which the description has to be printed
+        * 
+        * @return a TextData in the specified language.
+        */
+       public TextData generateSingleTextData(FeatureTree featureTree, TaxonDescription description, Language language) {
                 List<Language> languages = new ArrayList<Language>();
                 languages.add(language);
-               return buildString(featureTree.getRootChildren(), featureTree.getRoot(), description, languages).toString();
+               return generatePreferredSingleTextData(featureTree,description,languages);
         }
-       
+
+       /**
+        * Generate a description as a single paragraph in a TextData, with a specified list of preferred languages.
+        * 
+        * @param featureTree the FeatureTree holding the order in which features and their states must be printed
+        * @param description the TaxonDescription with all the data
+        * @param languages the ordered list of languages preferred for printing the description
+        * 
+        * @return a TextData using the languages (in the given order of preference)
+        */
+       public TextData generatePreferredSingleTextData(FeatureTree featureTree, TaxonDescription description, List<Language> languages) {
+               levels.clear(); // before the start, the table containing the levels of each node must be cleared
+               // Note: this is not the most efficient way to keep track of the levels of the nodes but it allows some flexibility
+               List<TextData> texts = generatePreferredNaturalLanguageDescription(featureTree,description, languages);// first get the description as a raw list of TextData
+
+               StringBuilder descriptionStringBuilder = new StringBuilder(); // the StringBuilder used to generate the description
+               int i = 0,j,level; // i is used to store the index of the TextData to use
+               boolean startSentence = false, firstOne = true;
+
+               for (j=0 ; j<levels.size() ; j++){
+                       level = levels.get(j);
+                       if (level==-1){
+                               if ((j+1)<levels.size() && levels.get(j+1).equals(0)){ // if this node is the direct father of a leaf
+                                       descriptionStringBuilder.append(secondSeparator + " ");
+                                       startSentence=true;
+                                       firstOne=false;
+                                       String asString = texts.get(i).getText(Language.DEFAULT()).toString();
+                                       if (asString.length()>1) descriptionStringBuilder.append(asString.substring(0,1).toUpperCase() + asString.substring(1));
+                               }
+                               i++;
+                       }
+                       else if (level==0) { // if this node is a leaf
+                               if (startSentence) descriptionStringBuilder.append(texts.get(i).getText(Language.DEFAULT()));
+                               else descriptionStringBuilder.append(firstSeparator + texts.get(i).getText(Language.DEFAULT()));
+                               startSentence=false;
+                               i++;
+                       }
+                       else {
+                               if (!firstOne && levels.get(j-1).equals(0)){ // if this node corresponds to the states linked to the previous leaf
+                                       if (i<texts.size()) descriptionStringBuilder.append(texts.get(i).getText(Language.DEFAULT()));
+                                       i++;
+                               }
+                       }
+               }
+               descriptionStringBuilder.append(secondSeparator);
+               String returnString = descriptionStringBuilder.toString();
+               returnString = StringUtils.replace(returnString, "  ", " ");
+               returnString = StringUtils.removeStart(returnString, secondSeparator + " ");
+               return TextData.NewInstance(returnString,Language.DEFAULT(),TextFormat.NewInstance("", "Text", ""));
+       }
+
+
+
         /** recursive function that goes through a tree containing the order in which the description has to be generated,
          *  if an element of this tree matches one of the TaxonDescription, a DescriptionBuilder is called which returns a TextData with the corresponding description.
          * 
-        * @param children
-        * @param parent
-        * @param description
+        * @param children the children of the feature node considered
+        * @param parent the feature node considered
+        * @param description the TaxonDescription element for which we want a natural language output
          * @param language The language in which the description has to be written
-        * @return
+        * @param floor integer to keep track of the level in the tree
+        * @return a list of TextData elements containing the part of description corresponding to the feature node considered
          */
-       private StringBuilder buildString(List<FeatureNode> children, FeatureNode parent, TaxonDescription description, List<Language> languages) {
-               StringBuilder stringbuilder = new StringBuilder();
+       private List<TextData> buildBranchesDescr(List<FeatureNode> children, FeatureNode parent, TaxonDescription description, List<Language> languages, int floor) {
+               List<TextData> listTextData = new ArrayList<TextData>();
+               floor++; // counter to know the current level in the tree
+
                 if (!parent.isLeaf()){ // if this node is not a leaf, continue recursively (only the leaves of a FeatureTree contain states)
+                       levels.add(new Integer(floor)); // the level of the different nodes in the tree are kept, thus it is easier to build a structured text out of the List<TextData>
+                       Feature feature = parent.getFeature();
+                       TextData featureName;
+                       if (feature!=null && feature.getLabel()!=null){ // if a node is associated to a feature
+                               featureName = categoricalDescriptionBuilder.buildTextDataFeature(feature, languages);
+                               levels.add(new Integer(-1)); // it is indicated by a '-1' after its level
+                               listTextData.add(featureName); // the TextData representing the name of the feature is concatenated to the list
+                       }
+                       else featureName = new TextData(); // else an empty TextData is created (because we keep track of the features, it is useful to inform when the upper node has no feature attached)
+
                         for (Iterator<FeatureNode> ifn = children.iterator() ; ifn.hasNext() ;){
+                               previousTextData = featureName; // this allows to keep track of the name of the feature one level up in the tree
                                 FeatureNode fn = ifn.next();
-                               StringBuilder tempsb = buildString(fn.getChildren(),fn,description, languages);
-                               if (tempsb.length()>1) stringbuilder.append(tempsb.deleteCharAt(tempsb.length()-1));
-//                             if (tempsb.length()>1) stringbuilder.append(tempsb);
+                               listTextData.addAll(buildBranchesDescr(fn.getChildren(),fn,description, languages, floor));
                         }
-                       stringbuilder.append('.');
                 }
                 else { //once a leaf is reached
-                       Feature fref = parent.getFeature();
-                       if (fref!=null) { // needs a better algorithm
-                               int k=0;
-                                       Set<DescriptionElementBase> elements = description.getElements();
-                                       for (Iterator<DescriptionElementBase> deb = elements.iterator() ; deb.hasNext() ;){ // iterates over all the descriptions enclosed in the TaxonDescription
-                                               DescriptionElementBase descriptionElement = deb.next();
-                                               TextData textData;
-                                               if (descriptionElement.getFeature().equals(fref)){ // if one matches the corresponding feature associated to this leaf
+                       Feature feature = parent.getFeature();
+                       if (feature!=null && (feature.isSupportsQuantitativeData() || feature.isSupportsCategoricalData())) {
+                               Set<DescriptionElementBase> elements = description.getElements();
+                               for (Iterator<DescriptionElementBase> deb = elements.iterator() ; deb.hasNext() ;){ // iterates over all the descriptions enclosed in the TaxonDescription
+                                       DescriptionElementBase descriptionElement = deb.next();
+                                       if (descriptionElement.getFeature().equals(feature)){ // if one matches the corresponding feature associated to this leaf
+                                               if (descriptionElement instanceof CategoricalData || descriptionElement instanceof QuantitativeData){
+                                                       TextData featureTextData;
+                                                       TextData statesTextData;
                                                         if (descriptionElement instanceof CategoricalData) { // if this description is a CategoricalData, generate the according TextData
                                                                 CategoricalData categoricalData = (CategoricalData) descriptionElement;
-                                                               //textData = buildCategoricalDescr(categoricalData, language);
-                                                               textData = categoricalDescriptionBuilder.build(categoricalData, languages);
-                                                               //textData.putText(fref.getLabel(), Language.DEFAULT());
-                                                               String featureName = StringUtils.substringBefore(fref.getLabel(), "<");
-                                                               if (previousFeatureName==null){
-                                                                       previousFeatureName = featureName;
-                                                                       String featureString = categoricalDescriptionBuilder.buildFeature(fref,true);
-                                                                       stringbuilder.append(featureString.substring(0,1).toUpperCase() + featureString.substring(1));
-                                                               }
-                                                               else if (!featureName.contains(previousFeatureName)) {
-                                                                       stringbuilder.append(". ");
-                                                                       previousFeatureName = featureName;
-                                                                       String featureString = categoricalDescriptionBuilder.buildFeature(fref,true);
-                                                                       stringbuilder.append(featureString.substring(0,1).toUpperCase() + featureString.substring(1)); // if you want to print the name of the feature (Should it be an option ?)
-                                                               }
-                                                               stringbuilder.append(textData.getText(Language.DEFAULT()));
-                                                               stringbuilder.append(',');
+                                                               statesTextData = categoricalDescriptionBuilder.build(categoricalData, languages);
+                                                               featureTextData = categoricalDescriptionBuilder.buildTextDataFeature(feature, languages);
                                                         }
-                                                       if (descriptionElement instanceof QuantitativeData) { // if this description is a QuantitativeData, generate the according TextData
+                                                       else { // if this description is a QuantitativeData, generate the according TextData
                                                                 QuantitativeData quantitativeData = (QuantitativeData) descriptionElement;
-                                                               textData = quantitativeDescriptionBuilder.build(quantitativeData, languages);
-                                                               String featureName = StringUtils.substringBefore(fref.getLabel(), "<");
-                                                               if (previousFeatureName==null){
-                                                                       previousFeatureName = featureName;
-                                                                       String featureString = quantitativeDescriptionBuilder.buildFeature(fref,true);
-                                                                       stringbuilder.append(featureString.substring(0,1).toUpperCase() + featureString.substring(1));
-                                                               }
-                                                               else if (!featureName.contains(previousFeatureName)) {
-                                                                       stringbuilder.append(". ");
-                                                                       previousFeatureName = featureName;
-                                                                       String featureString = quantitativeDescriptionBuilder.buildFeature(fref,true);
-                                                                       stringbuilder.append(featureString.substring(0,1).toUpperCase() + featureString.substring(1)); // if you want to print the name of the feature (Should it be an option ?)
-                                                               }
-                                                               stringbuilder.append(textData.getText(Language.DEFAULT()));
-                                                               stringbuilder.append(',');
+                                                               statesTextData = quantitativeDescriptionBuilder.build(quantitativeData, languages);
+                                                               featureTextData = quantitativeDescriptionBuilder.buildTextDataFeature(feature, languages);
                                                         }
+                                                       applyNaturalLanguageDescriptionElementProcessors(featureTextData, previousTextData);
+                                                       levels.add(new Integer(0)); // 0 indicates a feature, which is a leaf of the tree
+                                                       listTextData.add(featureTextData);
+                                                       levels.add(new Integer(floor)); // this represents the level of the feature and means it is followed by a TextData containing the states of the feature
+                                                       listTextData.add(statesTextData);
                                                 }
                                         }
+                               }
                         }
                 }
-               return stringbuilder;
-       }
-
+               return listTextData;
+       }       
  
  }