cdmlib-services/src/main/java/eu/etaxonomy/cdm/api/service/NaturalLanguageGenerator.java

   1 package eu.etaxonomy.cdm.api.service;
   2
   3 import java.util.ArrayList;
   4 import java.util.HashSet;
   5 import java.util.Iterator;
   6 import java.util.List;
   7 import java.util.Map;
   8 import java.util.Set;
   9
  10 import org.apache.commons.lang.StringUtils;
  11 import org.apache.log4j.Logger;
  12 import org.springframework.stereotype.Component;
  13
  14 import eu.etaxonomy.cdm.model.common.Annotation;
  15 import eu.etaxonomy.cdm.model.common.AnnotationType;
  16 import eu.etaxonomy.cdm.model.common.Language;
  17 import eu.etaxonomy.cdm.model.description.CategoricalData;
  18 import eu.etaxonomy.cdm.model.description.DescriptionElementBase;
  19 import eu.etaxonomy.cdm.model.description.Feature;
  20 import eu.etaxonomy.cdm.model.description.QuantitativeData;
  21 import eu.etaxonomy.cdm.model.description.TaxonDescription;
  22 import eu.etaxonomy.cdm.model.description.TextData;
  23 import eu.etaxonomy.cdm.model.description.TextFormat;
  24 import eu.etaxonomy.cdm.model.term.TermTree;
  25 import eu.etaxonomy.cdm.model.term.TermNode;
  26
  27
  28 /**
  29  * Generator of natural language descriptions from TaxonDescriptions.
  30  *
  31  * @author m.venin
  32  * @since 13.04.2010
  33  */
  34 @Component
  35 public class NaturalLanguageGenerator implements INaturalLanguageGenerator {
  36         @SuppressWarnings("unused")
  37         private static final Logger logger = Logger.getLogger(NaturalLanguageGenerator.class);
  38
  39         private String firstSeparator = ",";
  40         private String secondSeparator = ".";
  41         private List<Integer> levels = new ArrayList<Integer>();
  42
  43         private DescriptionBuilder<QuantitativeData> quantitativeDescriptionBuilder = new DefaultQuantitativeDescriptionBuilder();
  44         private DescriptionBuilder<CategoricalData> categoricalDescriptionBuilder = new DefaultCategoricalDescriptionBuilder();
  45
  46         private TextData previousTextData;
  47
  48         DeltaTextDataProcessor deltaTextDataProcessor = new DeltaTextDataProcessor();
  49
  50         private Map<String, INaturalLanguageTextDataProcessor> elementProcessors;
  51
  52         private Set<INaturalLanguageTextDataProcessor> applicableElementProcessors = new HashSet<INaturalLanguageTextDataProcessor>();
  53
  54         /**
  55          * Change the first separator used by generateSingleTextData. By default ",".
  56          *
  57          * @param separator
  58          */
  59         public void setFirstSeparator(String separator){
  60                 firstSeparator=separator;
  61         }
  62
  63         public String getFirstSeparator(){
  64                 return firstSeparator;
  65         }
  66
  67         /**
  68          * Change the second separator used by generateSingleTextData. By default ".".
  69          *
  70          * @param separator
  71          */
  72         public void setSecondSeparator(String separator){
  73                 secondSeparator=separator;
  74         }
  75
  76         public String getSecondSeparator(){
  77                 return secondSeparator;
  78         }
  79
  80         /**
  81          * @param quantitativeDescriptionBuilder
  82          */
  83         public void setQuantitativeDescriptionBuilder(DescriptionBuilder<QuantitativeData> quantitativeDescriptionBuilder){
  84                 this.quantitativeDescriptionBuilder = quantitativeDescriptionBuilder;
  85         }
  86
  87         /**
  88          * @param categoricalDescriptionBuilder
  89          */
  90         public void setCategoricalDescriptionBuilder(DescriptionBuilder<CategoricalData> categoricalDescriptionBuilder){
  91                 this.categoricalDescriptionBuilder = categoricalDescriptionBuilder;
  92         }
  93
  94         /**
  95          * @return the element processors of this generator
  96          */
  97         public Map<String, INaturalLanguageTextDataProcessor> getElementProcessors() {
  98                 return elementProcessors;
  99         }
 100
 101         /**
 102          * The keys of the elementProcessors map are regular expressions which are
 103          * being used to identify the those Descriptions to which the mapped
 104          * NaturalLanguageTextDataProcessor is applicable.
 105          *
 106          * @param elementProcessors
 107          */
 108         public void setElementProcessors(
 109                         Map<String, INaturalLanguageTextDataProcessor> elementProcessors) {
 110                 this.elementProcessors = elementProcessors;
 111         }
 112
 113         /**
 114          * Looks for technical annotations, if one matches a regular expression of the element processors
 115          * the associated processor is added to the applicable element processors which will then be applied
 116          * when generating the description.
 117          *
 118          * @param annotations the set of annotations of the description
 119          */
 120         private void initNaturalLanguageDescriptionElementProcessors(Set<Annotation> annotations) {
 121
 122                 if(annotations != null){
 123                         for(Annotation annotation : annotations){
 124                                 if(annotation.getAnnotationType().equals(AnnotationType.TECHNICAL())){
 125                                         if (elementProcessors!=null){
 126                                                 for(String regex : elementProcessors.keySet()){
 127                                                         if(annotation.getText().matches(regex)){
 128                                                                 applicableElementProcessors.add(elementProcessors.get(regex));
 129                                                         }
 130                                                 }
 131                                         }
 132                                 }
 133                         }
 134                 }
 135         }
 136
 137
 138         /**
 139          * Applies the list of applicable processors to a TextData.
 140          *
 141          * @param textData the TextData to be modified
 142          * @param previousTextData the TextData corresponding to the feature of the previous level in the tree
 143          */
 144         private void applyNaturalLanguageDescriptionElementProcessors(TextData textData, TextData previousTextData){
 145                 for(INaturalLanguageTextDataProcessor processor : applicableElementProcessors){
 146                         processor.process(textData, previousTextData);
 147                 }
 148         }
 149
 150
 151         /**
 152          * The most simple function to generate a description. The language used is the default one.
 153          *
 154          * @param featureTree the FeatureTree holding the order in which features and their states must be printed
 155          * @param description the TaxonDescription with all the data
 156          *
 157          * @return a list of TextData, each one being a basic element of the natural language description
 158          */
 159         @Override
 160     public List<TextData> generateNaturalLanguageDescription(TermTree featureTree,TaxonDescription description) {
 161                 return generateNaturalLanguageDescription(featureTree,description,Language.DEFAULT());
 162         }
 163
 164
 165
 166         /**
 167          * Generate a description in a specified language.
 168          *
 169          * @param featureTree the FeatureTree holding the order in which features and their states must be printed
 170          * @param description the TaxonDescription with all the data
 171          * @param language the language in which the description has to be printed
 172          *
 173          * @return a list of TextData, each one being a basic element of the natural language description
 174          */
 175         @Override
 176     public List<TextData> generateNaturalLanguageDescription(TermTree featureTree, TaxonDescription description,        Language language) {
 177                 List<Language> languages = new ArrayList<Language>();
 178                 languages.add(language);
 179                 initNaturalLanguageDescriptionElementProcessors(description.getAnnotations());
 180                 return generatePreferredNaturalLanguageDescription(featureTree,description,languages);
 181         }
 182
 183         /**
 184          * Generate a description with a specified list of preferred languages.
 185          *
 186          * @param featureTree the FeatureTree holding the order in which features and their states must be printed
 187          * @param description the TaxonDescription with all the data
 188          * @param languages the ordered list of languages preferred for printing the description
 189          *
 190          * @return a list of TextData, each one being a basic element of the natural language description
 191          */
 192         @Override
 193     public List<TextData> generatePreferredNaturalLanguageDescription(TermTree featureTree,TaxonDescription description, List<Language> languages) {
 194                 initNaturalLanguageDescriptionElementProcessors(description.getAnnotations());
 195                 return buildBranchesDescr(featureTree.getRootChildren(), featureTree.getRoot(), description, languages,0);
 196         }
 197
 198         /**
 199          * Generate a description as a single paragraph in a TextData.
 200          *
 201          * @param featureTree the FeatureTree holding the order in which features and their states must be printed
 202          * @param description the TaxonDescription with all the data
 203          *
 204          * @return a TextData in the default language.
 205          */
 206         @Override
 207     public TextData generateSingleTextData(TermTree featureTree, TaxonDescription description) {
 208                 return generateSingleTextData(featureTree,description,Language.DEFAULT());
 209         }
 210
 211         /**
 212          * Generate a description as a single paragraph in a TextData.
 213          *
 214          * @param featureTree the FeatureTree holding the order in which features and their states must be printed
 215          * @param description the TaxonDescription with all the data
 216          * @param language the language in which the description has to be printed
 217          *
 218          * @return a TextData in the specified language.
 219          */
 220         @Override
 221     public TextData generateSingleTextData(TermTree featureTree, TaxonDescription description, Language language) {
 222                 List<Language> languages = new ArrayList<Language>();
 223                 languages.add(language);
 224                 return generatePreferredSingleTextData(featureTree,description,languages);
 225         }
 226
 227         /**
 228          * Generate a description with a specified list of preferred languages.
 229          *
 230          * @param featureTree the FeatureTree holding the order in which features and their states must be printed
 231          * @param description the TaxonDescription with all the data
 232          * @param languages the ordered list of languages preferred for printing the description
 233          *
 234          * @return a TextData using the languages (in the given order of preference)
 235          */
 236         @Override
 237     public TextData generatePreferredSingleTextData(TermTree featureTree, TaxonDescription description, List<Language> languages) {
 238                 levels.clear(); // before the start, the table containing the levels of each node must be cleared
 239                 // Note: this is not the most efficient way to keep track of the levels of the nodes but it allows some flexibility
 240                 List<TextData> texts = generatePreferredNaturalLanguageDescription(featureTree,description, languages);// first get the description as a raw list of TextData
 241
 242                 StringBuilder descriptionStringBuilder = new StringBuilder(); // the StringBuilder used to generate the description
 243                 int i = 0,j,level; // i is used to store the index of the TextData to use
 244                 boolean startSentence = false, firstOne = true;
 245
 246                 for (j=0 ; j<levels.size() ; j++){
 247                         level = levels.get(j);
 248                         if (level==-1){
 249                                 if ((j+1)<levels.size() && levels.get(j+1).equals(0)){ // if this node is the direct father of a leaf
 250                                         descriptionStringBuilder.append(secondSeparator + " ");
 251                                         startSentence=true;
 252                                         firstOne=false;
 253                                         String asString = texts.get(i).getText(Language.DEFAULT()).toString();
 254                                         if (asString.length()>1) {
 255                         descriptionStringBuilder.append(asString.substring(0,1).toUpperCase() + asString.substring(1));
 256                     }
 257                                 }
 258                                 i++;
 259                         }
 260                         else if (level==0) { // if this node is a leaf
 261                                 if (startSentence) {
 262                     descriptionStringBuilder.append(texts.get(i).getText(Language.DEFAULT()));
 263                 } else {
 264                     descriptionStringBuilder.append(firstSeparator + texts.get(i).getText(Language.DEFAULT()));
 265                 }
 266                                 startSentence=false;
 267                                 i++;
 268                         }
 269                         else {
 270                                 if (!firstOne && levels.get(j-1).equals(0)){ // if this node corresponds to the states linked to the previous leaf
 271                                         if (i<texts.size()) {
 272                         descriptionStringBuilder.append(texts.get(i).getText(Language.DEFAULT()));
 273                     }
 274                                         i++;
 275                                 }
 276                         }
 277                 }
 278                 descriptionStringBuilder.append(secondSeparator);
 279                 String returnString = descriptionStringBuilder.toString();
 280                 returnString = StringUtils.replace(returnString, "  ", " ");
 281                 returnString = StringUtils.removeStart(returnString, secondSeparator + " ");
 282                 return TextData.NewInstance(returnString,Language.DEFAULT(),TextFormat.NewInstance("", "Text", ""));
 283         }
 284
 285
 286
 287         /** recursive function that goes through a tree containing the order in which the description has to be generated,
 288          *  if an element of this tree matches one of the TaxonDescription, a DescriptionBuilder is called which returns a TextData with the corresponding description.
 289          *
 290          * @param children the children of the feature node considered
 291          * @param parent the feature node considered
 292          * @param description the TaxonDescription element for which we want a natural language output
 293          * @param language The language in which the description has to be written
 294          * @param floor integer to keep track of the level in the tree
 295          * @return a list of TextData elements containing the part of description corresponding to the feature node considered
 296          */
 297         private List<TextData> buildBranchesDescr(List<TermNode> children, TermNode<Feature> parent, TaxonDescription description, List<Language> languages, int floor) {
 298                 List<TextData> listTextData = new ArrayList<TextData>();
 299                 floor++; // counter to know the current level in the tree
 300
 301                 if (!parent.isLeaf()){ // if this node is not a leaf, continue recursively (only the leaves of a FeatureTree contain states)
 302                         levels.add(new Integer(floor)); // the level of the different nodes in the tree are kept, thus it is easier to build a structured text out of the List<TextData>
 303                         Feature feature = parent.getTerm();
 304                         TextData featureName;
 305                         if (feature!=null && feature.getLabel()!=null){ // if a node is associated to a feature
 306                                 featureName = categoricalDescriptionBuilder.buildTextDataFeature(feature, languages);
 307                                 levels.add(new Integer(-1)); // it is indicated by a '-1' after its level
 308                                 listTextData.add(featureName); // the TextData representing the name of the feature is concatenated to the list
 309                         }
 310             else {
 311                 featureName = new TextData(); // else an empty TextData is created (because we keep track of the features, it is useful to inform when the upper node has no feature attached)
 312             }
 313
 314                         for (Iterator<TermNode> ifn = children.iterator() ; ifn.hasNext() ;){
 315                                 previousTextData = featureName; // this allows to keep track of the name of the feature one level up in the tree
 316                                 TermNode fn = ifn.next();
 317                                 listTextData.addAll(buildBranchesDescr(fn.getChildNodes(),fn,description, languages, floor));
 318                         }
 319                 }
 320                 else { //once a leaf is reached
 321                         Feature feature = parent.getTerm();
 322                         if (feature!=null && (feature.isSupportsQuantitativeData() || feature.isSupportsCategoricalData())) {
 323                                 Set<DescriptionElementBase> elements = description.getElements();
 324                                 for (Iterator<DescriptionElementBase> deb = elements.iterator() ; deb.hasNext() ;){ // iterates over all the descriptions enclosed in the TaxonDescription
 325                                         DescriptionElementBase descriptionElement = deb.next();
 326                                         if (descriptionElement.getFeature().equals(feature)){ // if one matches the corresponding feature associated to this leaf
 327                                                 if (descriptionElement instanceof CategoricalData || descriptionElement instanceof QuantitativeData){
 328                                                         TextData featureTextData;
 329                                                         TextData statesTextData;
 330                                                         if (descriptionElement instanceof CategoricalData) { // if this description is a CategoricalData, generate the according TextData
 331                                                                 CategoricalData categoricalData = (CategoricalData) descriptionElement;
 332                                                                 statesTextData = categoricalDescriptionBuilder.build(categoricalData, languages);
 333                                                                 featureTextData = categoricalDescriptionBuilder.buildTextDataFeature(feature, languages);
 334                                                         }
 335                                                         else { // if this description is a QuantitativeData, generate the according TextData
 336                                                                 QuantitativeData quantitativeData = (QuantitativeData) descriptionElement;
 337                                                                 statesTextData = quantitativeDescriptionBuilder.build(quantitativeData, languages);
 338                                                                 featureTextData = quantitativeDescriptionBuilder.buildTextDataFeature(feature, languages);
 339                                                         }
 340                                                         applyNaturalLanguageDescriptionElementProcessors(featureTextData, previousTextData);
 341                                                         levels.add(new Integer(0)); // 0 indicates a feature, which is a leaf of the tree
 342                                                         listTextData.add(featureTextData);
 343                                                         levels.add(new Integer(floor)); // this represents the level of the feature and means it is followed by a TextData containing the states of the feature
 344                                                         listTextData.add(statesTextData);
 345                                                 }
 346                                         }
 347                                 }
 348                         }
 349                 }
 350                 return listTextData;
 351         }
 352
 353 }