Merge branch 'develop' of ssh://dev.e-taxonomy.eu/var/git/cdmlib into develop
[cdmlib.git] / cdmlib-services / src / main / java / eu / etaxonomy / cdm / api / service / NaturalLanguageGenerator.java
1 package eu.etaxonomy.cdm.api.service;
2
3 import java.util.ArrayList;
4 import java.util.HashSet;
5 import java.util.Iterator;
6 import java.util.List;
7 import java.util.Map;
8 import java.util.Set;
9
10 import org.apache.commons.lang.StringUtils;
11 import org.apache.log4j.Logger;
12 import org.springframework.stereotype.Component;
13
14 import eu.etaxonomy.cdm.model.common.Annotation;
15 import eu.etaxonomy.cdm.model.common.AnnotationType;
16 import eu.etaxonomy.cdm.model.common.Language;
17 import eu.etaxonomy.cdm.model.description.CategoricalData;
18 import eu.etaxonomy.cdm.model.description.DescriptionElementBase;
19 import eu.etaxonomy.cdm.model.description.Feature;
20 import eu.etaxonomy.cdm.model.description.QuantitativeData;
21 import eu.etaxonomy.cdm.model.description.TaxonDescription;
22 import eu.etaxonomy.cdm.model.description.TextData;
23 import eu.etaxonomy.cdm.model.description.TextFormat;
24 import eu.etaxonomy.cdm.model.term.TermTree;
25 import eu.etaxonomy.cdm.model.term.TermNode;
26
27
28 /**
29 * Generator of natural language descriptions from TaxonDescriptions.
30 *
31 * @author m.venin
32 * @since 13.04.2010
33 */
34 @Component
35 public class NaturalLanguageGenerator implements INaturalLanguageGenerator {
36 @SuppressWarnings("unused")
37 private static final Logger logger = Logger.getLogger(NaturalLanguageGenerator.class);
38
39 private String firstSeparator = ",";
40 private String secondSeparator = ".";
41 private List<Integer> levels = new ArrayList<Integer>();
42
43 private DescriptionBuilder<QuantitativeData> quantitativeDescriptionBuilder = new DefaultQuantitativeDescriptionBuilder();
44 private DescriptionBuilder<CategoricalData> categoricalDescriptionBuilder = new DefaultCategoricalDescriptionBuilder();
45
46 private TextData previousTextData;
47
48 DeltaTextDataProcessor deltaTextDataProcessor = new DeltaTextDataProcessor();
49
50 private Map<String, INaturalLanguageTextDataProcessor> elementProcessors;
51
52 private Set<INaturalLanguageTextDataProcessor> applicableElementProcessors = new HashSet<INaturalLanguageTextDataProcessor>();
53
54 /**
55 * Change the first separator used by generateSingleTextData. By default ",".
56 *
57 * @param separator
58 */
59 public void setFirstSeparator(String separator){
60 firstSeparator=separator;
61 }
62
63 public String getFirstSeparator(){
64 return firstSeparator;
65 }
66
67 /**
68 * Change the second separator used by generateSingleTextData. By default ".".
69 *
70 * @param separator
71 */
72 public void setSecondSeparator(String separator){
73 secondSeparator=separator;
74 }
75
76 public String getSecondSeparator(){
77 return secondSeparator;
78 }
79
80 /**
81 * @param quantitativeDescriptionBuilder
82 */
83 public void setQuantitativeDescriptionBuilder(DescriptionBuilder<QuantitativeData> quantitativeDescriptionBuilder){
84 this.quantitativeDescriptionBuilder = quantitativeDescriptionBuilder;
85 }
86
87 /**
88 * @param categoricalDescriptionBuilder
89 */
90 public void setCategoricalDescriptionBuilder(DescriptionBuilder<CategoricalData> categoricalDescriptionBuilder){
91 this.categoricalDescriptionBuilder = categoricalDescriptionBuilder;
92 }
93
94 /**
95 * @return the element processors of this generator
96 */
97 public Map<String, INaturalLanguageTextDataProcessor> getElementProcessors() {
98 return elementProcessors;
99 }
100
101 /**
102 * The keys of the elementProcessors map are regular expressions which are
103 * being used to identify the those Descriptions to which the mapped
104 * NaturalLanguageTextDataProcessor is applicable.
105 *
106 * @param elementProcessors
107 */
108 public void setElementProcessors(
109 Map<String, INaturalLanguageTextDataProcessor> elementProcessors) {
110 this.elementProcessors = elementProcessors;
111 }
112
113 /**
114 * Looks for technical annotations, if one matches a regular expression of the element processors
115 * the associated processor is added to the applicable element processors which will then be applied
116 * when generating the description.
117 *
118 * @param annotations the set of annotations of the description
119 */
120 private void initNaturalLanguageDescriptionElementProcessors(Set<Annotation> annotations) {
121
122 if(annotations != null){
123 for(Annotation annotation : annotations){
124 if(annotation.getAnnotationType().equals(AnnotationType.TECHNICAL())){
125 if (elementProcessors!=null){
126 for(String regex : elementProcessors.keySet()){
127 if(annotation.getText().matches(regex)){
128 applicableElementProcessors.add(elementProcessors.get(regex));
129 }
130 }
131 }
132 }
133 }
134 }
135 }
136
137
138 /**
139 * Applies the list of applicable processors to a TextData.
140 *
141 * @param textData the TextData to be modified
142 * @param previousTextData the TextData corresponding to the feature of the previous level in the tree
143 */
144 private void applyNaturalLanguageDescriptionElementProcessors(TextData textData, TextData previousTextData){
145 for(INaturalLanguageTextDataProcessor processor : applicableElementProcessors){
146 processor.process(textData, previousTextData);
147 }
148 }
149
150
151 /**
152 * The most simple function to generate a description. The language used is the default one.
153 *
154 * @param featureTree the FeatureTree holding the order in which features and their states must be printed
155 * @param description the TaxonDescription with all the data
156 *
157 * @return a list of TextData, each one being a basic element of the natural language description
158 */
159 @Override
160 public List<TextData> generateNaturalLanguageDescription(TermTree featureTree,TaxonDescription description) {
161 return generateNaturalLanguageDescription(featureTree,description,Language.DEFAULT());
162 }
163
164
165
166 /**
167 * Generate a description in a specified language.
168 *
169 * @param featureTree the FeatureTree holding the order in which features and their states must be printed
170 * @param description the TaxonDescription with all the data
171 * @param language the language in which the description has to be printed
172 *
173 * @return a list of TextData, each one being a basic element of the natural language description
174 */
175 @Override
176 public List<TextData> generateNaturalLanguageDescription(TermTree featureTree, TaxonDescription description, Language language) {
177 List<Language> languages = new ArrayList<Language>();
178 languages.add(language);
179 initNaturalLanguageDescriptionElementProcessors(description.getAnnotations());
180 return generatePreferredNaturalLanguageDescription(featureTree,description,languages);
181 }
182
183 /**
184 * Generate a description with a specified list of preferred languages.
185 *
186 * @param featureTree the FeatureTree holding the order in which features and their states must be printed
187 * @param description the TaxonDescription with all the data
188 * @param languages the ordered list of languages preferred for printing the description
189 *
190 * @return a list of TextData, each one being a basic element of the natural language description
191 */
192 @Override
193 public List<TextData> generatePreferredNaturalLanguageDescription(TermTree featureTree,TaxonDescription description, List<Language> languages) {
194 initNaturalLanguageDescriptionElementProcessors(description.getAnnotations());
195 return buildBranchesDescr(featureTree.getRootChildren(), featureTree.getRoot(), description, languages,0);
196 }
197
198 /**
199 * Generate a description as a single paragraph in a TextData.
200 *
201 * @param featureTree the FeatureTree holding the order in which features and their states must be printed
202 * @param description the TaxonDescription with all the data
203 *
204 * @return a TextData in the default language.
205 */
206 @Override
207 public TextData generateSingleTextData(TermTree featureTree, TaxonDescription description) {
208 return generateSingleTextData(featureTree,description,Language.DEFAULT());
209 }
210
211 /**
212 * Generate a description as a single paragraph in a TextData.
213 *
214 * @param featureTree the FeatureTree holding the order in which features and their states must be printed
215 * @param description the TaxonDescription with all the data
216 * @param language the language in which the description has to be printed
217 *
218 * @return a TextData in the specified language.
219 */
220 @Override
221 public TextData generateSingleTextData(TermTree featureTree, TaxonDescription description, Language language) {
222 List<Language> languages = new ArrayList<Language>();
223 languages.add(language);
224 return generatePreferredSingleTextData(featureTree,description,languages);
225 }
226
227 /**
228 * Generate a description with a specified list of preferred languages.
229 *
230 * @param featureTree the FeatureTree holding the order in which features and their states must be printed
231 * @param description the TaxonDescription with all the data
232 * @param languages the ordered list of languages preferred for printing the description
233 *
234 * @return a TextData using the languages (in the given order of preference)
235 */
236 @Override
237 public TextData generatePreferredSingleTextData(TermTree featureTree, TaxonDescription description, List<Language> languages) {
238 levels.clear(); // before the start, the table containing the levels of each node must be cleared
239 // Note: this is not the most efficient way to keep track of the levels of the nodes but it allows some flexibility
240 List<TextData> texts = generatePreferredNaturalLanguageDescription(featureTree,description, languages);// first get the description as a raw list of TextData
241
242 StringBuilder descriptionStringBuilder = new StringBuilder(); // the StringBuilder used to generate the description
243 int i = 0,j,level; // i is used to store the index of the TextData to use
244 boolean startSentence = false, firstOne = true;
245
246 for (j=0 ; j<levels.size() ; j++){
247 level = levels.get(j);
248 if (level==-1){
249 if ((j+1)<levels.size() && levels.get(j+1).equals(0)){ // if this node is the direct father of a leaf
250 descriptionStringBuilder.append(secondSeparator + " ");
251 startSentence=true;
252 firstOne=false;
253 String asString = texts.get(i).getText(Language.DEFAULT()).toString();
254 if (asString.length()>1) {
255 descriptionStringBuilder.append(asString.substring(0,1).toUpperCase() + asString.substring(1));
256 }
257 }
258 i++;
259 }
260 else if (level==0) { // if this node is a leaf
261 if (startSentence) {
262 descriptionStringBuilder.append(texts.get(i).getText(Language.DEFAULT()));
263 } else {
264 descriptionStringBuilder.append(firstSeparator + texts.get(i).getText(Language.DEFAULT()));
265 }
266 startSentence=false;
267 i++;
268 }
269 else {
270 if (!firstOne && levels.get(j-1).equals(0)){ // if this node corresponds to the states linked to the previous leaf
271 if (i<texts.size()) {
272 descriptionStringBuilder.append(texts.get(i).getText(Language.DEFAULT()));
273 }
274 i++;
275 }
276 }
277 }
278 descriptionStringBuilder.append(secondSeparator);
279 String returnString = descriptionStringBuilder.toString();
280 returnString = StringUtils.replace(returnString, " ", " ");
281 returnString = StringUtils.removeStart(returnString, secondSeparator + " ");
282 return TextData.NewInstance(returnString,Language.DEFAULT(),TextFormat.NewInstance("", "Text", ""));
283 }
284
285
286
287 /** recursive function that goes through a tree containing the order in which the description has to be generated,
288 * if an element of this tree matches one of the TaxonDescription, a DescriptionBuilder is called which returns a TextData with the corresponding description.
289 *
290 * @param children the children of the feature node considered
291 * @param parent the feature node considered
292 * @param description the TaxonDescription element for which we want a natural language output
293 * @param language The language in which the description has to be written
294 * @param floor integer to keep track of the level in the tree
295 * @return a list of TextData elements containing the part of description corresponding to the feature node considered
296 */
297 private List<TextData> buildBranchesDescr(List<TermNode> children, TermNode<Feature> parent, TaxonDescription description, List<Language> languages, int floor) {
298 List<TextData> listTextData = new ArrayList<TextData>();
299 floor++; // counter to know the current level in the tree
300
301 if (!parent.isLeaf()){ // if this node is not a leaf, continue recursively (only the leaves of a FeatureTree contain states)
302 levels.add(new Integer(floor)); // the level of the different nodes in the tree are kept, thus it is easier to build a structured text out of the List<TextData>
303 Feature feature = parent.getTerm();
304 TextData featureName;
305 if (feature!=null && feature.getLabel()!=null){ // if a node is associated to a feature
306 featureName = categoricalDescriptionBuilder.buildTextDataFeature(feature, languages);
307 levels.add(new Integer(-1)); // it is indicated by a '-1' after its level
308 listTextData.add(featureName); // the TextData representing the name of the feature is concatenated to the list
309 }
310 else {
311 featureName = new TextData(); // else an empty TextData is created (because we keep track of the features, it is useful to inform when the upper node has no feature attached)
312 }
313
314 for (Iterator<TermNode> ifn = children.iterator() ; ifn.hasNext() ;){
315 previousTextData = featureName; // this allows to keep track of the name of the feature one level up in the tree
316 TermNode fn = ifn.next();
317 listTextData.addAll(buildBranchesDescr(fn.getChildNodes(),fn,description, languages, floor));
318 }
319 }
320 else { //once a leaf is reached
321 Feature feature = parent.getTerm();
322 if (feature!=null && (feature.isSupportsQuantitativeData() || feature.isSupportsCategoricalData())) {
323 Set<DescriptionElementBase> elements = description.getElements();
324 for (Iterator<DescriptionElementBase> deb = elements.iterator() ; deb.hasNext() ;){ // iterates over all the descriptions enclosed in the TaxonDescription
325 DescriptionElementBase descriptionElement = deb.next();
326 if (descriptionElement.getFeature().equals(feature)){ // if one matches the corresponding feature associated to this leaf
327 if (descriptionElement instanceof CategoricalData || descriptionElement instanceof QuantitativeData){
328 TextData featureTextData;
329 TextData statesTextData;
330 if (descriptionElement instanceof CategoricalData) { // if this description is a CategoricalData, generate the according TextData
331 CategoricalData categoricalData = (CategoricalData) descriptionElement;
332 statesTextData = categoricalDescriptionBuilder.build(categoricalData, languages);
333 featureTextData = categoricalDescriptionBuilder.buildTextDataFeature(feature, languages);
334 }
335 else { // if this description is a QuantitativeData, generate the according TextData
336 QuantitativeData quantitativeData = (QuantitativeData) descriptionElement;
337 statesTextData = quantitativeDescriptionBuilder.build(quantitativeData, languages);
338 featureTextData = quantitativeDescriptionBuilder.buildTextDataFeature(feature, languages);
339 }
340 applyNaturalLanguageDescriptionElementProcessors(featureTextData, previousTextData);
341 levels.add(new Integer(0)); // 0 indicates a feature, which is a leaf of the tree
342 listTextData.add(featureTextData);
343 levels.add(new Integer(floor)); // this represents the level of the feature and means it is followed by a TextData containing the states of the feature
344 listTextData.add(statesTextData);
345 }
346 }
347 }
348 }
349 }
350 return listTextData;
351 }
352
353 }