Last updates for natural language generation (added comments, new options, cleaned...
[cdmlib.git] / cdmlib-services / src / main / java / eu / etaxonomy / cdm / api / service / NaturalLanguageGenerator.java
1 package eu.etaxonomy.cdm.api.service;
2
3 import java.util.ArrayList;
4 import java.util.HashSet;
5 import java.util.Iterator;
6 import java.util.List;
7 import java.util.Map;
8 import java.util.Set;
9
10 import org.apache.commons.lang.StringUtils;
11 import org.springframework.stereotype.Component;
12
13 import eu.etaxonomy.cdm.model.description.CategoricalData;
14 import eu.etaxonomy.cdm.model.description.DescriptionElementBase;
15 import eu.etaxonomy.cdm.model.description.Feature;
16 import eu.etaxonomy.cdm.model.description.FeatureNode;
17 import eu.etaxonomy.cdm.model.description.FeatureTree;
18 import eu.etaxonomy.cdm.model.description.QuantitativeData;
19 import eu.etaxonomy.cdm.model.description.TaxonDescription;
20 import eu.etaxonomy.cdm.model.description.TextData;
21 import eu.etaxonomy.cdm.model.description.TextFormat;
22 import eu.etaxonomy.cdm.model.common.Annotation;
23 import eu.etaxonomy.cdm.model.common.AnnotationType;
24 import eu.etaxonomy.cdm.model.common.Language;
25
26
27 /**
28 * Generator of natural language descriptions from TaxonDescriptions.
29 *
30 * @author m.venin
31 * @created 13.04.2010
32 * @version 1.0
33 */
34 @Component
35 public class NaturalLanguageGenerator implements INaturalLanguageGenerator {
36
37 private String firstSeparator = ",";
38 private String secondSeparator = ".";
39 private List<Integer> levels = new ArrayList<Integer>();
40
41 private DescriptionBuilder<QuantitativeData> quantitativeDescriptionBuilder = new DefaultQuantitativeDescriptionBuilder();
42 private DescriptionBuilder<CategoricalData> categoricalDescriptionBuilder = new DefaultCategoricalDescriptionBuilder();
43
44 private TextData previousTextData;
45
46 private Map<String, INaturalLanguageTextDataProcessor> elementProcessors;
47
48 private Set<INaturalLanguageTextDataProcessor> applicableElementProcessors = new HashSet<INaturalLanguageTextDataProcessor>();
49
50 /**
51 * Change the first separator used by generateSingleTextData. By default ",".
52 *
53 * @param separator
54 */
55 public void setFirstSeparator(String separator){
56 firstSeparator=separator;
57 }
58
59 public String getFirstSeparator(){
60 return firstSeparator;
61 }
62
63 /**
64 * Change the second separator used by generateSingleTextData. By default ".".
65 *
66 * @param separator
67 */
68 public void setSecondSeparator(String separator){
69 secondSeparator=separator;
70 }
71
72 public String getSecondSeparator(){
73 return secondSeparator;
74 }
75
76 /**
77 * @param quantitativeDescriptionBuilder
78 */
79 public void setQuantitativeDescriptionBuilder(DescriptionBuilder<QuantitativeData> quantitativeDescriptionBuilder){
80 this.quantitativeDescriptionBuilder = quantitativeDescriptionBuilder;
81 }
82
83 /**
84 * @param categoricalDescriptionBuilder
85 */
86 public void setCategoricalDescriptionBuilder(DescriptionBuilder<CategoricalData> categoricalDescriptionBuilder){
87 this.categoricalDescriptionBuilder = categoricalDescriptionBuilder;
88 }
89
90 /**
91 * @return the element processors of this generator
92 */
93 public Map<String, INaturalLanguageTextDataProcessor> getElementProcessors() {
94 return elementProcessors;
95 }
96
97 /**
98 * The keys of the elementProcessors map are regular expressions which are
99 * being used to identify the those Descriptions to which the mapped
100 * NaturalLanguageTextDataProcessor is applicable.
101 *
102 * @param elementProcessors
103 */
104 public void setElementProcessors(
105 Map<String, INaturalLanguageTextDataProcessor> elementProcessors) {
106 this.elementProcessors = elementProcessors;
107 }
108
109 /**
110 * Looks for technical annotations, if one matches a regular expression of the element processors
111 * the associated processor is added to the applicable element processors which will then be applied
112 * when generating the description.
113 *
114 * @param annotations the set of annotations of the description
115 */
116 private void initNaturalLanguageDescriptionElementProcessors(Set<Annotation> annotations) {
117
118 if(annotations != null){
119 for(Annotation annotation : annotations){
120 if(annotation.getAnnotationType().equals(AnnotationType.TECHNICAL())){
121 if (elementProcessors!=null){
122 for(String regex : elementProcessors.keySet()){
123 if(annotation.getText().matches(regex)){
124 applicableElementProcessors.add(elementProcessors.get(regex));
125 }
126 }
127 }
128 }
129 }
130 }
131 }
132
133
134 /**
135 * Applies the list of applicable processors to a TextData.
136 *
137 * @param textData the TextData to be modified
138 * @param previousTextData the TextData corresponding to the feature of the previous level in the tree
139 */
140 private void applyNaturalLanguageDescriptionElementProcessors(TextData textData, TextData previousTextData){
141 for(INaturalLanguageTextDataProcessor processor : applicableElementProcessors){
142 processor.process(textData, previousTextData);
143 }
144 }
145
146
147 /**
148 * The most simple function to generate a description. The language used is the default one.
149 *
150 * @param featureTree the FeatureTree holding the order in which features and their states must be printed
151 * @param description the TaxonDescription with all the data
152 *
153 * @return a list of TextData, each one being a basic element of the natural language description
154 */
155 public List<TextData> generateNaturalLanguageDescription(FeatureTree featureTree,TaxonDescription description) {
156 return generateNaturalLanguageDescription(featureTree,description,Language.DEFAULT());
157 }
158
159
160
161 /**
162 * Generate a description in a specified language.
163 *
164 * @param featureTree the FeatureTree holding the order in which features and their states must be printed
165 * @param description the TaxonDescription with all the data
166 * @param language the language in which the description has to be printed
167 *
168 * @return a list of TextData, each one being a basic element of the natural language description
169 */
170 public List<TextData> generateNaturalLanguageDescription(FeatureTree featureTree, TaxonDescription description, Language language) {
171 List<Language> languages = new ArrayList<Language>();
172 languages.add(language);
173 initNaturalLanguageDescriptionElementProcessors(description.getAnnotations());
174 return generatePreferredNaturalLanguageDescription(featureTree,description,languages);
175 }
176
177 /**
178 * Generate a description with a specified list of preferred languages.
179 *
180 * @param featureTree the FeatureTree holding the order in which features and their states must be printed
181 * @param description the TaxonDescription with all the data
182 * @param languages the ordered list of languages preferred for printing the description
183 *
184 * @return a list of TextData, each one being a basic element of the natural language description
185 */
186 public List<TextData> generatePreferredNaturalLanguageDescription(FeatureTree featureTree,TaxonDescription description, List<Language> languages) {
187 initNaturalLanguageDescriptionElementProcessors(description.getAnnotations());
188 return buildBranchesDescr(featureTree.getRootChildren(), featureTree.getRoot(), description, languages,0);
189 }
190
191 /**
192 * Generate a description as a single paragraph in a TextData.
193 *
194 * @param featureTree the FeatureTree holding the order in which features and their states must be printed
195 * @param description the TaxonDescription with all the data
196 *
197 * @return a TextData in the default language.
198 */
199 public TextData generateSingleTextData(FeatureTree featureTree, TaxonDescription description) {
200 return generateSingleTextData(featureTree,description,Language.DEFAULT());
201 }
202
203 /**
204 * Generate a description as a single paragraph in a TextData.
205 *
206 * @param featureTree the FeatureTree holding the order in which features and their states must be printed
207 * @param description the TaxonDescription with all the data
208 * @param language the language in which the description has to be printed
209 *
210 * @return a TextData in the specified language.
211 */
212 public TextData generateSingleTextData(FeatureTree featureTree, TaxonDescription description, Language language) {
213 List<Language> languages = new ArrayList<Language>();
214 languages.add(language);
215 return generatePreferredSingleTextData(featureTree,description,languages);
216 }
217
218 /**
219 * Generate a description with a specified list of preferred languages.
220 *
221 * @param featureTree the FeatureTree holding the order in which features and their states must be printed
222 * @param description the TaxonDescription with all the data
223 * @param languages the ordered list of languages preferred for printing the description
224 *
225 * @return a TextData using the languages (in the given order of preference)
226 */
227 public TextData generatePreferredSingleTextData(FeatureTree featureTree, TaxonDescription description, List<Language> languages) {
228
229 initNaturalLanguageDescriptionElementProcessors(description.getAnnotations());
230
231 List<TextData> texts = buildBranchesDescr(featureTree.getRootChildren(), featureTree.getRoot(), description, languages,0);
232 StringBuilder sb = new StringBuilder();
233 int i = 0,j;
234 boolean startSentence = false, firstOne = true;
235 Integer level;
236 levels.clear();
237 for (j=0 ; j<levels.size() ; j++){
238 level = levels.get(j);
239 if (level.equals(-1)){
240 if ((j+1)<levels.size() && levels.get(j+1).equals(0)){
241 if (!firstOne) sb.append(secondSeparator + " ");
242 startSentence=true;
243 firstOne=false;
244 String asString = texts.get(i).getText(Language.DEFAULT()).toString();
245 if (asString.length()>1) sb.append(asString.substring(0,1).toUpperCase() + asString.substring(1));
246 }
247 i++;
248 }
249 else if (level.equals(0)) {
250 if (startSentence) sb.append(texts.get(i).getText(Language.DEFAULT()));
251 else sb.append(firstSeparator + texts.get(i).getText(Language.DEFAULT()));
252 startSentence=false;
253 i++;
254 }
255 else if (!level.equals(0) && !level.equals(-1)){
256 if (!firstOne && levels.get(j-1).equals(0)){
257 if (i<texts.size()) sb.append(texts.get(i).getText(Language.DEFAULT()));
258 i++;
259 }
260 }
261 }
262 sb.append(secondSeparator);
263 String returnString = sb.toString();
264 returnString = StringUtils.replace(returnString, " ", " ");
265 returnString = StringUtils.removeStart(returnString, secondSeparator);
266 return TextData.NewInstance(StringUtils.replace(sb.toString(), " ", " "),Language.DEFAULT(),TextFormat.NewInstance("", "Text", ""));
267 }
268
269
270
271 /** recursive function that goes through a tree containing the order in which the description has to be generated,
272 * if an element of this tree matches one of the TaxonDescription, a DescriptionBuilder is called which returns a TextData with the corresponding description.
273 *
274 * @param children the children of the feature node considered
275 * @param parent the feature node considered
276 * @param description the TaxonDescription element for which we want a natural language output
277 * @param language The language in which the description has to be written
278 * @return a list of TextData elements containing the part of description corresponding to the feature node considered
279 */
280 private List<TextData> buildBranchesDescr(List<FeatureNode> children, FeatureNode parent, TaxonDescription description, List<Language> languages, int floor) {
281 List<TextData> listTextData = new ArrayList<TextData>();
282 floor++; // counter to know the current level in the tree
283
284 if (!parent.isLeaf()){ // if this node is not a leaf, continue recursively (only the leaves of a FeatureTree contain states)
285 levels.add(new Integer(floor)); // the level of the different nodes in the tree are kept, thus it is easier to build a structured text out of the List<TextData>
286 Feature feature = parent.getFeature();
287 TextData featureName;
288 if (feature!=null && feature.getLabel()!=null){ // if a node is associated to a feature
289 featureName = categoricalDescriptionBuilder.buildTextDataFeature(feature, languages);
290 levels.add(new Integer(-1)); // it is indicated by a '-1' after its level
291 listTextData.add(featureName); // the TextData representing the name of the feature is concatenated to the list
292 }
293 else featureName = new TextData(); // else an empty TextData is created (because we keep track of the features, it is useful to inform when the upper node has no feature attached)
294
295 for (Iterator<FeatureNode> ifn = children.iterator() ; ifn.hasNext() ;){
296 previousTextData = featureName; // this allows to keep track of the name of the feature one level up in the tree
297 FeatureNode fn = ifn.next();
298 listTextData.addAll(buildBranchesDescr(fn.getChildren(),fn,description, languages, floor));
299 }
300 }
301 else { //once a leaf is reached
302 Feature feature = parent.getFeature();
303 if (feature!=null && (feature.isSupportsQuantitativeData() || feature.isSupportsCategoricalData())) {
304 Set<DescriptionElementBase> elements = description.getElements();
305 for (Iterator<DescriptionElementBase> deb = elements.iterator() ; deb.hasNext() ;){ // iterates over all the descriptions enclosed in the TaxonDescription
306 DescriptionElementBase descriptionElement = deb.next();
307 if (descriptionElement.getFeature().equals(feature)){ // if one matches the corresponding feature associated to this leaf
308 if (descriptionElement instanceof CategoricalData || descriptionElement instanceof QuantitativeData){
309 TextData featureTextData;
310 TextData statesTextData;
311 if (descriptionElement instanceof CategoricalData) { // if this description is a CategoricalData, generate the according TextData
312 CategoricalData categoricalData = (CategoricalData) descriptionElement;
313 statesTextData = categoricalDescriptionBuilder.build(categoricalData, languages);
314 featureTextData = categoricalDescriptionBuilder.buildTextDataFeature(feature, languages);
315 }
316 else { // if this description is a QuantitativeData, generate the according TextData
317 QuantitativeData quantitativeData = (QuantitativeData) descriptionElement;
318 statesTextData = quantitativeDescriptionBuilder.build(quantitativeData, languages);
319 featureTextData = quantitativeDescriptionBuilder.buildTextDataFeature(feature, languages);
320 }
321 applyNaturalLanguageDescriptionElementProcessors(featureTextData, previousTextData);
322 levels.add(new Integer(0)); // 0 indicates a feature, which is a leaf of the tree
323 listTextData.add(featureTextData);
324 levels.add(new Integer(floor)); // this represents the level of the feature and means it is followed by a TextData containing the states of the feature
325 listTextData.add(statesTextData);
326 }
327 }
328 }
329 }
330 }
331 return listTextData;
332 }
333
334 }