1
|
package eu.etaxonomy.cdm.api.service;
|
2
|
|
3
|
import java.util.ArrayList;
|
4
|
import java.util.HashSet;
|
5
|
import java.util.Iterator;
|
6
|
import java.util.List;
|
7
|
import java.util.Map;
|
8
|
import java.util.Set;
|
9
|
|
10
|
import org.apache.commons.lang.StringUtils;
|
11
|
import org.apache.log4j.Logger;
|
12
|
import org.springframework.stereotype.Component;
|
13
|
|
14
|
import eu.etaxonomy.cdm.model.description.CategoricalData;
|
15
|
import eu.etaxonomy.cdm.model.description.DescriptionElementBase;
|
16
|
import eu.etaxonomy.cdm.model.description.Feature;
|
17
|
import eu.etaxonomy.cdm.model.description.FeatureNode;
|
18
|
import eu.etaxonomy.cdm.model.description.FeatureTree;
|
19
|
import eu.etaxonomy.cdm.model.description.QuantitativeData;
|
20
|
import eu.etaxonomy.cdm.model.description.TaxonDescription;
|
21
|
import eu.etaxonomy.cdm.model.description.TextData;
|
22
|
import eu.etaxonomy.cdm.model.description.TextFormat;
|
23
|
import eu.etaxonomy.cdm.model.common.Annotation;
|
24
|
import eu.etaxonomy.cdm.model.common.AnnotationType;
|
25
|
import eu.etaxonomy.cdm.model.common.Language;
|
26
|
|
27
|
|
28
|
/**
|
29
|
* Generator of natural language descriptions from TaxonDescriptions.
|
30
|
*
|
31
|
* @author m.venin
|
32
|
* @created 13.04.2010
|
33
|
* @version 1.0
|
34
|
*/
|
35
|
@Component
|
36
|
public class NaturalLanguageGenerator implements INaturalLanguageGenerator {
|
37
|
@SuppressWarnings("unused")
|
38
|
private static final Logger logger = Logger.getLogger(NaturalLanguageGenerator.class);
|
39
|
|
40
|
private String firstSeparator = ",";
|
41
|
private String secondSeparator = ".";
|
42
|
private List<Integer> levels = new ArrayList<Integer>();
|
43
|
|
44
|
private DescriptionBuilder<QuantitativeData> quantitativeDescriptionBuilder = new DefaultQuantitativeDescriptionBuilder();
|
45
|
private DescriptionBuilder<CategoricalData> categoricalDescriptionBuilder = new DefaultCategoricalDescriptionBuilder();
|
46
|
|
47
|
private TextData previousTextData;
|
48
|
|
49
|
DeltaTextDataProcessor deltaTextDataProcessor = new DeltaTextDataProcessor();
|
50
|
|
51
|
private Map<String, INaturalLanguageTextDataProcessor> elementProcessors;
|
52
|
|
53
|
private Set<INaturalLanguageTextDataProcessor> applicableElementProcessors = new HashSet<INaturalLanguageTextDataProcessor>();
|
54
|
|
55
|
/**
|
56
|
* Change the first separator used by generateSingleTextData. By default ",".
|
57
|
*
|
58
|
* @param separator
|
59
|
*/
|
60
|
public void setFirstSeparator(String separator){
|
61
|
firstSeparator=separator;
|
62
|
}
|
63
|
|
64
|
public String getFirstSeparator(){
|
65
|
return firstSeparator;
|
66
|
}
|
67
|
|
68
|
/**
|
69
|
* Change the second separator used by generateSingleTextData. By default ".".
|
70
|
*
|
71
|
* @param separator
|
72
|
*/
|
73
|
public void setSecondSeparator(String separator){
|
74
|
secondSeparator=separator;
|
75
|
}
|
76
|
|
77
|
public String getSecondSeparator(){
|
78
|
return secondSeparator;
|
79
|
}
|
80
|
|
81
|
/**
|
82
|
* @param quantitativeDescriptionBuilder
|
83
|
*/
|
84
|
public void setQuantitativeDescriptionBuilder(DescriptionBuilder<QuantitativeData> quantitativeDescriptionBuilder){
|
85
|
this.quantitativeDescriptionBuilder = quantitativeDescriptionBuilder;
|
86
|
}
|
87
|
|
88
|
/**
|
89
|
* @param categoricalDescriptionBuilder
|
90
|
*/
|
91
|
public void setCategoricalDescriptionBuilder(DescriptionBuilder<CategoricalData> categoricalDescriptionBuilder){
|
92
|
this.categoricalDescriptionBuilder = categoricalDescriptionBuilder;
|
93
|
}
|
94
|
|
95
|
/**
|
96
|
* @return the element processors of this generator
|
97
|
*/
|
98
|
public Map<String, INaturalLanguageTextDataProcessor> getElementProcessors() {
|
99
|
return elementProcessors;
|
100
|
}
|
101
|
|
102
|
/**
|
103
|
* The keys of the elementProcessors map are regular expressions which are
|
104
|
* being used to identify the those Descriptions to which the mapped
|
105
|
* NaturalLanguageTextDataProcessor is applicable.
|
106
|
*
|
107
|
* @param elementProcessors
|
108
|
*/
|
109
|
public void setElementProcessors(
|
110
|
Map<String, INaturalLanguageTextDataProcessor> elementProcessors) {
|
111
|
this.elementProcessors = elementProcessors;
|
112
|
}
|
113
|
|
114
|
/**
|
115
|
* Looks for technical annotations, if one matches a regular expression of the element processors
|
116
|
* the associated processor is added to the applicable element processors which will then be applied
|
117
|
* when generating the description.
|
118
|
*
|
119
|
* @param annotations the set of annotations of the description
|
120
|
*/
|
121
|
private void initNaturalLanguageDescriptionElementProcessors(Set<Annotation> annotations) {
|
122
|
|
123
|
if(annotations != null){
|
124
|
for(Annotation annotation : annotations){
|
125
|
if(annotation.getAnnotationType().equals(AnnotationType.TECHNICAL())){
|
126
|
if (elementProcessors!=null){
|
127
|
for(String regex : elementProcessors.keySet()){
|
128
|
if(annotation.getText().matches(regex)){
|
129
|
applicableElementProcessors.add(elementProcessors.get(regex));
|
130
|
}
|
131
|
}
|
132
|
}
|
133
|
}
|
134
|
}
|
135
|
}
|
136
|
}
|
137
|
|
138
|
|
139
|
/**
|
140
|
* Applies the list of applicable processors to a TextData.
|
141
|
*
|
142
|
* @param textData the TextData to be modified
|
143
|
* @param previousTextData the TextData corresponding to the feature of the previous level in the tree
|
144
|
*/
|
145
|
private void applyNaturalLanguageDescriptionElementProcessors(TextData textData, TextData previousTextData){
|
146
|
for(INaturalLanguageTextDataProcessor processor : applicableElementProcessors){
|
147
|
processor.process(textData, previousTextData);
|
148
|
}
|
149
|
}
|
150
|
|
151
|
|
152
|
/**
|
153
|
* The most simple function to generate a description. The language used is the default one.
|
154
|
*
|
155
|
* @param featureTree the FeatureTree holding the order in which features and their states must be printed
|
156
|
* @param description the TaxonDescription with all the data
|
157
|
*
|
158
|
* @return a list of TextData, each one being a basic element of the natural language description
|
159
|
*/
|
160
|
public List<TextData> generateNaturalLanguageDescription(FeatureTree featureTree,TaxonDescription description) {
|
161
|
return generateNaturalLanguageDescription(featureTree,description,Language.DEFAULT());
|
162
|
}
|
163
|
|
164
|
|
165
|
|
166
|
/**
|
167
|
* Generate a description in a specified language.
|
168
|
*
|
169
|
* @param featureTree the FeatureTree holding the order in which features and their states must be printed
|
170
|
* @param description the TaxonDescription with all the data
|
171
|
* @param language the language in which the description has to be printed
|
172
|
*
|
173
|
* @return a list of TextData, each one being a basic element of the natural language description
|
174
|
*/
|
175
|
public List<TextData> generateNaturalLanguageDescription(FeatureTree featureTree, TaxonDescription description, Language language) {
|
176
|
List<Language> languages = new ArrayList<Language>();
|
177
|
languages.add(language);
|
178
|
initNaturalLanguageDescriptionElementProcessors(description.getAnnotations());
|
179
|
return generatePreferredNaturalLanguageDescription(featureTree,description,languages);
|
180
|
}
|
181
|
|
182
|
/**
|
183
|
* Generate a description with a specified list of preferred languages.
|
184
|
*
|
185
|
* @param featureTree the FeatureTree holding the order in which features and their states must be printed
|
186
|
* @param description the TaxonDescription with all the data
|
187
|
* @param languages the ordered list of languages preferred for printing the description
|
188
|
*
|
189
|
* @return a list of TextData, each one being a basic element of the natural language description
|
190
|
*/
|
191
|
public List<TextData> generatePreferredNaturalLanguageDescription(FeatureTree featureTree,TaxonDescription description, List<Language> languages) {
|
192
|
initNaturalLanguageDescriptionElementProcessors(description.getAnnotations());
|
193
|
return buildBranchesDescr(featureTree.getRootChildren(), featureTree.getRoot(), description, languages,0);
|
194
|
}
|
195
|
|
196
|
/**
|
197
|
* Generate a description as a single paragraph in a TextData.
|
198
|
*
|
199
|
* @param featureTree the FeatureTree holding the order in which features and their states must be printed
|
200
|
* @param description the TaxonDescription with all the data
|
201
|
*
|
202
|
* @return a TextData in the default language.
|
203
|
*/
|
204
|
public TextData generateSingleTextData(FeatureTree featureTree, TaxonDescription description) {
|
205
|
return generateSingleTextData(featureTree,description,Language.DEFAULT());
|
206
|
}
|
207
|
|
208
|
/**
|
209
|
* Generate a description as a single paragraph in a TextData.
|
210
|
*
|
211
|
* @param featureTree the FeatureTree holding the order in which features and their states must be printed
|
212
|
* @param description the TaxonDescription with all the data
|
213
|
* @param language the language in which the description has to be printed
|
214
|
*
|
215
|
* @return a TextData in the specified language.
|
216
|
*/
|
217
|
public TextData generateSingleTextData(FeatureTree featureTree, TaxonDescription description, Language language) {
|
218
|
List<Language> languages = new ArrayList<Language>();
|
219
|
languages.add(language);
|
220
|
return generatePreferredSingleTextData(featureTree,description,languages);
|
221
|
}
|
222
|
|
223
|
/**
|
224
|
* Generate a description with a specified list of preferred languages.
|
225
|
*
|
226
|
* @param featureTree the FeatureTree holding the order in which features and their states must be printed
|
227
|
* @param description the TaxonDescription with all the data
|
228
|
* @param languages the ordered list of languages preferred for printing the description
|
229
|
*
|
230
|
* @return a TextData using the languages (in the given order of preference)
|
231
|
*/
|
232
|
public TextData generatePreferredSingleTextData(FeatureTree featureTree, TaxonDescription description, List<Language> languages) {
|
233
|
levels.clear(); // before the start, the table containing the levels of each node must be cleared
|
234
|
// Note: this is not the most efficient way to keep track of the levels of the nodes but it allows some flexibility
|
235
|
List<TextData> texts = generatePreferredNaturalLanguageDescription(featureTree,description, languages);// first get the description as a raw list of TextData
|
236
|
|
237
|
StringBuilder descriptionStringBuilder = new StringBuilder(); // the StringBuilder used to generate the description
|
238
|
int i = 0,j,level; // i is used to store the index of the TextData to use
|
239
|
boolean startSentence = false, firstOne = true;
|
240
|
|
241
|
for (j=0 ; j<levels.size() ; j++){
|
242
|
level = levels.get(j);
|
243
|
if (level==-1){
|
244
|
if ((j+1)<levels.size() && levels.get(j+1).equals(0)){ // if this node is the direct father of a leaf
|
245
|
descriptionStringBuilder.append(secondSeparator + " ");
|
246
|
startSentence=true;
|
247
|
firstOne=false;
|
248
|
String asString = texts.get(i).getText(Language.DEFAULT()).toString();
|
249
|
if (asString.length()>1) descriptionStringBuilder.append(asString.substring(0,1).toUpperCase() + asString.substring(1));
|
250
|
}
|
251
|
i++;
|
252
|
}
|
253
|
else if (level==0) { // if this node is a leaf
|
254
|
if (startSentence) descriptionStringBuilder.append(texts.get(i).getText(Language.DEFAULT()));
|
255
|
else descriptionStringBuilder.append(firstSeparator + texts.get(i).getText(Language.DEFAULT()));
|
256
|
startSentence=false;
|
257
|
i++;
|
258
|
}
|
259
|
else {
|
260
|
if (!firstOne && levels.get(j-1).equals(0)){ // if this node corresponds to the states linked to the previous leaf
|
261
|
if (i<texts.size()) descriptionStringBuilder.append(texts.get(i).getText(Language.DEFAULT()));
|
262
|
i++;
|
263
|
}
|
264
|
}
|
265
|
}
|
266
|
descriptionStringBuilder.append(secondSeparator);
|
267
|
String returnString = descriptionStringBuilder.toString();
|
268
|
returnString = StringUtils.replace(returnString, " ", " ");
|
269
|
returnString = StringUtils.removeStart(returnString, secondSeparator + " ");
|
270
|
return TextData.NewInstance(returnString,Language.DEFAULT(),TextFormat.NewInstance("", "Text", ""));
|
271
|
}
|
272
|
|
273
|
|
274
|
|
275
|
/** recursive function that goes through a tree containing the order in which the description has to be generated,
|
276
|
* if an element of this tree matches one of the TaxonDescription, a DescriptionBuilder is called which returns a TextData with the corresponding description.
|
277
|
*
|
278
|
* @param children the children of the feature node considered
|
279
|
* @param parent the feature node considered
|
280
|
* @param description the TaxonDescription element for which we want a natural language output
|
281
|
* @param language The language in which the description has to be written
|
282
|
* @param floor integer to keep track of the level in the tree
|
283
|
* @return a list of TextData elements containing the part of description corresponding to the feature node considered
|
284
|
*/
|
285
|
private List<TextData> buildBranchesDescr(List<FeatureNode> children, FeatureNode parent, TaxonDescription description, List<Language> languages, int floor) {
|
286
|
List<TextData> listTextData = new ArrayList<TextData>();
|
287
|
floor++; // counter to know the current level in the tree
|
288
|
|
289
|
if (!parent.isLeaf()){ // if this node is not a leaf, continue recursively (only the leaves of a FeatureTree contain states)
|
290
|
levels.add(new Integer(floor)); // the level of the different nodes in the tree are kept, thus it is easier to build a structured text out of the List<TextData>
|
291
|
Feature feature = parent.getFeature();
|
292
|
TextData featureName;
|
293
|
if (feature!=null && feature.getLabel()!=null){ // if a node is associated to a feature
|
294
|
featureName = categoricalDescriptionBuilder.buildTextDataFeature(feature, languages);
|
295
|
levels.add(new Integer(-1)); // it is indicated by a '-1' after its level
|
296
|
listTextData.add(featureName); // the TextData representing the name of the feature is concatenated to the list
|
297
|
}
|
298
|
else featureName = new TextData(); // else an empty TextData is created (because we keep track of the features, it is useful to inform when the upper node has no feature attached)
|
299
|
|
300
|
for (Iterator<FeatureNode> ifn = children.iterator() ; ifn.hasNext() ;){
|
301
|
previousTextData = featureName; // this allows to keep track of the name of the feature one level up in the tree
|
302
|
FeatureNode fn = ifn.next();
|
303
|
listTextData.addAll(buildBranchesDescr(fn.getChildNodes(),fn,description, languages, floor));
|
304
|
}
|
305
|
}
|
306
|
else { //once a leaf is reached
|
307
|
Feature feature = parent.getFeature();
|
308
|
if (feature!=null && (feature.isSupportsQuantitativeData() || feature.isSupportsCategoricalData())) {
|
309
|
Set<DescriptionElementBase> elements = description.getElements();
|
310
|
for (Iterator<DescriptionElementBase> deb = elements.iterator() ; deb.hasNext() ;){ // iterates over all the descriptions enclosed in the TaxonDescription
|
311
|
DescriptionElementBase descriptionElement = deb.next();
|
312
|
if (descriptionElement.getFeature().equals(feature)){ // if one matches the corresponding feature associated to this leaf
|
313
|
if (descriptionElement instanceof CategoricalData || descriptionElement instanceof QuantitativeData){
|
314
|
TextData featureTextData;
|
315
|
TextData statesTextData;
|
316
|
if (descriptionElement instanceof CategoricalData) { // if this description is a CategoricalData, generate the according TextData
|
317
|
CategoricalData categoricalData = (CategoricalData) descriptionElement;
|
318
|
statesTextData = categoricalDescriptionBuilder.build(categoricalData, languages);
|
319
|
featureTextData = categoricalDescriptionBuilder.buildTextDataFeature(feature, languages);
|
320
|
}
|
321
|
else { // if this description is a QuantitativeData, generate the according TextData
|
322
|
QuantitativeData quantitativeData = (QuantitativeData) descriptionElement;
|
323
|
statesTextData = quantitativeDescriptionBuilder.build(quantitativeData, languages);
|
324
|
featureTextData = quantitativeDescriptionBuilder.buildTextDataFeature(feature, languages);
|
325
|
}
|
326
|
applyNaturalLanguageDescriptionElementProcessors(featureTextData, previousTextData);
|
327
|
levels.add(new Integer(0)); // 0 indicates a feature, which is a leaf of the tree
|
328
|
listTextData.add(featureTextData);
|
329
|
levels.add(new Integer(floor)); // this represents the level of the feature and means it is followed by a TextData containing the states of the feature
|
330
|
listTextData.add(statesTextData);
|
331
|
}
|
332
|
}
|
333
|
}
|
334
|
}
|
335
|
}
|
336
|
return listTextData;
|
337
|
}
|
338
|
|
339
|
}
|