1
|
package eu.etaxonomy.cdm.api.service;
|
2
|
|
3
|
import java.util.ArrayList;
|
4
|
import java.util.HashSet;
|
5
|
import java.util.Iterator;
|
6
|
import java.util.List;
|
7
|
import java.util.Map;
|
8
|
import java.util.Set;
|
9
|
|
10
|
import org.apache.commons.lang.StringUtils;
|
11
|
import org.apache.log4j.Logger;
|
12
|
import org.springframework.stereotype.Component;
|
13
|
|
14
|
import eu.etaxonomy.cdm.format.description.DefaultCategoricalDescriptionBuilder;
|
15
|
import eu.etaxonomy.cdm.format.description.DefaultQuantitativeDescriptionBuilder;
|
16
|
import eu.etaxonomy.cdm.format.description.DescriptionBuilder;
|
17
|
import eu.etaxonomy.cdm.model.common.Annotation;
|
18
|
import eu.etaxonomy.cdm.model.common.AnnotationType;
|
19
|
import eu.etaxonomy.cdm.model.common.Language;
|
20
|
import eu.etaxonomy.cdm.model.description.CategoricalData;
|
21
|
import eu.etaxonomy.cdm.model.description.DescriptionElementBase;
|
22
|
import eu.etaxonomy.cdm.model.description.Feature;
|
23
|
import eu.etaxonomy.cdm.model.description.QuantitativeData;
|
24
|
import eu.etaxonomy.cdm.model.description.TaxonDescription;
|
25
|
import eu.etaxonomy.cdm.model.description.TextData;
|
26
|
import eu.etaxonomy.cdm.model.description.TextFormat;
|
27
|
import eu.etaxonomy.cdm.model.term.TermTree;
|
28
|
import eu.etaxonomy.cdm.model.term.TermNode;
|
29
|
|
30
|
|
31
|
/**
|
32
|
* Generator of natural language descriptions from TaxonDescriptions.
|
33
|
*
|
34
|
* @author m.venin
|
35
|
* @since 13.04.2010
|
36
|
*/
|
37
|
@Component
|
38
|
public class NaturalLanguageGenerator implements INaturalLanguageGenerator {
|
39
|
@SuppressWarnings("unused")
|
40
|
private static final Logger logger = Logger.getLogger(NaturalLanguageGenerator.class);
|
41
|
|
42
|
private String firstSeparator = ",";
|
43
|
private String secondSeparator = ".";
|
44
|
private List<Integer> levels = new ArrayList<Integer>();
|
45
|
|
46
|
private DescriptionBuilder<QuantitativeData> quantitativeDescriptionBuilder = new DefaultQuantitativeDescriptionBuilder();
|
47
|
private DescriptionBuilder<CategoricalData> categoricalDescriptionBuilder = new DefaultCategoricalDescriptionBuilder();
|
48
|
|
49
|
private TextData previousTextData;
|
50
|
|
51
|
DeltaTextDataProcessor deltaTextDataProcessor = new DeltaTextDataProcessor();
|
52
|
|
53
|
private Map<String, INaturalLanguageTextDataProcessor> elementProcessors;
|
54
|
|
55
|
private Set<INaturalLanguageTextDataProcessor> applicableElementProcessors = new HashSet<INaturalLanguageTextDataProcessor>();
|
56
|
|
57
|
/**
|
58
|
* Change the first separator used by generateSingleTextData. By default ",".
|
59
|
*
|
60
|
* @param separator
|
61
|
*/
|
62
|
public void setFirstSeparator(String separator){
|
63
|
firstSeparator=separator;
|
64
|
}
|
65
|
|
66
|
public String getFirstSeparator(){
|
67
|
return firstSeparator;
|
68
|
}
|
69
|
|
70
|
/**
|
71
|
* Change the second separator used by generateSingleTextData. By default ".".
|
72
|
*
|
73
|
* @param separator
|
74
|
*/
|
75
|
public void setSecondSeparator(String separator){
|
76
|
secondSeparator=separator;
|
77
|
}
|
78
|
|
79
|
public String getSecondSeparator(){
|
80
|
return secondSeparator;
|
81
|
}
|
82
|
|
83
|
/**
|
84
|
* @param quantitativeDescriptionBuilder
|
85
|
*/
|
86
|
public void setQuantitativeDescriptionBuilder(DescriptionBuilder<QuantitativeData> quantitativeDescriptionBuilder){
|
87
|
this.quantitativeDescriptionBuilder = quantitativeDescriptionBuilder;
|
88
|
}
|
89
|
|
90
|
/**
|
91
|
* @param categoricalDescriptionBuilder
|
92
|
*/
|
93
|
public void setCategoricalDescriptionBuilder(DescriptionBuilder<CategoricalData> categoricalDescriptionBuilder){
|
94
|
this.categoricalDescriptionBuilder = categoricalDescriptionBuilder;
|
95
|
}
|
96
|
|
97
|
/**
|
98
|
* @return the element processors of this generator
|
99
|
*/
|
100
|
public Map<String, INaturalLanguageTextDataProcessor> getElementProcessors() {
|
101
|
return elementProcessors;
|
102
|
}
|
103
|
|
104
|
/**
|
105
|
* The keys of the elementProcessors map are regular expressions which are
|
106
|
* being used to identify the those Descriptions to which the mapped
|
107
|
* NaturalLanguageTextDataProcessor is applicable.
|
108
|
*
|
109
|
* @param elementProcessors
|
110
|
*/
|
111
|
public void setElementProcessors(
|
112
|
Map<String, INaturalLanguageTextDataProcessor> elementProcessors) {
|
113
|
this.elementProcessors = elementProcessors;
|
114
|
}
|
115
|
|
116
|
/**
|
117
|
* Looks for technical annotations, if one matches a regular expression of the element processors
|
118
|
* the associated processor is added to the applicable element processors which will then be applied
|
119
|
* when generating the description.
|
120
|
*
|
121
|
* @param annotations the set of annotations of the description
|
122
|
*/
|
123
|
private void initNaturalLanguageDescriptionElementProcessors(Set<Annotation> annotations) {
|
124
|
|
125
|
if(annotations != null){
|
126
|
for(Annotation annotation : annotations){
|
127
|
if(annotation.getAnnotationType().equals(AnnotationType.TECHNICAL())){
|
128
|
if (elementProcessors!=null){
|
129
|
for(String regex : elementProcessors.keySet()){
|
130
|
if(annotation.getText().matches(regex)){
|
131
|
applicableElementProcessors.add(elementProcessors.get(regex));
|
132
|
}
|
133
|
}
|
134
|
}
|
135
|
}
|
136
|
}
|
137
|
}
|
138
|
}
|
139
|
|
140
|
|
141
|
/**
|
142
|
* Applies the list of applicable processors to a TextData.
|
143
|
*
|
144
|
* @param textData the TextData to be modified
|
145
|
* @param previousTextData the TextData corresponding to the feature of the previous level in the tree
|
146
|
*/
|
147
|
private void applyNaturalLanguageDescriptionElementProcessors(TextData textData, TextData previousTextData){
|
148
|
for(INaturalLanguageTextDataProcessor processor : applicableElementProcessors){
|
149
|
processor.process(textData, previousTextData);
|
150
|
}
|
151
|
}
|
152
|
|
153
|
|
154
|
/**
|
155
|
* The most simple function to generate a description. The language used is the default one.
|
156
|
*
|
157
|
* @param featureTree the FeatureTree holding the order in which features and their states must be printed
|
158
|
* @param description the TaxonDescription with all the data
|
159
|
*
|
160
|
* @return a list of TextData, each one being a basic element of the natural language description
|
161
|
*/
|
162
|
@Override
|
163
|
public List<TextData> generateNaturalLanguageDescription(TermTree featureTree,TaxonDescription description) {
|
164
|
return generateNaturalLanguageDescription(featureTree,description,Language.DEFAULT());
|
165
|
}
|
166
|
|
167
|
|
168
|
|
169
|
/**
|
170
|
* Generate a description in a specified language.
|
171
|
*
|
172
|
* @param featureTree the FeatureTree holding the order in which features and their states must be printed
|
173
|
* @param description the TaxonDescription with all the data
|
174
|
* @param language the language in which the description has to be printed
|
175
|
*
|
176
|
* @return a list of TextData, each one being a basic element of the natural language description
|
177
|
*/
|
178
|
@Override
|
179
|
public List<TextData> generateNaturalLanguageDescription(TermTree featureTree, TaxonDescription description, Language language) {
|
180
|
List<Language> languages = new ArrayList<Language>();
|
181
|
languages.add(language);
|
182
|
initNaturalLanguageDescriptionElementProcessors(description.getAnnotations());
|
183
|
return generatePreferredNaturalLanguageDescription(featureTree,description,languages);
|
184
|
}
|
185
|
|
186
|
/**
|
187
|
* Generate a description with a specified list of preferred languages.
|
188
|
*
|
189
|
* @param featureTree the FeatureTree holding the order in which features and their states must be printed
|
190
|
* @param description the TaxonDescription with all the data
|
191
|
* @param languages the ordered list of languages preferred for printing the description
|
192
|
*
|
193
|
* @return a list of TextData, each one being a basic element of the natural language description
|
194
|
*/
|
195
|
@Override
|
196
|
public List<TextData> generatePreferredNaturalLanguageDescription(TermTree featureTree,TaxonDescription description, List<Language> languages) {
|
197
|
initNaturalLanguageDescriptionElementProcessors(description.getAnnotations());
|
198
|
return buildBranchesDescr(featureTree.getRootChildren(), featureTree.getRoot(), description, languages,0);
|
199
|
}
|
200
|
|
201
|
/**
|
202
|
* Generate a description as a single paragraph in a TextData.
|
203
|
*
|
204
|
* @param featureTree the FeatureTree holding the order in which features and their states must be printed
|
205
|
* @param description the TaxonDescription with all the data
|
206
|
*
|
207
|
* @return a TextData in the default language.
|
208
|
*/
|
209
|
@Override
|
210
|
public TextData generateSingleTextData(TermTree featureTree, TaxonDescription description) {
|
211
|
return generateSingleTextData(featureTree,description,Language.DEFAULT());
|
212
|
}
|
213
|
|
214
|
/**
|
215
|
* Generate a description as a single paragraph in a TextData.
|
216
|
*
|
217
|
* @param featureTree the FeatureTree holding the order in which features and their states must be printed
|
218
|
* @param description the TaxonDescription with all the data
|
219
|
* @param language the language in which the description has to be printed
|
220
|
*
|
221
|
* @return a TextData in the specified language.
|
222
|
*/
|
223
|
@Override
|
224
|
public TextData generateSingleTextData(TermTree featureTree, TaxonDescription description, Language language) {
|
225
|
List<Language> languages = new ArrayList<Language>();
|
226
|
languages.add(language);
|
227
|
return generatePreferredSingleTextData(featureTree,description,languages);
|
228
|
}
|
229
|
|
230
|
/**
|
231
|
* Generate a description with a specified list of preferred languages.
|
232
|
*
|
233
|
* @param featureTree the FeatureTree holding the order in which features and their states must be printed
|
234
|
* @param description the TaxonDescription with all the data
|
235
|
* @param languages the ordered list of languages preferred for printing the description
|
236
|
*
|
237
|
* @return a TextData using the languages (in the given order of preference)
|
238
|
*/
|
239
|
@Override
|
240
|
public TextData generatePreferredSingleTextData(TermTree featureTree, TaxonDescription description, List<Language> languages) {
|
241
|
levels.clear(); // before the start, the table containing the levels of each node must be cleared
|
242
|
// Note: this is not the most efficient way to keep track of the levels of the nodes but it allows some flexibility
|
243
|
List<TextData> texts = generatePreferredNaturalLanguageDescription(featureTree,description, languages);// first get the description as a raw list of TextData
|
244
|
|
245
|
StringBuilder descriptionStringBuilder = new StringBuilder(); // the StringBuilder used to generate the description
|
246
|
int i = 0,j,level; // i is used to store the index of the TextData to use
|
247
|
boolean startSentence = false, firstOne = true;
|
248
|
|
249
|
for (j=0 ; j<levels.size() ; j++){
|
250
|
level = levels.get(j);
|
251
|
if (level==-1){
|
252
|
if ((j+1)<levels.size() && levels.get(j+1).equals(0)){ // if this node is the direct father of a leaf
|
253
|
descriptionStringBuilder.append(secondSeparator + " ");
|
254
|
startSentence=true;
|
255
|
firstOne=false;
|
256
|
String asString = texts.get(i).getText(Language.DEFAULT()).toString();
|
257
|
if (asString.length()>1) {
|
258
|
descriptionStringBuilder.append(asString.substring(0,1).toUpperCase() + asString.substring(1));
|
259
|
}
|
260
|
}
|
261
|
i++;
|
262
|
}
|
263
|
else if (level==0) { // if this node is a leaf
|
264
|
if (startSentence) {
|
265
|
descriptionStringBuilder.append(texts.get(i).getText(Language.DEFAULT()));
|
266
|
} else {
|
267
|
descriptionStringBuilder.append(firstSeparator + texts.get(i).getText(Language.DEFAULT()));
|
268
|
}
|
269
|
startSentence=false;
|
270
|
i++;
|
271
|
}
|
272
|
else {
|
273
|
if (!firstOne && levels.get(j-1).equals(0)){ // if this node corresponds to the states linked to the previous leaf
|
274
|
if (i<texts.size()) {
|
275
|
descriptionStringBuilder.append(texts.get(i).getText(Language.DEFAULT()));
|
276
|
}
|
277
|
i++;
|
278
|
}
|
279
|
}
|
280
|
}
|
281
|
descriptionStringBuilder.append(secondSeparator);
|
282
|
String returnString = descriptionStringBuilder.toString();
|
283
|
returnString = StringUtils.replace(returnString, " ", " ");
|
284
|
returnString = StringUtils.removeStart(returnString, secondSeparator + " ");
|
285
|
return TextData.NewInstance(returnString,Language.DEFAULT(),TextFormat.NewInstance("", "Text", ""));
|
286
|
}
|
287
|
|
288
|
|
289
|
|
290
|
/** recursive function that goes through a tree containing the order in which the description has to be generated,
|
291
|
* if an element of this tree matches one of the TaxonDescription, a DescriptionBuilder is called which returns a TextData with the corresponding description.
|
292
|
*
|
293
|
* @param children the children of the feature node considered
|
294
|
* @param parent the feature node considered
|
295
|
* @param description the TaxonDescription element for which we want a natural language output
|
296
|
* @param language The language in which the description has to be written
|
297
|
* @param floor integer to keep track of the level in the tree
|
298
|
* @return a list of TextData elements containing the part of description corresponding to the feature node considered
|
299
|
*/
|
300
|
private List<TextData> buildBranchesDescr(List<TermNode> children, TermNode<Feature> parent, TaxonDescription description, List<Language> languages, int floor) {
|
301
|
List<TextData> listTextData = new ArrayList<TextData>();
|
302
|
floor++; // counter to know the current level in the tree
|
303
|
|
304
|
if (!parent.isLeaf()){ // if this node is not a leaf, continue recursively (only the leaves of a FeatureTree contain states)
|
305
|
levels.add(new Integer(floor)); // the level of the different nodes in the tree are kept, thus it is easier to build a structured text out of the List<TextData>
|
306
|
Feature feature = parent.getTerm();
|
307
|
TextData featureName;
|
308
|
if (feature!=null && feature.getLabel()!=null){ // if a node is associated to a feature
|
309
|
featureName = categoricalDescriptionBuilder.buildTextDataFeature(feature, languages);
|
310
|
levels.add(new Integer(-1)); // it is indicated by a '-1' after its level
|
311
|
listTextData.add(featureName); // the TextData representing the name of the feature is concatenated to the list
|
312
|
}
|
313
|
else {
|
314
|
featureName = new TextData(); // else an empty TextData is created (because we keep track of the features, it is useful to inform when the upper node has no feature attached)
|
315
|
}
|
316
|
|
317
|
for (Iterator<TermNode> ifn = children.iterator() ; ifn.hasNext() ;){
|
318
|
previousTextData = featureName; // this allows to keep track of the name of the feature one level up in the tree
|
319
|
TermNode fn = ifn.next();
|
320
|
listTextData.addAll(buildBranchesDescr(fn.getChildNodes(),fn,description, languages, floor));
|
321
|
}
|
322
|
}
|
323
|
else { //once a leaf is reached
|
324
|
Feature feature = parent.getTerm();
|
325
|
if (feature!=null && (feature.isSupportsQuantitativeData() || feature.isSupportsCategoricalData())) {
|
326
|
Set<DescriptionElementBase> elements = description.getElements();
|
327
|
for (Iterator<DescriptionElementBase> deb = elements.iterator() ; deb.hasNext() ;){ // iterates over all the descriptions enclosed in the TaxonDescription
|
328
|
DescriptionElementBase descriptionElement = deb.next();
|
329
|
if (descriptionElement.getFeature().equals(feature)){ // if one matches the corresponding feature associated to this leaf
|
330
|
if (descriptionElement instanceof CategoricalData || descriptionElement instanceof QuantitativeData){
|
331
|
TextData featureTextData;
|
332
|
TextData statesTextData;
|
333
|
if (descriptionElement instanceof CategoricalData) { // if this description is a CategoricalData, generate the according TextData
|
334
|
CategoricalData categoricalData = (CategoricalData) descriptionElement;
|
335
|
statesTextData = categoricalDescriptionBuilder.build(categoricalData, languages);
|
336
|
featureTextData = categoricalDescriptionBuilder.buildTextDataFeature(feature, languages);
|
337
|
}
|
338
|
else { // if this description is a QuantitativeData, generate the according TextData
|
339
|
QuantitativeData quantitativeData = (QuantitativeData) descriptionElement;
|
340
|
statesTextData = quantitativeDescriptionBuilder.build(quantitativeData, languages);
|
341
|
featureTextData = quantitativeDescriptionBuilder.buildTextDataFeature(feature, languages);
|
342
|
}
|
343
|
applyNaturalLanguageDescriptionElementProcessors(featureTextData, previousTextData);
|
344
|
levels.add(new Integer(0)); // 0 indicates a feature, which is a leaf of the tree
|
345
|
listTextData.add(featureTextData);
|
346
|
levels.add(new Integer(floor)); // this represents the level of the feature and means it is followed by a TextData containing the states of the feature
|
347
|
listTextData.add(statesTextData);
|
348
|
}
|
349
|
}
|
350
|
}
|
351
|
}
|
352
|
}
|
353
|
return listTextData;
|
354
|
}
|
355
|
|
356
|
}
|