1
|
package eu.etaxonomy.cdm.api.service;
|
2
|
|
3
|
import java.util.ArrayList;
|
4
|
import java.util.HashSet;
|
5
|
import java.util.Iterator;
|
6
|
import java.util.List;
|
7
|
import java.util.Map;
|
8
|
import java.util.Set;
|
9
|
|
10
|
import org.apache.commons.lang.StringUtils;
|
11
|
import org.apache.logging.log4j.LogManager;import org.apache.logging.log4j.Logger;
|
12
|
import org.springframework.stereotype.Component;
|
13
|
|
14
|
import eu.etaxonomy.cdm.format.description.DefaultCategoricalDescriptionBuilder;
|
15
|
import eu.etaxonomy.cdm.format.description.DefaultQuantitativeDescriptionBuilder;
|
16
|
import eu.etaxonomy.cdm.format.description.DescriptionBuilder;
|
17
|
import eu.etaxonomy.cdm.model.common.Annotation;
|
18
|
import eu.etaxonomy.cdm.model.common.AnnotationType;
|
19
|
import eu.etaxonomy.cdm.model.common.Language;
|
20
|
import eu.etaxonomy.cdm.model.description.CategoricalData;
|
21
|
import eu.etaxonomy.cdm.model.description.DescriptionElementBase;
|
22
|
import eu.etaxonomy.cdm.model.description.Feature;
|
23
|
import eu.etaxonomy.cdm.model.description.QuantitativeData;
|
24
|
import eu.etaxonomy.cdm.model.description.TaxonDescription;
|
25
|
import eu.etaxonomy.cdm.model.description.TextData;
|
26
|
import eu.etaxonomy.cdm.model.description.TextFormat;
|
27
|
import eu.etaxonomy.cdm.model.term.TermTree;
|
28
|
import eu.etaxonomy.cdm.model.term.TermNode;
|
29
|
|
30
|
|
31
|
/**
|
32
|
* Generator of natural language descriptions from TaxonDescriptions.
|
33
|
*
|
34
|
* @author m.venin
|
35
|
* @since 13.04.2010
|
36
|
*/
|
37
|
@Component
|
38
|
public class NaturalLanguageGenerator implements INaturalLanguageGenerator {
|
39
|
|
40
|
@SuppressWarnings("unused")
|
41
|
private static final Logger logger = LogManager.getLogger(NaturalLanguageGenerator.class);
|
42
|
|
43
|
private String firstSeparator = ",";
|
44
|
private String secondSeparator = ".";
|
45
|
private List<Integer> levels = new ArrayList<>();
|
46
|
|
47
|
private DescriptionBuilder<QuantitativeData> quantitativeDescriptionBuilder = new DefaultQuantitativeDescriptionBuilder();
|
48
|
private DescriptionBuilder<CategoricalData> categoricalDescriptionBuilder = new DefaultCategoricalDescriptionBuilder();
|
49
|
|
50
|
private TextData previousTextData;
|
51
|
|
52
|
private DeltaTextDataProcessor deltaTextDataProcessor = new DeltaTextDataProcessor();
|
53
|
|
54
|
private Map<String, INaturalLanguageTextDataProcessor> elementProcessors;
|
55
|
|
56
|
private Set<INaturalLanguageTextDataProcessor> applicableElementProcessors = new HashSet<INaturalLanguageTextDataProcessor>();
|
57
|
|
58
|
/**
|
59
|
* Change the first separator used by generateSingleTextData. By default ",".
|
60
|
*
|
61
|
* @param separator
|
62
|
*/
|
63
|
public void setFirstSeparator(String separator){
|
64
|
firstSeparator=separator;
|
65
|
}
|
66
|
|
67
|
public String getFirstSeparator(){
|
68
|
return firstSeparator;
|
69
|
}
|
70
|
|
71
|
/**
|
72
|
* Change the second separator used by generateSingleTextData. By default ".".
|
73
|
*
|
74
|
* @param separator
|
75
|
*/
|
76
|
public void setSecondSeparator(String separator){
|
77
|
secondSeparator=separator;
|
78
|
}
|
79
|
|
80
|
public String getSecondSeparator(){
|
81
|
return secondSeparator;
|
82
|
}
|
83
|
|
84
|
/**
|
85
|
* @param quantitativeDescriptionBuilder
|
86
|
*/
|
87
|
public void setQuantitativeDescriptionBuilder(DescriptionBuilder<QuantitativeData> quantitativeDescriptionBuilder){
|
88
|
this.quantitativeDescriptionBuilder = quantitativeDescriptionBuilder;
|
89
|
}
|
90
|
|
91
|
/**
|
92
|
* @param categoricalDescriptionBuilder
|
93
|
*/
|
94
|
public void setCategoricalDescriptionBuilder(DescriptionBuilder<CategoricalData> categoricalDescriptionBuilder){
|
95
|
this.categoricalDescriptionBuilder = categoricalDescriptionBuilder;
|
96
|
}
|
97
|
|
98
|
/**
|
99
|
* @return the element processors of this generator
|
100
|
*/
|
101
|
public Map<String, INaturalLanguageTextDataProcessor> getElementProcessors() {
|
102
|
return elementProcessors;
|
103
|
}
|
104
|
|
105
|
/**
|
106
|
* The keys of the elementProcessors map are regular expressions which are
|
107
|
* being used to identify the those Descriptions to which the mapped
|
108
|
* NaturalLanguageTextDataProcessor is applicable.
|
109
|
*
|
110
|
* @param elementProcessors
|
111
|
*/
|
112
|
public void setElementProcessors(
|
113
|
Map<String, INaturalLanguageTextDataProcessor> elementProcessors) {
|
114
|
this.elementProcessors = elementProcessors;
|
115
|
}
|
116
|
|
117
|
/**
|
118
|
* Looks for technical annotations, if one matches a regular expression of the element processors
|
119
|
* the associated processor is added to the applicable element processors which will then be applied
|
120
|
* when generating the description.
|
121
|
*
|
122
|
* @param annotations the set of annotations of the description
|
123
|
*/
|
124
|
private void initNaturalLanguageDescriptionElementProcessors(Set<Annotation> annotations) {
|
125
|
|
126
|
if(annotations != null){
|
127
|
for(Annotation annotation : annotations){
|
128
|
if(annotation.getAnnotationType().equals(AnnotationType.TECHNICAL())){
|
129
|
if (elementProcessors!=null){
|
130
|
for(String regex : elementProcessors.keySet()){
|
131
|
if(annotation.getText().matches(regex)){
|
132
|
applicableElementProcessors.add(elementProcessors.get(regex));
|
133
|
}
|
134
|
}
|
135
|
}
|
136
|
}
|
137
|
}
|
138
|
}
|
139
|
}
|
140
|
|
141
|
|
142
|
/**
|
143
|
* Applies the list of applicable processors to a TextData.
|
144
|
*
|
145
|
* @param textData the TextData to be modified
|
146
|
* @param previousTextData the TextData corresponding to the feature of the previous level in the tree
|
147
|
*/
|
148
|
private void applyNaturalLanguageDescriptionElementProcessors(TextData textData, TextData previousTextData){
|
149
|
for(INaturalLanguageTextDataProcessor processor : applicableElementProcessors){
|
150
|
processor.process(textData, previousTextData);
|
151
|
}
|
152
|
}
|
153
|
|
154
|
|
155
|
/**
|
156
|
* The most simple function to generate a description. The language used is the default one.
|
157
|
*
|
158
|
* @param featureTree the FeatureTree holding the order in which features and their states must be printed
|
159
|
* @param description the TaxonDescription with all the data
|
160
|
*
|
161
|
* @return a list of TextData, each one being a basic element of the natural language description
|
162
|
*/
|
163
|
@Override
|
164
|
public List<TextData> generateNaturalLanguageDescription(TermTree featureTree,TaxonDescription description) {
|
165
|
return generateNaturalLanguageDescription(featureTree,description,Language.DEFAULT());
|
166
|
}
|
167
|
|
168
|
|
169
|
|
170
|
/**
|
171
|
* Generate a description in a specified language.
|
172
|
*
|
173
|
* @param featureTree the FeatureTree holding the order in which features and their states must be printed
|
174
|
* @param description the TaxonDescription with all the data
|
175
|
* @param language the language in which the description has to be printed
|
176
|
*
|
177
|
* @return a list of TextData, each one being a basic element of the natural language description
|
178
|
*/
|
179
|
@Override
|
180
|
public List<TextData> generateNaturalLanguageDescription(TermTree featureTree, TaxonDescription description, Language language) {
|
181
|
List<Language> languages = new ArrayList<Language>();
|
182
|
languages.add(language);
|
183
|
initNaturalLanguageDescriptionElementProcessors(description.getAnnotations());
|
184
|
return generatePreferredNaturalLanguageDescription(featureTree,description,languages);
|
185
|
}
|
186
|
|
187
|
/**
|
188
|
* Generate a description with a specified list of preferred languages.
|
189
|
*
|
190
|
* @param featureTree the FeatureTree holding the order in which features and their states must be printed
|
191
|
* @param description the TaxonDescription with all the data
|
192
|
* @param languages the ordered list of languages preferred for printing the description
|
193
|
*
|
194
|
* @return a list of TextData, each one being a basic element of the natural language description
|
195
|
*/
|
196
|
@Override
|
197
|
public List<TextData> generatePreferredNaturalLanguageDescription(TermTree featureTree,TaxonDescription description, List<Language> languages) {
|
198
|
initNaturalLanguageDescriptionElementProcessors(description.getAnnotations());
|
199
|
return buildBranchesDescr(featureTree.getRootChildren(), featureTree.getRoot(), description, languages,0);
|
200
|
}
|
201
|
|
202
|
/**
|
203
|
* Generate a description as a single paragraph in a TextData.
|
204
|
*
|
205
|
* @param featureTree the FeatureTree holding the order in which features and their states must be printed
|
206
|
* @param description the TaxonDescription with all the data
|
207
|
*
|
208
|
* @return a TextData in the default language.
|
209
|
*/
|
210
|
@Override
|
211
|
public TextData generateSingleTextData(TermTree featureTree, TaxonDescription description) {
|
212
|
return generateSingleTextData(featureTree,description,Language.DEFAULT());
|
213
|
}
|
214
|
|
215
|
/**
|
216
|
* Generate a description as a single paragraph in a TextData.
|
217
|
*
|
218
|
* @param featureTree the FeatureTree holding the order in which features and their states must be printed
|
219
|
* @param description the TaxonDescription with all the data
|
220
|
* @param language the language in which the description has to be printed
|
221
|
*
|
222
|
* @return a TextData in the specified language.
|
223
|
*/
|
224
|
@Override
|
225
|
public TextData generateSingleTextData(TermTree featureTree, TaxonDescription description, Language language) {
|
226
|
List<Language> languages = new ArrayList<Language>();
|
227
|
languages.add(language);
|
228
|
return generatePreferredSingleTextData(featureTree,description,languages);
|
229
|
}
|
230
|
|
231
|
/**
|
232
|
* Generate a description with a specified list of preferred languages.
|
233
|
*
|
234
|
* @param featureTree the FeatureTree holding the order in which features and their states must be printed
|
235
|
* @param description the TaxonDescription with all the data
|
236
|
* @param languages the ordered list of languages preferred for printing the description
|
237
|
*
|
238
|
* @return a TextData using the languages (in the given order of preference)
|
239
|
*/
|
240
|
@Override
|
241
|
public TextData generatePreferredSingleTextData(TermTree featureTree, TaxonDescription description, List<Language> languages) {
|
242
|
levels.clear(); // before the start, the table containing the levels of each node must be cleared
|
243
|
// Note: this is not the most efficient way to keep track of the levels of the nodes but it allows some flexibility
|
244
|
List<TextData> texts = generatePreferredNaturalLanguageDescription(featureTree,description, languages);// first get the description as a raw list of TextData
|
245
|
|
246
|
StringBuilder descriptionStringBuilder = new StringBuilder(); // the StringBuilder used to generate the description
|
247
|
int i = 0,j,level; // i is used to store the index of the TextData to use
|
248
|
boolean startSentence = false, firstOne = true;
|
249
|
|
250
|
for (j=0 ; j<levels.size() ; j++){
|
251
|
level = levels.get(j);
|
252
|
if (level==-1){
|
253
|
if ((j+1)<levels.size() && levels.get(j+1).equals(0)){ // if this node is the direct father of a leaf
|
254
|
descriptionStringBuilder.append(secondSeparator + " ");
|
255
|
startSentence=true;
|
256
|
firstOne=false;
|
257
|
String asString = texts.get(i).getText(Language.DEFAULT()).toString();
|
258
|
if (asString.length()>1) {
|
259
|
descriptionStringBuilder.append(asString.substring(0,1).toUpperCase() + asString.substring(1));
|
260
|
}
|
261
|
}
|
262
|
i++;
|
263
|
}
|
264
|
else if (level==0) { // if this node is a leaf
|
265
|
if (startSentence) {
|
266
|
descriptionStringBuilder.append(texts.get(i).getText(Language.DEFAULT()));
|
267
|
} else {
|
268
|
descriptionStringBuilder.append(firstSeparator + texts.get(i).getText(Language.DEFAULT()));
|
269
|
}
|
270
|
startSentence=false;
|
271
|
i++;
|
272
|
}
|
273
|
else {
|
274
|
if (!firstOne && levels.get(j-1).equals(0)){ // if this node corresponds to the states linked to the previous leaf
|
275
|
if (i<texts.size()) {
|
276
|
descriptionStringBuilder.append(texts.get(i).getText(Language.DEFAULT()));
|
277
|
}
|
278
|
i++;
|
279
|
}
|
280
|
}
|
281
|
}
|
282
|
descriptionStringBuilder.append(secondSeparator);
|
283
|
String returnString = descriptionStringBuilder.toString();
|
284
|
returnString = StringUtils.replace(returnString, " ", " ");
|
285
|
returnString = StringUtils.removeStart(returnString, secondSeparator + " ");
|
286
|
return TextData.NewInstance(returnString,Language.DEFAULT(),TextFormat.NewInstance("", "Text", ""));
|
287
|
}
|
288
|
|
289
|
|
290
|
|
291
|
/** recursive function that goes through a tree containing the order in which the description has to be generated,
|
292
|
* if an element of this tree matches one of the TaxonDescription, a DescriptionBuilder is called which returns a TextData with the corresponding description.
|
293
|
*
|
294
|
* @param children the children of the feature node considered
|
295
|
* @param parent the feature node considered
|
296
|
* @param description the TaxonDescription element for which we want a natural language output
|
297
|
* @param language The language in which the description has to be written
|
298
|
* @param floor integer to keep track of the level in the tree
|
299
|
* @return a list of TextData elements containing the part of description corresponding to the feature node considered
|
300
|
*/
|
301
|
private List<TextData> buildBranchesDescr(List<TermNode> children, TermNode<Feature> parent, TaxonDescription description, List<Language> languages, int floor) {
|
302
|
List<TextData> listTextData = new ArrayList<TextData>();
|
303
|
floor++; // counter to know the current level in the tree
|
304
|
|
305
|
if (!parent.isLeaf()){ // if this node is not a leaf, continue recursively (only the leaves of a FeatureTree contain states)
|
306
|
levels.add(new Integer(floor)); // the level of the different nodes in the tree are kept, thus it is easier to build a structured text out of the List<TextData>
|
307
|
Feature feature = parent.getTerm();
|
308
|
TextData featureName;
|
309
|
if (feature!=null && feature.getLabel()!=null){ // if a node is associated to a feature
|
310
|
featureName = categoricalDescriptionBuilder.buildTextDataFeature(feature, languages);
|
311
|
levels.add(new Integer(-1)); // it is indicated by a '-1' after its level
|
312
|
listTextData.add(featureName); // the TextData representing the name of the feature is concatenated to the list
|
313
|
}
|
314
|
else {
|
315
|
featureName = new TextData(); // else an empty TextData is created (because we keep track of the features, it is useful to inform when the upper node has no feature attached)
|
316
|
}
|
317
|
|
318
|
for (Iterator<TermNode> ifn = children.iterator() ; ifn.hasNext() ;){
|
319
|
previousTextData = featureName; // this allows to keep track of the name of the feature one level up in the tree
|
320
|
TermNode fn = ifn.next();
|
321
|
listTextData.addAll(buildBranchesDescr(fn.getChildNodes(),fn,description, languages, floor));
|
322
|
}
|
323
|
}
|
324
|
else { //once a leaf is reached
|
325
|
Feature feature = parent.getTerm();
|
326
|
if (feature!=null && (feature.isSupportsQuantitativeData() || feature.isSupportsCategoricalData())) {
|
327
|
Set<DescriptionElementBase> elements = description.getElements();
|
328
|
for (Iterator<DescriptionElementBase> deb = elements.iterator() ; deb.hasNext() ;){ // iterates over all the descriptions enclosed in the TaxonDescription
|
329
|
DescriptionElementBase descriptionElement = deb.next();
|
330
|
if (descriptionElement.getFeature().equals(feature)){ // if one matches the corresponding feature associated to this leaf
|
331
|
if (descriptionElement instanceof CategoricalData || descriptionElement instanceof QuantitativeData){
|
332
|
TextData featureTextData;
|
333
|
TextData statesTextData;
|
334
|
if (descriptionElement instanceof CategoricalData) { // if this description is a CategoricalData, generate the according TextData
|
335
|
CategoricalData categoricalData = (CategoricalData) descriptionElement;
|
336
|
statesTextData = categoricalDescriptionBuilder.build(categoricalData, languages);
|
337
|
featureTextData = categoricalDescriptionBuilder.buildTextDataFeature(feature, languages);
|
338
|
}
|
339
|
else { // if this description is a QuantitativeData, generate the according TextData
|
340
|
QuantitativeData quantitativeData = (QuantitativeData) descriptionElement;
|
341
|
statesTextData = quantitativeDescriptionBuilder.build(quantitativeData, languages);
|
342
|
featureTextData = quantitativeDescriptionBuilder.buildTextDataFeature(feature, languages);
|
343
|
}
|
344
|
applyNaturalLanguageDescriptionElementProcessors(featureTextData, previousTextData);
|
345
|
levels.add(new Integer(0)); // 0 indicates a feature, which is a leaf of the tree
|
346
|
listTextData.add(featureTextData);
|
347
|
levels.add(new Integer(floor)); // this represents the level of the feature and means it is followed by a TextData containing the states of the feature
|
348
|
listTextData.add(statesTextData);
|
349
|
}
|
350
|
}
|
351
|
}
|
352
|
}
|
353
|
}
|
354
|
return listTextData;
|
355
|
}
|
356
|
|
357
|
}
|