ebf1a5d171e9208fc7b79a6a3bfe0f254cac47fb
[cdmlib.git] / cdmlib-services / src / main / java / eu / etaxonomy / cdm / api / service / description / StructuredDescriptionAggregation.java
1 /**
2 * Copyright (C) 2019 EDIT
3 * European Distributed Institute of Taxonomy
4 * http://www.e-taxonomy.eu
5 *
6 * The contents of this file are subject to the Mozilla Public License Version 1.1
7 * See LICENSE.TXT at the top of this package for the full license terms.
8 */
9 package eu.etaxonomy.cdm.api.service.description;
10
11 import java.math.BigDecimal;
12 import java.math.MathContext;
13 import java.util.ArrayList;
14 import java.util.Comparator;
15 import java.util.HashMap;
16 import java.util.HashSet;
17 import java.util.List;
18 import java.util.Map;
19 import java.util.Map.Entry;
20 import java.util.Set;
21 import java.util.stream.Collectors;
22
23 import eu.etaxonomy.cdm.common.BigDecimalUtil;
24 import eu.etaxonomy.cdm.common.monitor.IProgressMonitor;
25 import eu.etaxonomy.cdm.model.common.CdmBase;
26 import eu.etaxonomy.cdm.model.common.IdentifiableSource;
27 import eu.etaxonomy.cdm.model.description.CategoricalData;
28 import eu.etaxonomy.cdm.model.description.DescriptionBase;
29 import eu.etaxonomy.cdm.model.description.DescriptionElementBase;
30 import eu.etaxonomy.cdm.model.description.DescriptionType;
31 import eu.etaxonomy.cdm.model.description.DescriptiveDataSet;
32 import eu.etaxonomy.cdm.model.description.Feature;
33 import eu.etaxonomy.cdm.model.description.IndividualsAssociation;
34 import eu.etaxonomy.cdm.model.description.QuantitativeData;
35 import eu.etaxonomy.cdm.model.description.SpecimenDescription;
36 import eu.etaxonomy.cdm.model.description.State;
37 import eu.etaxonomy.cdm.model.description.StateData;
38 import eu.etaxonomy.cdm.model.description.StatisticalMeasure;
39 import eu.etaxonomy.cdm.model.description.TaxonDescription;
40 import eu.etaxonomy.cdm.model.occurrence.SpecimenOrObservationBase;
41 import eu.etaxonomy.cdm.model.reference.OriginalSourceType;
42 import eu.etaxonomy.cdm.model.taxon.Taxon;
43 import eu.etaxonomy.cdm.model.taxon.TaxonNode;
44
45 /**
46 * Aggregates the character data for a given {@link DescriptiveDataSet}.<br>
47 * <br>
48 * For all {@link SpecimenDescription}s belonging to this data set, a new
49 * aggregated {@link TaxonDescription} is created for every taxon the
50 * specimens are directly associated with.<BR>
51 * Also lower rank taxon descriptions are aggregated to upper rank taxa.
52 *
53 * @author a.mueller
54 * @author p.plitzner
55 * @since 03.11.2019
56 */
57 public class StructuredDescriptionAggregation
58 extends DescriptionAggregationBase<StructuredDescriptionAggregation, StructuredDescriptionAggregationConfiguration>{
59
60 private DescriptiveDataSet dataSet;
61
62 @Override
63 protected String pluralDataType(){
64 return "structured descriptive data";
65 }
66
67 @Override
68 protected void preAggregate(IProgressMonitor monitor) {
69 monitor.subTask("preAccumulate - nothing to do");
70
71 // take start time for performance testing
72 double start = System.currentTimeMillis();
73
74 getResult().setCdmEntity(getDescriptiveDatasetService().load(getConfig().getDatasetUuid()));
75
76 double end1 = System.currentTimeMillis();
77 logger.info("Time elapsed for pre-accumulate() : " + (end1 - start) / (1000) + "s");
78 }
79
80
81 private boolean hasCharacterData(DescriptionElementBase element) {
82 return hasCategoricalData(element) || hasQuantitativeData(element);
83 }
84
85 private boolean hasQuantitativeData(DescriptionElementBase element) {
86 if(element instanceof QuantitativeData
87 && !((QuantitativeData) element).getStatisticalValues().isEmpty()){
88 QuantitativeData quantitativeData = (QuantitativeData)element;
89 return !getExactValues(quantitativeData).isEmpty()
90 || quantitativeData.getMin()!=null
91 || quantitativeData.getMax()!=null;
92 }
93 return false;
94 }
95
96 private boolean hasCategoricalData(DescriptionElementBase element) {
97 return element instanceof CategoricalData && !((CategoricalData) element).getStatesOnly().isEmpty();
98 }
99
100 @Override
101 protected void setDescriptionTitle(TaxonDescription description, Taxon taxon) {
102 String title = taxon.getName() != null? taxon.getName().getTitleCache() : taxon.getTitleCache();
103 description.setTitleCache("Aggregated description for " + title, true);
104 return;
105 }
106
107 @Override
108 protected TaxonDescription createNewDescription(Taxon taxon) {
109 String title = taxon.getTitleCache();
110 logger.debug("creating new description for " + title);
111 TaxonDescription description = TaxonDescription.NewInstance(taxon);
112 description.addType(DescriptionType.AGGREGATED_STRUC_DESC);
113 setDescriptionTitle(description, taxon);
114 return description;
115 }
116
117 @Override
118 protected boolean hasDescriptionType(TaxonDescription description) {
119 return dataSet.getDescriptions().contains(description) && description.isAggregatedStructuredDescription();
120 }
121
122 @Override
123 protected List<String> descriptionInitStrategy() {
124 return new ArrayList<>();
125 }
126
127 @Override
128 protected void addAggregationResultToDescription(TaxonDescription targetDescription,
129 ResultHolder resultHolder) {
130 StructuredDescriptionResultHolder structuredResultHolder = (StructuredDescriptionResultHolder)resultHolder;
131
132 replaceExistingDescriptionElements(targetDescription, structuredResultHolder.categoricalMap);
133 replaceExistingDescriptionElements(targetDescription, structuredResultHolder.quantitativeMap);
134 addAggregationSources(targetDescription, structuredResultHolder);
135
136 if(!targetDescription.getElements().isEmpty()){
137 dataSet.addDescription(targetDescription);
138 }
139 }
140
141 private void addAggregationSources(TaxonDescription targetDescription,
142 StructuredDescriptionResultHolder structuredResultHolder) {
143 //FIXME Re-use sources if possible
144 //Remove sources from description
145 Set<IdentifiableSource> sourcesToRemove = targetDescription.getSources().stream()
146 .filter(source->source.getType().equals(OriginalSourceType.Aggregation))
147 .collect(Collectors.toSet());
148
149 for (IdentifiableSource source : sourcesToRemove) {
150 targetDescription.removeSource(source);
151 }
152
153 Set<DescriptionBase<?>> sourceDescriptions = structuredResultHolder.sourceDescriptions;
154 for (DescriptionBase<?> descriptionBase : sourceDescriptions) {
155 DescriptionBase<?> sourceDescription = null;
156 if(descriptionBase.isInstanceOf(SpecimenDescription.class)){
157 DescriptionBase<?> clone = descriptionBase.clone();
158 clone.removeDescriptiveDataSet(dataSet);
159 clone.getTypes().add(DescriptionType.CLONE_FOR_SOURCE);
160 SpecimenOrObservationBase<?> specimen = CdmBase.deproxy(descriptionBase, SpecimenDescription.class).getDescribedSpecimenOrObservation();
161 specimen.addDescription(CdmBase.deproxy(clone, SpecimenDescription.class));
162 sourceDescription=clone;
163 }
164 else if(descriptionBase.isInstanceOf(TaxonDescription.class)){
165 Taxon taxon = CdmBase.deproxy(descriptionBase, TaxonDescription.class).getTaxon();
166 taxon.addDescription(CdmBase.deproxy(descriptionBase, TaxonDescription.class));
167 sourceDescription=descriptionBase;
168 }
169 if(sourceDescription!=null){
170 targetDescription.addAggregationSource(sourceDescription);
171 }
172 }
173 }
174
175 private void replaceExistingDescriptionElements(TaxonDescription targetDescription,
176 Map<Feature, ? extends DescriptionElementBase> elementMap) {
177 for (Entry<Feature, ? extends DescriptionElementBase> entry : elementMap.entrySet()) {
178 DescriptionElementBase elementToRemove = null;
179 DescriptionElementBase elementReplacement = null;
180 for (DescriptionElementBase descriptionElementBase : targetDescription.getElements()) {
181 if(descriptionElementBase.getFeature().equals(entry.getKey())){
182 elementToRemove = descriptionElementBase;
183 elementReplacement = entry.getValue();
184 break;
185 }
186 }
187 if(elementToRemove!=null && elementReplacement!=null){
188 targetDescription.removeElement(elementToRemove);
189 targetDescription.addElement(elementReplacement);
190 }
191 else{
192 targetDescription.addElement(entry.getValue());
193 }
194 }
195 }
196
197 @Override
198 protected void initTransaction() {
199 dataSet = getDescriptiveDatasetService().load(getConfig().getDatasetUuid());
200 }
201
202 @Override
203 protected void removeDescriptionIfEmpty(TaxonDescription description) {
204 super.removeDescriptionIfEmpty(description);
205 if (description.getElements().isEmpty()){
206 dataSet.removeDescription(description);
207 }
208 }
209
210 @Override
211 protected void aggregateToParentTaxon(TaxonNode taxonNode,
212 ResultHolder resultHolder,
213 Set<TaxonDescription> excludedDescriptions) {
214 StructuredDescriptionResultHolder descriptiveResultHolder = (StructuredDescriptionResultHolder)resultHolder;
215 Set<TaxonDescription> childDescriptions = getChildTaxonDescriptions(taxonNode, dataSet);
216 addDescriptionElement(descriptiveResultHolder, childDescriptions);
217 }
218
219 @Override
220 protected void aggregateWithinSingleTaxon(Taxon taxon,
221 ResultHolder resultHolder,
222 Set<TaxonDescription> excludedDescriptions) {
223 StructuredDescriptionResultHolder descriptiveResultHolder = (StructuredDescriptionResultHolder)resultHolder;
224 Set<SpecimenDescription> specimenDescriptions = getSpecimenDescriptions(taxon, dataSet);
225 addDescriptionElement(descriptiveResultHolder, specimenDescriptions);
226 Set<TaxonDescription> literatureDescriptions = getLiteratureDescriptions(taxon, dataSet);
227 addDescriptionElement(descriptiveResultHolder, literatureDescriptions);
228 //TODO add defaultDescriptions
229
230 }
231
232 private void addDescriptionElement(StructuredDescriptionResultHolder descriptiveResultHolder,
233 Set<? extends DescriptionBase<?>> descriptions) {
234 boolean descriptionWasUsed = false;
235 for (DescriptionBase<?> desc: descriptions){
236 for (DescriptionElementBase deb: desc.getElements()){
237 if (hasCharacterData(deb)){
238 if (deb.isInstanceOf(CategoricalData.class)){
239 addToCategorical(CdmBase.deproxy(deb, CategoricalData.class), descriptiveResultHolder);
240 descriptionWasUsed = true;
241 }else if (deb.isInstanceOf(QuantitativeData.class)){
242 addToQuantitative(CdmBase.deproxy(deb, QuantitativeData.class), descriptiveResultHolder);
243 descriptionWasUsed = true;
244 }
245 }
246 }
247 if(descriptionWasUsed){
248 descriptiveResultHolder.sourceDescriptions.add(desc);
249 }
250 }
251 }
252
253 private void addToQuantitative(QuantitativeData qd, StructuredDescriptionResultHolder resultHolder) {
254 QuantitativeData aggregatedQuantitativeData = resultHolder.quantitativeMap.get(qd.getFeature());
255 if(aggregatedQuantitativeData==null){
256 // no QuantitativeData with this feature in aggregation
257 aggregatedQuantitativeData = aggregateSingleQuantitativeData(qd);
258 }
259 else{
260 aggregatedQuantitativeData = mergeQuantitativeData(aggregatedQuantitativeData, qd);
261 }
262 if (aggregatedQuantitativeData != null){
263 resultHolder.quantitativeMap.put(qd.getFeature(), aggregatedQuantitativeData);
264 }
265 }
266
267 private void addToCategorical(CategoricalData cd, StructuredDescriptionResultHolder resultHolder) {
268 CategoricalData aggregatedCategoricalData = resultHolder.categoricalMap.get(cd.getFeature());
269 if(aggregatedCategoricalData==null){
270 // no CategoricalData with this feature in aggregation
271 aggregatedCategoricalData = cd.clone();
272 // set count to 1 if not set
273 aggregatedCategoricalData.getStateData().stream().filter(sd->sd.getCount()==null).forEach(sd->sd.incrementCount());
274 resultHolder.categoricalMap.put(aggregatedCategoricalData.getFeature(), aggregatedCategoricalData);
275 }
276 else{
277 // split all StateData into those where the state already exists and those where it doesn't
278 List<State> statesOnly = aggregatedCategoricalData.getStatesOnly();
279 List<StateData> sdWithExistingStateInAggregation = cd.getStateData().stream().filter(sd->statesOnly.contains(sd.getState())).collect(Collectors.toList());
280 List<StateData> sdWithNoExistingStateInAggregation = cd.getStateData().stream().filter(sd->!statesOnly.contains(sd.getState())).collect(Collectors.toList());
281
282 for (StateData sd : sdWithNoExistingStateInAggregation) {
283 StateData clone = sd.clone();
284 // set count to 1 if not set
285 if(clone.getCount()==null){
286 clone.incrementCount();
287 }
288 aggregatedCategoricalData.addStateData(clone);
289 }
290
291 for (StateData sdExist : sdWithExistingStateInAggregation) {
292 List<StateData> aggregatedSameStateData = aggregatedCategoricalData.getStateData().stream()
293 .filter(sd->hasSameState(sdExist, sd))
294 .collect(Collectors.toList());
295 for (StateData stateData : aggregatedSameStateData) {
296 if(sdExist.getCount()==null){
297 stateData.incrementCount();
298 }
299 else{
300 stateData.setCount(stateData.getCount()+sdExist.getCount());
301 }
302 }
303 }
304 }
305 }
306
307 @Override
308 protected StructuredDescriptionResultHolder createResultHolder() {
309 return new StructuredDescriptionResultHolder();
310 }
311
312 private class StructuredDescriptionResultHolder implements ResultHolder{
313 Map<Feature, CategoricalData> categoricalMap = new HashMap<>();
314 Map<Feature, QuantitativeData> quantitativeMap = new HashMap<>();
315 Set<DescriptionBase<?>> sourceDescriptions = new HashSet<>();
316 @Override
317 public String toString() {
318 return "SDResultHolder [categoricals=" + categoricalMap.size() + ", quantitatives="
319 + quantitativeMap.size() + ", sourceDescriptions=" + sourceDescriptions.size() + "]";
320 }
321 }
322
323 private Set<TaxonDescription> getChildTaxonDescriptions(TaxonNode taxonNode, DescriptiveDataSet dataSet) {
324 Set<TaxonDescription> result = new HashSet<>();
325 List<TaxonNode> childNodes = taxonNode.getChildNodes();
326 for (TaxonNode childNode : childNodes) {
327 Set<TaxonDescription> childDescriptions = childNode.getTaxon().getDescriptions();
328 result.addAll(childDescriptions.stream()
329 .filter(desc->desc.getTypes().contains(DescriptionType.AGGREGATED_STRUC_DESC))
330 .filter(desc->dataSet.getDescriptions().contains(desc))
331 .collect(Collectors.toSet()));
332 }
333 return result;
334 }
335
336 private Set<SpecimenDescription> getSpecimenDescriptions(Taxon taxon, DescriptiveDataSet dataSet) {
337 Set<SpecimenDescription> result = new HashSet<>();
338 //TODO performance: use DTO service to retrieve specimen descriptions without initializing all taxon descriptions
339 for (TaxonDescription taxonDesc: taxon.getDescriptions()){
340 for (DescriptionElementBase taxonDeb : taxonDesc.getElements()){
341 if (taxonDeb.isInstanceOf(IndividualsAssociation.class)){
342 IndividualsAssociation indAss = CdmBase.deproxy(taxonDeb, IndividualsAssociation.class);
343 SpecimenOrObservationBase<?> specimen = indAss.getAssociatedSpecimenOrObservation();
344 @SuppressWarnings({ "unchecked", "rawtypes" })
345 Set<SpecimenDescription> descriptions = (Set)specimen.getDescriptions();
346 for(SpecimenDescription specimenDescription : descriptions){
347 if(dataSet.getDescriptions().contains(specimenDescription) &&
348 specimenDescription.getTypes().stream().noneMatch(type->type.equals(DescriptionType.CLONE_FOR_SOURCE))){
349 result.add(specimenDescription);
350 }
351 }
352 }
353 }
354 }
355 return result;
356 }
357
358 private Set<TaxonDescription> getLiteratureDescriptions(Taxon taxon, DescriptiveDataSet dataSet) {
359 Set<TaxonDescription> result = new HashSet<>();
360 //TODO performance: use DTO service to retrieve specimen descriptions without initializing all taxon descriptions
361 for(TaxonDescription taxonDescription : taxon.getDescriptions()){
362 if(dataSet.getDescriptions().contains(taxonDescription)
363 && taxonDescription.getTypes().stream().anyMatch(type->type.equals(DescriptionType.SECONDARY_DATA))
364 && taxonDescription.getTypes().stream().noneMatch(type->type.equals(DescriptionType.CLONE_FOR_SOURCE)) ){
365 result.add(taxonDescription);
366 }
367 }
368 return result;
369 }
370
371 private QuantitativeData aggregateSingleQuantitativeData(QuantitativeData sourceQd){
372 QuantitativeData aggQD = QuantitativeData.NewInstance(sourceQd.getFeature());
373 Set<BigDecimal> exactValues = sourceQd.getExactValues();
374 if(!exactValues.isEmpty()){
375 Comparator<BigDecimal> comp = Comparator.naturalOrder();
376 // qd is not already aggregated
377 int exactValueSampleSize = exactValues.size();
378 BigDecimal exactValueMin = exactValues.stream().min(comp).get();
379 BigDecimal exactValueMax = exactValues.stream().max(comp).get();
380 BigDecimal exactValueAvg = BigDecimalUtil.average(exactValues);
381 //TODO also check for typical boundary data
382 if(sourceQd.getMin() == null && sourceQd.getMax() == null){
383 aggQD.setSampleSize(new BigDecimal(exactValueSampleSize), null);
384 aggQD.setAverage(exactValueAvg, null);
385 }
386 aggQD.setMinimum(sourceQd.getMin() == null ? exactValueMin: sourceQd.getMin().min(exactValueMin), null);
387 aggQD.setMaximum(sourceQd.getMax() == null ? exactValueMax: sourceQd.getMax().max(exactValueMax), null);
388 }
389 else{
390 // qd has only min, max, ... but no exact values
391 aggQD = sourceQd.clone();
392 aggQD = handleMissingValues(aggQD);
393 }
394 return aggQD;
395 }
396
397 private QuantitativeData handleMissingValues(QuantitativeData qd) {
398 //min max
399 qd = handleMissingMinOrMax(qd);
400 //average
401 if (qd != null && qd.getAverage() == null){
402 BigDecimal n = qd.getSampleSize();
403 if(n != null && !n.equals(0f)){
404 BigDecimal average = (qd.getMax().add(qd.getMin())).divide(n);
405 qd.setAverage(average, null);
406 }
407 }
408 return qd;
409 }
410
411 private QuantitativeData handleMissingMinOrMax(QuantitativeData qd) {
412 return handleMissingMinOrMax(qd, getConfig().getMissingMinimumMode(), getConfig().getMissingMaximumMode());
413 }
414
415 public static QuantitativeData handleMissingMinOrMax(QuantitativeData aggQD, MissingMinimumMode missingMinMode,
416 MissingMaximumMode missingMaxMode) {
417 if(aggQD.getMin() == null && aggQD.getMax() != null){
418 if (missingMinMode == MissingMinimumMode.MinToZero) {
419 aggQD.setMinimum(BigDecimal.valueOf(0f), null);
420 }else if (missingMinMode == MissingMinimumMode.MinToMax){
421 aggQD.setMinimum(aggQD.getMax(), null);
422 }else if (missingMinMode == MissingMinimumMode.SkipRecord){
423 return null;
424 }
425 }
426 if(aggQD.getMax() == null && aggQD.getMin() != null){
427 if (missingMaxMode == MissingMaximumMode.MaxToMin){
428 aggQD.setMaximum(aggQD.getMin(), null);
429 }else if (missingMaxMode == MissingMaximumMode.SkipRecord){
430 return null;
431 }
432 }
433 return aggQD;
434 }
435
436 private QuantitativeData mergeQuantitativeData(QuantitativeData aggQd, QuantitativeData newQd) {
437
438 newQd = aggregateSingleQuantitativeData(newQd); //alternatively we could check, if newQd is already basically aggregated, but for this we need a clear definition what the minimum requirements are and how ExactValues and MinMax if existing in parallel should be handled.
439
440 BigDecimal min = null;
441 BigDecimal max = null;
442 BigDecimal average = null;
443 BigDecimal sampleSize = null;
444 newQd = handleMissingValues(newQd);
445 if (newQd == null){
446 return aggQd;
447 }
448 min = aggQd.getMin().min(newQd.getMin());
449 max = aggQd.getMax().max(newQd.getMax());
450 if (newQd.getSampleSize() != null && aggQd.getSampleSize() != null){
451 sampleSize = newQd.getSampleSize().add(aggQd.getSampleSize());
452 }
453 if (sampleSize != null && !sampleSize.equals(0f) && aggQd.getAverage() != null && newQd.getAverage() != null){
454 BigDecimal aggTotalSum = aggQd.getAverage().multiply(aggQd.getSampleSize(), MathContext.DECIMAL32);
455 BigDecimal newTotalSum = newQd.getAverage().multiply(newQd.getSampleSize(), MathContext.DECIMAL32);
456 BigDecimal totalSum = aggTotalSum.add(newTotalSum);
457 average = totalSum.divide(sampleSize, MathContext.DECIMAL32).stripTrailingZeros(); //to be discussed if we really want to reduce precision here, however, due to the current way to compute average we do not have exact precision anyway
458 }
459 aggQd.setMinimum(min, null);
460 aggQd.setMaximum(max, null);
461 aggQd.setSampleSize(sampleSize, null);
462 aggQd.setAverage(average, null);
463 return aggQd;
464 }
465
466 private static List<BigDecimal> getExactValues(QuantitativeData qd) {
467 List<BigDecimal> exactValues = qd.getStatisticalValues().stream()
468 .filter(value->value.getType().equals(StatisticalMeasure.EXACT_VALUE()))
469 .map(exact->exact.getValue())
470 .collect(Collectors.toList());
471 return exactValues;
472 }
473
474 private static boolean hasSameState(StateData sd1, StateData sd2) {
475 return sd2.getState().getUuid().equals(sd1.getState().getUuid());
476 }
477 }