/**
 * Copyright (C) 2019 EDIT
 * European Distributed Institute of Taxonomy
 * http://www.e-taxonomy.eu
 *
 * The contents of this file are subject to the Mozilla Public License Version 1.1
 * See LICENSE.TXT at the top of this package for the full license terms.
 */
9 package eu
.etaxonomy
.cdm
.api
.service
.description
;
import java.math.BigDecimal;
import java.math.MathContext;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.stream.Collectors;
23 import eu
.etaxonomy
.cdm
.common
.BigDecimalUtil
;
24 import eu
.etaxonomy
.cdm
.common
.monitor
.IProgressMonitor
;
25 import eu
.etaxonomy
.cdm
.model
.common
.CdmBase
;
26 import eu
.etaxonomy
.cdm
.model
.common
.IdentifiableSource
;
27 import eu
.etaxonomy
.cdm
.model
.description
.CategoricalData
;
28 import eu
.etaxonomy
.cdm
.model
.description
.DescriptionBase
;
29 import eu
.etaxonomy
.cdm
.model
.description
.DescriptionElementBase
;
30 import eu
.etaxonomy
.cdm
.model
.description
.DescriptionType
;
31 import eu
.etaxonomy
.cdm
.model
.description
.DescriptiveDataSet
;
32 import eu
.etaxonomy
.cdm
.model
.description
.Feature
;
33 import eu
.etaxonomy
.cdm
.model
.description
.IndividualsAssociation
;
34 import eu
.etaxonomy
.cdm
.model
.description
.QuantitativeData
;
35 import eu
.etaxonomy
.cdm
.model
.description
.SpecimenDescription
;
36 import eu
.etaxonomy
.cdm
.model
.description
.State
;
37 import eu
.etaxonomy
.cdm
.model
.description
.StateData
;
38 import eu
.etaxonomy
.cdm
.model
.description
.StatisticalMeasure
;
39 import eu
.etaxonomy
.cdm
.model
.description
.TaxonDescription
;
40 import eu
.etaxonomy
.cdm
.model
.occurrence
.SpecimenOrObservationBase
;
41 import eu
.etaxonomy
.cdm
.model
.reference
.OriginalSourceType
;
42 import eu
.etaxonomy
.cdm
.model
.taxon
.Taxon
;
43 import eu
.etaxonomy
.cdm
.model
.taxon
.TaxonNode
;
/**
 * Aggregates the character data for a given {@link DescriptiveDataSet}.<br>
 * <br>
 * For all {@link SpecimenDescription}s belonging to this data set new
 * aggregated {@link TaxonDescription}s are created for every taxon the
 * specimens are directly associated with.<BR>
 * Also lower rank taxon descriptions are aggregated to upper rank taxa.
 */
57 public class StructuredDescriptionAggregation
58 extends DescriptionAggregationBase
<StructuredDescriptionAggregation
, StructuredDescriptionAggregationConfiguration
>{
60 private DescriptiveDataSet dataSet
;
/**
 * The plural, human-readable name of the data handled by this aggregation,
 * used for progress and log messages.
 */
protected String pluralDataType(){
    return "structured descriptive data";
}
68 protected void preAggregate(IProgressMonitor monitor
) {
69 monitor
.subTask("preAccumulate - nothing to do");
71 // take start time for performance testing
72 double start
= System
.currentTimeMillis();
74 getResult().setCdmEntity(getDescriptiveDatasetService().load(getConfig().getDatasetUuid()));
76 double end1
= System
.currentTimeMillis();
77 logger
.info("Time elapsed for pre-accumulate() : " + (end1
- start
) / (1000) + "s");
/**
 * Returns whether the given description element carries any character data,
 * i.e. non-empty categorical or quantitative data.
 */
private boolean hasCharacterData(DescriptionElementBase element) {
    return hasCategoricalData(element) || hasQuantitativeData(element);
}
85 private boolean hasQuantitativeData(DescriptionElementBase element
) {
86 if(element
instanceof QuantitativeData
87 && !((QuantitativeData
) element
).getStatisticalValues().isEmpty()){
88 QuantitativeData quantitativeData
= (QuantitativeData
)element
;
89 return !getExactValues(quantitativeData
).isEmpty()
90 || quantitativeData
.getMin()!=null
91 || quantitativeData
.getMax()!=null;
96 private boolean hasCategoricalData(DescriptionElementBase element
) {
97 return element
instanceof CategoricalData
&& !((CategoricalData
) element
).getStatesOnly().isEmpty();
101 protected void setDescriptionTitle(TaxonDescription description
, Taxon taxon
) {
102 String title
= taxon
.getName() != null? taxon
.getName().getTitleCache() : taxon
.getTitleCache();
103 description
.setTitleCache("Aggregated description for " + title
, true);
108 protected TaxonDescription
createNewDescription(Taxon taxon
) {
109 String title
= taxon
.getTitleCache();
110 logger
.debug("creating new description for " + title
);
111 TaxonDescription description
= TaxonDescription
.NewInstance(taxon
);
112 description
.addType(DescriptionType
.AGGREGATED_STRUC_DESC
);
113 setDescriptionTitle(description
, taxon
);
118 protected boolean hasDescriptionType(TaxonDescription description
) {
119 return dataSet
.getDescriptions().contains(description
) && description
.isAggregatedStructuredDescription();
/**
 * Property path used for initializing loaded descriptions; empty here as no
 * special initialization is required.
 */
protected List<String> descriptionInitStrategy() {
    return new ArrayList<>();
}
128 protected void addAggregationResultToDescription(TaxonDescription targetDescription
,
129 ResultHolder resultHolder
) {
130 StructuredDescriptionResultHolder structuredResultHolder
= (StructuredDescriptionResultHolder
)resultHolder
;
132 replaceExistingDescriptionElements(targetDescription
, structuredResultHolder
.categoricalMap
);
133 replaceExistingDescriptionElements(targetDescription
, structuredResultHolder
.quantitativeMap
);
134 addAggregationSources(targetDescription
, structuredResultHolder
);
136 if(!targetDescription
.getElements().isEmpty()){
137 dataSet
.addDescription(targetDescription
);
/**
 * Replaces the aggregation sources of the target description with the source
 * descriptions collected during this aggregation run.
 * <p>
 * Specimen descriptions are cloned (the clone is detached from the data set
 * and typed as CLONE_FOR_SOURCE) so the source stays a stable snapshot;
 * taxon descriptions are referenced directly.
 */
private void addAggregationSources(TaxonDescription targetDescription,
        StructuredDescriptionResultHolder structuredResultHolder) {
    //FIXME Re-use sources if possible
    //Remove sources from description
    Set<IdentifiableSource> sourcesToRemove = targetDescription.getSources().stream()
            .filter(source->source.getType().equals(OriginalSourceType.Aggregation))
            .collect(Collectors.toSet());
    for (IdentifiableSource source : sourcesToRemove) {
        targetDescription.removeSource(source);
    }
    Set<DescriptionBase<?>> sourceDescriptions = structuredResultHolder.sourceDescriptions;
    for (DescriptionBase<?> descriptionBase : sourceDescriptions) {
        DescriptionBase<?> sourceDescription = null;
        if(descriptionBase.isInstanceOf(SpecimenDescription.class)){
            // clone the specimen description so the recorded source is a snapshot
            DescriptionBase<?> clone = descriptionBase.clone();
            clone.removeDescriptiveDataSet(dataSet);
            clone.getTypes().add(DescriptionType.CLONE_FOR_SOURCE);
            SpecimenOrObservationBase<?> specimen = CdmBase.deproxy(descriptionBase, SpecimenDescription.class).getDescribedSpecimenOrObservation();
            specimen.addDescription(CdmBase.deproxy(clone, SpecimenDescription.class));
            sourceDescription = clone;
        }
        else if(descriptionBase.isInstanceOf(TaxonDescription.class)){
            // taxon descriptions are used as sources directly (no clone)
            Taxon taxon = CdmBase.deproxy(descriptionBase, TaxonDescription.class).getTaxon();
            taxon.addDescription(CdmBase.deproxy(descriptionBase, TaxonDescription.class));
            sourceDescription = descriptionBase;
        }
        if(sourceDescription != null){
            targetDescription.addAggregationSource(sourceDescription);
        }
    }
}
/**
 * For every feature in the element map, replaces an element with the same
 * feature that already exists on the target description by the freshly
 * aggregated element; if no element with that feature exists yet, the
 * aggregated element is simply added.
 * <p>
 * NOTE(review): if the description holds several elements with the same
 * feature only one of them is removed — TODO confirm this is intended.
 */
private void replaceExistingDescriptionElements(TaxonDescription targetDescription,
        Map<Feature, ? extends DescriptionElementBase> elementMap) {
    for (Entry<Feature, ? extends DescriptionElementBase> entry : elementMap.entrySet()) {
        DescriptionElementBase elementToRemove = null;
        DescriptionElementBase elementReplacement = null;
        for (DescriptionElementBase descriptionElementBase : targetDescription.getElements()) {
            if(descriptionElementBase.getFeature().equals(entry.getKey())){
                elementToRemove = descriptionElementBase;
                elementReplacement = entry.getValue();
            }
        }
        if(elementToRemove != null && elementReplacement != null){
            // swap the old element for the newly aggregated one
            targetDescription.removeElement(elementToRemove);
            targetDescription.addElement(elementReplacement);
        }
        else{
            // feature not present yet: just add the aggregated element
            targetDescription.addElement(entry.getValue());
        }
    }
}
/**
 * (Re-)loads the descriptive data set at the start of each transaction so the
 * aggregation works on an entity attached to the current session.
 */
protected void initTransaction() {
    dataSet = getDescriptiveDatasetService().load(getConfig().getDatasetUuid());
}
/**
 * In addition to the base class behavior, the empty description is also
 * removed from the descriptive data set.
 */
protected void removeDescriptionIfEmpty(TaxonDescription description) {
    super.removeDescriptionIfEmpty(description);
    if (description.getElements().isEmpty()){
        dataSet.removeDescription(description);
    }
}
211 protected void aggregateToParentTaxon(TaxonNode taxonNode
,
212 ResultHolder resultHolder
,
213 Set
<TaxonDescription
> excludedDescriptions
) {
214 StructuredDescriptionResultHolder descriptiveResultHolder
= (StructuredDescriptionResultHolder
)resultHolder
;
215 Set
<TaxonDescription
> childDescriptions
= getChildTaxonDescriptions(taxonNode
, dataSet
);
216 addDescriptionElement(descriptiveResultHolder
, childDescriptions
);
220 protected void aggregateWithinSingleTaxon(Taxon taxon
,
221 ResultHolder resultHolder
,
222 Set
<TaxonDescription
> excludedDescriptions
) {
223 StructuredDescriptionResultHolder descriptiveResultHolder
= (StructuredDescriptionResultHolder
)resultHolder
;
224 Set
<SpecimenDescription
> specimenDescriptions
= getSpecimenDescriptions(taxon
, dataSet
);
225 addDescriptionElement(descriptiveResultHolder
, specimenDescriptions
);
226 Set
<TaxonDescription
> literatureDescriptions
= getLiteratureDescriptions(taxon
, dataSet
);
227 addDescriptionElement(descriptiveResultHolder
, literatureDescriptions
);
228 //TODO add defaultDescriptions
232 private void addDescriptionElement(StructuredDescriptionResultHolder descriptiveResultHolder
,
233 Set
<?
extends DescriptionBase
<?
>> descriptions
) {
234 boolean descriptionWasUsed
= false;
235 for (DescriptionBase
<?
> desc
: descriptions
){
236 for (DescriptionElementBase deb
: desc
.getElements()){
237 if (hasCharacterData(deb
)){
238 if (deb
.isInstanceOf(CategoricalData
.class)){
239 addToCategorical(CdmBase
.deproxy(deb
, CategoricalData
.class), descriptiveResultHolder
);
240 descriptionWasUsed
= true;
241 }else if (deb
.isInstanceOf(QuantitativeData
.class)){
242 addToQuantitative(CdmBase
.deproxy(deb
, QuantitativeData
.class), descriptiveResultHolder
);
243 descriptionWasUsed
= true;
247 if(descriptionWasUsed
){
248 descriptiveResultHolder
.sourceDescriptions
.add(desc
);
/**
 * Merges the given quantitative data into the aggregation map of the result
 * holder: either an initial aggregation is created for the feature or the
 * new data is merged into the existing aggregation.
 */
private void addToQuantitative(QuantitativeData qd, StructuredDescriptionResultHolder resultHolder) {
    QuantitativeData aggregatedQuantitativeData = resultHolder.quantitativeMap.get(qd.getFeature());
    if(aggregatedQuantitativeData == null){
        // no QuantitativeData with this feature in aggregation
        aggregatedQuantitativeData = aggregateSingleQuantitativeData(qd);
    }
    else{
        aggregatedQuantitativeData = mergeQuantitativeData(aggregatedQuantitativeData, qd);
    }
    // may be null if missing-min/max handling decided to skip the record
    if (aggregatedQuantitativeData != null){
        resultHolder.quantitativeMap.put(qd.getFeature(), aggregatedQuantitativeData);
    }
}
/**
 * Merges the given categorical data into the aggregation map of the result
 * holder. For a feature not aggregated yet the data is cloned; otherwise the
 * state data are merged by summing the counts per state.
 */
private void addToCategorical(CategoricalData cd, StructuredDescriptionResultHolder resultHolder) {
    CategoricalData aggregatedCategoricalData = resultHolder.categoricalMap.get(cd.getFeature());
    if(aggregatedCategoricalData == null){
        // no CategoricalData with this feature in aggregation
        aggregatedCategoricalData = cd.clone();
        // set count to 1 if not set
        aggregatedCategoricalData.getStateData().stream().filter(sd->sd.getCount()==null).forEach(sd->sd.incrementCount());
        resultHolder.categoricalMap.put(aggregatedCategoricalData.getFeature(), aggregatedCategoricalData);
    }
    else{
        // split all StateData into those where the state already exists and those where it doesn't
        List<State> statesOnly = aggregatedCategoricalData.getStatesOnly();
        List<StateData> sdWithExistingStateInAggregation = cd.getStateData().stream().filter(sd->statesOnly.contains(sd.getState())).collect(Collectors.toList());
        List<StateData> sdWithNoExistingStateInAggregation = cd.getStateData().stream().filter(sd->!statesOnly.contains(sd.getState())).collect(Collectors.toList());

        // states new to the aggregation: clone and add with a count of at least 1
        for (StateData sd : sdWithNoExistingStateInAggregation) {
            StateData clone = sd.clone();
            // set count to 1 if not set
            if(clone.getCount() == null){
                clone.incrementCount();
            }
            aggregatedCategoricalData.addStateData(clone);
        }

        // states already aggregated: add the incoming count to the aggregated count
        for (StateData sdExist : sdWithExistingStateInAggregation) {
            List<StateData> aggregatedSameStateData = aggregatedCategoricalData.getStateData().stream()
                    .filter(sd->hasSameState(sdExist, sd))
                    .collect(Collectors.toList());
            for (StateData stateData : aggregatedSameStateData) {
                if(sdExist.getCount() == null){
                    stateData.incrementCount();
                }
                else{
                    stateData.setCount(stateData.getCount() + sdExist.getCount());
                }
            }
        }
    }
}
/**
 * Creates the holder collecting the aggregation state for one taxon.
 */
protected StructuredDescriptionResultHolder createResultHolder() {
    return new StructuredDescriptionResultHolder();
}
312 private class StructuredDescriptionResultHolder
implements ResultHolder
{
313 Map
<Feature
, CategoricalData
> categoricalMap
= new HashMap
<>();
314 Map
<Feature
, QuantitativeData
> quantitativeMap
= new HashMap
<>();
315 Set
<DescriptionBase
<?
>> sourceDescriptions
= new HashSet
<>();
317 public String
toString() {
318 return "SDResultHolder [categoricals=" + categoricalMap
.size() + ", quantitatives="
319 + quantitativeMap
.size() + ", sourceDescriptions=" + sourceDescriptions
.size() + "]";
323 private Set
<TaxonDescription
> getChildTaxonDescriptions(TaxonNode taxonNode
, DescriptiveDataSet dataSet
) {
324 Set
<TaxonDescription
> result
= new HashSet
<>();
325 List
<TaxonNode
> childNodes
= taxonNode
.getChildNodes();
326 for (TaxonNode childNode
: childNodes
) {
327 Set
<TaxonDescription
> childDescriptions
= childNode
.getTaxon().getDescriptions();
328 result
.addAll(childDescriptions
.stream()
329 .filter(desc
->desc
.getTypes().contains(DescriptionType
.AGGREGATED_STRUC_DESC
))
330 .filter(desc
->dataSet
.getDescriptions().contains(desc
))
331 .collect(Collectors
.toSet()));
/**
 * Collects all specimen descriptions that belong to the data set and are
 * reachable from the taxon via {@link IndividualsAssociation} elements.
 * Descriptions cloned for source documentation (CLONE_FOR_SOURCE) are
 * excluded.
 */
private Set<SpecimenDescription> getSpecimenDescriptions(Taxon taxon, DescriptiveDataSet dataSet) {
    Set<SpecimenDescription> result = new HashSet<>();
    //TODO performance: use DTO service to retrieve specimen descriptions without initializing all taxon descriptions
    for (TaxonDescription taxonDesc : taxon.getDescriptions()){
        for (DescriptionElementBase taxonDeb : taxonDesc.getElements()){
            if (taxonDeb.isInstanceOf(IndividualsAssociation.class)){
                IndividualsAssociation indAss = CdmBase.deproxy(taxonDeb, IndividualsAssociation.class);
                SpecimenOrObservationBase<?> specimen = indAss.getAssociatedSpecimenOrObservation();
                @SuppressWarnings({ "unchecked", "rawtypes" })
                Set<SpecimenDescription> descriptions = (Set)specimen.getDescriptions();
                for(SpecimenDescription specimenDescription : descriptions){
                    if(dataSet.getDescriptions().contains(specimenDescription) &&
                            specimenDescription.getTypes().stream().noneMatch(type->type.equals(DescriptionType.CLONE_FOR_SOURCE))){
                        result.add(specimenDescription);
                    }
                }
            }
        }
    }
    return result;
}
358 private Set
<TaxonDescription
> getLiteratureDescriptions(Taxon taxon
, DescriptiveDataSet dataSet
) {
359 Set
<TaxonDescription
> result
= new HashSet
<>();
360 //TODO performance: use DTO service to retrieve specimen descriptions without initializing all taxon descriptions
361 for(TaxonDescription taxonDescription
: taxon
.getDescriptions()){
362 if(dataSet
.getDescriptions().contains(taxonDescription
)
363 && taxonDescription
.getTypes().stream().anyMatch(type
->type
.equals(DescriptionType
.SECONDARY_DATA
))
364 && taxonDescription
.getTypes().stream().noneMatch(type
->type
.equals(DescriptionType
.CLONE_FOR_SOURCE
)) ){
365 result
.add(taxonDescription
);
/**
 * Creates an aggregated version of a single quantitative data record.
 * If exact values exist, min/max (and, for pure exact-value records, sample
 * size and average) are derived from them, combined with an explicit min/max
 * if present; otherwise the record is cloned and missing values are filled
 * according to the configuration.
 *
 * @return the aggregated data; may be <code>null</code> if missing-value
 *         handling decides to skip the record
 */
private QuantitativeData aggregateSingleQuantitativeData(QuantitativeData sourceQd){
    QuantitativeData aggQD = QuantitativeData.NewInstance(sourceQd.getFeature());
    Set<BigDecimal> exactValues = sourceQd.getExactValues();
    if(!exactValues.isEmpty()){
        Comparator<BigDecimal> comp = Comparator.naturalOrder();
        // qd is not already aggregated
        int exactValueSampleSize = exactValues.size();
        BigDecimal exactValueMin = exactValues.stream().min(comp).get();
        BigDecimal exactValueMax = exactValues.stream().max(comp).get();
        BigDecimal exactValueAvg = BigDecimalUtil.average(exactValues);
        //TODO also check for typical boundary data
        if(sourceQd.getMin() == null && sourceQd.getMax() == null){
            // pure exact-value record: sample size and average come from the exact values
            aggQD.setSampleSize(new BigDecimal(exactValueSampleSize), null);
            aggQD.setAverage(exactValueAvg, null);
        }
        aggQD.setMinimum(sourceQd.getMin() == null ? exactValueMin : sourceQd.getMin().min(exactValueMin), null);
        aggQD.setMaximum(sourceQd.getMax() == null ? exactValueMax : sourceQd.getMax().max(exactValueMax), null);
    }
    else{
        // qd has only min, max, ... but no exact values
        aggQD = sourceQd.clone();
        aggQD = handleMissingValues(aggQD);
    }
    return aggQD;
}
397 private QuantitativeData
handleMissingValues(QuantitativeData qd
) {
399 qd
= handleMissingMinOrMax(qd
);
401 if (qd
!= null && qd
.getAverage() == null){
402 BigDecimal n
= qd
.getSampleSize();
403 if(n
!= null && !n
.equals(0f
)){
404 BigDecimal average
= (qd
.getMax().add(qd
.getMin())).divide(n
);
405 qd
.setAverage(average
, null);
/**
 * Delegates to the static missing-min/max handling using the modes taken
 * from the configuration.
 */
private QuantitativeData handleMissingMinOrMax(QuantitativeData qd) {
    return handleMissingMinOrMax(qd, getConfig().getMissingMinimumMode(), getConfig().getMissingMaximumMode());
}
/**
 * Fills a missing minimum or maximum of the given quantitative data
 * according to the given modes.
 *
 * @param aggQD the data to complete
 * @param missingMinMode how to handle a missing minimum
 * @param missingMaxMode how to handle a missing maximum
 * @return the (possibly modified) data, or <code>null</code> if a mode
 *         requires the record to be skipped
 */
public static QuantitativeData handleMissingMinOrMax(QuantitativeData aggQD, MissingMinimumMode missingMinMode,
        MissingMaximumMode missingMaxMode) {
    if(aggQD.getMin() == null && aggQD.getMax() != null){
        if (missingMinMode == MissingMinimumMode.MinToZero) {
            aggQD.setMinimum(BigDecimal.valueOf(0f), null);
        }else if (missingMinMode == MissingMinimumMode.MinToMax){
            aggQD.setMinimum(aggQD.getMax(), null);
        }else if (missingMinMode == MissingMinimumMode.SkipRecord){
            return null;
        }
    }
    if(aggQD.getMax() == null && aggQD.getMin() != null){
        if (missingMaxMode == MissingMaximumMode.MaxToMin){
            aggQD.setMaximum(aggQD.getMin(), null);
        }else if (missingMaxMode == MissingMaximumMode.SkipRecord){
            return null;
        }
    }
    return aggQD;
}
436 private QuantitativeData
mergeQuantitativeData(QuantitativeData aggQd
, QuantitativeData newQd
) {
438 newQd
= aggregateSingleQuantitativeData(newQd
); //alternatively we could check, if newQd is already basically aggregated, but for this we need a clear definition what the minimum requirements are and how ExactValues and MinMax if existing in parallel should be handled.
440 BigDecimal min
= null;
441 BigDecimal max
= null;
442 BigDecimal average
= null;
443 BigDecimal sampleSize
= null;
444 newQd
= handleMissingValues(newQd
);
448 min
= aggQd
.getMin().min(newQd
.getMin());
449 max
= aggQd
.getMax().max(newQd
.getMax());
450 if (newQd
.getSampleSize() != null && aggQd
.getSampleSize() != null){
451 sampleSize
= newQd
.getSampleSize().add(aggQd
.getSampleSize());
453 if (sampleSize
!= null && !sampleSize
.equals(0f
) && aggQd
.getAverage() != null && newQd
.getAverage() != null){
454 BigDecimal aggTotalSum
= aggQd
.getAverage().multiply(aggQd
.getSampleSize(), MathContext
.DECIMAL32
);
455 BigDecimal newTotalSum
= newQd
.getAverage().multiply(newQd
.getSampleSize(), MathContext
.DECIMAL32
);
456 BigDecimal totalSum
= aggTotalSum
.add(newTotalSum
);
457 average
= totalSum
.divide(sampleSize
, MathContext
.DECIMAL32
).stripTrailingZeros(); //to be discussed if we really want to reduce precision here, however, due to the current way to compute average we do not have exact precision anyway
459 aggQd
.setMinimum(min
, null);
460 aggQd
.setMaximum(max
, null);
461 aggQd
.setSampleSize(sampleSize
, null);
462 aggQd
.setAverage(average
, null);
466 private static List
<BigDecimal
> getExactValues(QuantitativeData qd
) {
467 List
<BigDecimal
> exactValues
= qd
.getStatisticalValues().stream()
468 .filter(value
->value
.getType().equals(StatisticalMeasure
.EXACT_VALUE()))
469 .map(exact
->exact
.getValue())
470 .collect(Collectors
.toList());
474 private static boolean hasSameState(StateData sd1
, StateData sd2
) {
475 return sd2
.getState().getUuid().equals(sd1
.getState().getUuid());