cdmlib-services/src/main/java/eu/etaxonomy/cdm/api/service/description/StructuredDescriptionAggregation.java

   1 /**
   2 * Copyright (C) 2019 EDIT
   3 * European Distributed Institute of Taxonomy
   4 * http://www.e-taxonomy.eu
   5 *
   6 * The contents of this file are subject to the Mozilla Public License Version 1.1
   7 * See LICENSE.TXT at the top of this package for the full license terms.
   8 */
   9 package eu.etaxonomy.cdm.api.service.description;
  10
  11 import java.math.BigDecimal;
  12 import java.math.MathContext;
  13 import java.util.ArrayList;
  14 import java.util.Comparator;
  15 import java.util.HashMap;
  16 import java.util.HashSet;
  17 import java.util.List;
  18 import java.util.Map;
  19 import java.util.Map.Entry;
  20 import java.util.Set;
  21 import java.util.stream.Collectors;
  22
  23 import eu.etaxonomy.cdm.common.BigDecimalUtil;
  24 import eu.etaxonomy.cdm.common.monitor.IProgressMonitor;
  25 import eu.etaxonomy.cdm.model.common.CdmBase;
  26 import eu.etaxonomy.cdm.model.common.IdentifiableSource;
  27 import eu.etaxonomy.cdm.model.description.CategoricalData;
  28 import eu.etaxonomy.cdm.model.description.DescriptionBase;
  29 import eu.etaxonomy.cdm.model.description.DescriptionElementBase;
  30 import eu.etaxonomy.cdm.model.description.DescriptionType;
  31 import eu.etaxonomy.cdm.model.description.DescriptiveDataSet;
  32 import eu.etaxonomy.cdm.model.description.Feature;
  33 import eu.etaxonomy.cdm.model.description.IndividualsAssociation;
  34 import eu.etaxonomy.cdm.model.description.QuantitativeData;
  35 import eu.etaxonomy.cdm.model.description.SpecimenDescription;
  36 import eu.etaxonomy.cdm.model.description.State;
  37 import eu.etaxonomy.cdm.model.description.StateData;
  38 import eu.etaxonomy.cdm.model.description.StatisticalMeasure;
  39 import eu.etaxonomy.cdm.model.description.TaxonDescription;
  40 import eu.etaxonomy.cdm.model.occurrence.SpecimenOrObservationBase;
  41 import eu.etaxonomy.cdm.model.reference.OriginalSourceType;
  42 import eu.etaxonomy.cdm.model.taxon.Taxon;
  43 import eu.etaxonomy.cdm.model.taxon.TaxonNode;
  44
  45 /**
  46  * Aggregates the character data for a given {@link DescriptiveDataSet}.<br>
  47  * <br>
 * For all {@link SpecimenDescription}s belonging to this data set, new
 * aggregated {@link TaxonDescription}s are created for every taxon the
 * specimens are directly associated with.<BR>
 * Lower rank taxon descriptions are also aggregated to upper rank taxa.
  52  *
  53  * @author a.mueller
  54  * @author p.plitzner
  55  * @since 03.11.2019
  56  */
  57 public class StructuredDescriptionAggregation
  58         extends DescriptionAggregationBase<StructuredDescriptionAggregation, StructuredDescriptionAggregationConfiguration>{
  59
  60     private DescriptiveDataSet dataSet;
  61
  62     @Override
  63     protected String pluralDataType(){
  64         return "structured descriptive data";
  65     }
  66
  67     @Override
  68     protected void preAggregate(IProgressMonitor monitor) {
  69         monitor.subTask("preAccumulate - nothing to do");
  70
  71         // take start time for performance testing
  72         double start = System.currentTimeMillis();
  73
  74         getResult().setCdmEntity(getDescriptiveDatasetService().load(getConfig().getDatasetUuid()));
  75
  76         double end1 = System.currentTimeMillis();
  77         logger.info("Time elapsed for pre-accumulate() : " + (end1 - start) / (1000) + "s");
  78     }
  79
  80
  81     private boolean hasCharacterData(DescriptionElementBase element) {
  82         return hasCategoricalData(element) || hasQuantitativeData(element);
  83     }
  84
  85     private boolean hasQuantitativeData(DescriptionElementBase element) {
  86         if(element instanceof QuantitativeData
  87                 && !((QuantitativeData) element).getStatisticalValues().isEmpty()){
  88             QuantitativeData quantitativeData = (QuantitativeData)element;
  89             return !getExactValues(quantitativeData).isEmpty()
  90                     || quantitativeData.getMin()!=null
  91                     || quantitativeData.getMax()!=null;
  92         }
  93         return false;
  94     }
  95
  96     private boolean hasCategoricalData(DescriptionElementBase element) {
  97         return element instanceof CategoricalData && !((CategoricalData) element).getStatesOnly().isEmpty();
  98     }
  99
 100     @Override
 101     protected void setDescriptionTitle(TaxonDescription description, Taxon taxon) {
 102         String title = taxon.getName() != null? taxon.getName().getTitleCache() : taxon.getTitleCache();
 103         description.setTitleCache("Aggregated description for " + title, true);
 104         return;
 105     }
 106
 107     @Override
 108     protected TaxonDescription createNewDescription(Taxon taxon) {
 109         String title = taxon.getTitleCache();
 110         logger.debug("creating new description for " + title);
 111         TaxonDescription description = TaxonDescription.NewInstance(taxon);
 112         description.addType(DescriptionType.AGGREGATED_STRUC_DESC);
 113         setDescriptionTitle(description, taxon);
 114         return description;
 115     }
 116
 117     @Override
 118     protected boolean hasDescriptionType(TaxonDescription description) {
 119         return dataSet.getDescriptions().contains(description) && description.isAggregatedStructuredDescription();
 120     }
 121
 122     @Override
 123     protected List<String> descriptionInitStrategy() {
 124         return new ArrayList<>();
 125     }
 126
 127     @Override
 128     protected void addAggregationResultToDescription(TaxonDescription targetDescription,
 129             ResultHolder resultHolder) {
 130         StructuredDescriptionResultHolder structuredResultHolder = (StructuredDescriptionResultHolder)resultHolder;
 131
 132         replaceExistingDescriptionElements(targetDescription, structuredResultHolder.categoricalMap);
 133         replaceExistingDescriptionElements(targetDescription, structuredResultHolder.quantitativeMap);
 134         addAggregationSources(targetDescription, structuredResultHolder);
 135
 136         if(!targetDescription.getElements().isEmpty()){
 137             dataSet.addDescription(targetDescription);
 138         }
 139     }
 140
 141     private void addAggregationSources(TaxonDescription targetDescription,
 142             StructuredDescriptionResultHolder structuredResultHolder) {
 143         //FIXME Re-use sources if possible
 144         //Remove sources from description
 145         Set<IdentifiableSource> sourcesToRemove = targetDescription.getSources().stream()
 146                 .filter(source->source.getType().equals(OriginalSourceType.Aggregation))
 147                 .collect(Collectors.toSet());
 148
 149         for (IdentifiableSource source : sourcesToRemove) {
 150             targetDescription.removeSource(source);
 151         }
 152
 153         Set<DescriptionBase<?>> sourceDescriptions = structuredResultHolder.sourceDescriptions;
 154         for (DescriptionBase<?> descriptionBase : sourceDescriptions) {
 155             DescriptionBase<?> sourceDescription = null;
 156             if(descriptionBase.isInstanceOf(SpecimenDescription.class)){
 157                 DescriptionBase<?> clone = descriptionBase.clone();
 158                 clone.removeDescriptiveDataSet(dataSet);
 159                 clone.getTypes().add(DescriptionType.CLONE_FOR_SOURCE);
 160                 SpecimenOrObservationBase<?> specimen = CdmBase.deproxy(descriptionBase, SpecimenDescription.class).getDescribedSpecimenOrObservation();
 161                 specimen.addDescription(CdmBase.deproxy(clone, SpecimenDescription.class));
 162                 sourceDescription=clone;
 163             }
 164             else if(descriptionBase.isInstanceOf(TaxonDescription.class)){
 165                 Taxon taxon = CdmBase.deproxy(descriptionBase, TaxonDescription.class).getTaxon();
 166                 taxon.addDescription(CdmBase.deproxy(descriptionBase, TaxonDescription.class));
 167                 sourceDescription=descriptionBase;
 168             }
 169             if(sourceDescription!=null){
 170                 targetDescription.addAggregationSource(sourceDescription);
 171             }
 172         }
 173     }
 174
 175     private void replaceExistingDescriptionElements(TaxonDescription targetDescription,
 176             Map<Feature, ? extends DescriptionElementBase> elementMap) {
 177         for (Entry<Feature, ? extends DescriptionElementBase> entry : elementMap.entrySet()) {
 178             DescriptionElementBase elementToRemove = null;
 179             DescriptionElementBase elementReplacement = null;
 180             for (DescriptionElementBase descriptionElementBase : targetDescription.getElements()) {
 181                 if(descriptionElementBase.getFeature().equals(entry.getKey())){
 182                     elementToRemove = descriptionElementBase;
 183                     elementReplacement = entry.getValue();
 184                     break;
 185                 }
 186             }
 187             if(elementToRemove!=null && elementReplacement!=null){
 188                 targetDescription.removeElement(elementToRemove);
 189                 targetDescription.addElement(elementReplacement);
 190             }
 191             else{
 192                 targetDescription.addElement(entry.getValue());
 193             }
 194         }
 195     }
 196
 197     @Override
 198     protected void initTransaction() {
 199         dataSet = getDescriptiveDatasetService().load(getConfig().getDatasetUuid());
 200     }
 201
 202     @Override
 203     protected void removeDescriptionIfEmpty(TaxonDescription description) {
 204         super.removeDescriptionIfEmpty(description);
 205         if (description.getElements().isEmpty()){
 206             dataSet.removeDescription(description);
 207         }
 208     }
 209
 210     @Override
 211     protected void aggregateToParentTaxon(TaxonNode taxonNode,
 212             ResultHolder resultHolder,
 213             Set<TaxonDescription> excludedDescriptions) {
 214         StructuredDescriptionResultHolder descriptiveResultHolder = (StructuredDescriptionResultHolder)resultHolder;
 215         Set<TaxonDescription> childDescriptions = getChildTaxonDescriptions(taxonNode, dataSet);
 216         addDescriptionElement(descriptiveResultHolder, childDescriptions);
 217     }
 218
 219     @Override
 220     protected void aggregateWithinSingleTaxon(Taxon taxon,
 221             ResultHolder resultHolder,
 222             Set<TaxonDescription> excludedDescriptions) {
 223         StructuredDescriptionResultHolder descriptiveResultHolder = (StructuredDescriptionResultHolder)resultHolder;
 224         Set<SpecimenDescription> specimenDescriptions = getSpecimenDescriptions(taxon, dataSet);
 225         addDescriptionElement(descriptiveResultHolder, specimenDescriptions);
 226         Set<TaxonDescription> literatureDescriptions = getLiteratureDescriptions(taxon, dataSet);
 227         addDescriptionElement(descriptiveResultHolder, literatureDescriptions);
 228         //TODO add defaultDescriptions
 229
 230     }
 231
 232     private void addDescriptionElement(StructuredDescriptionResultHolder descriptiveResultHolder,
 233             Set<? extends DescriptionBase<?>> descriptions) {
 234         boolean descriptionWasUsed = false;
 235         for (DescriptionBase<?> desc: descriptions){
 236             for (DescriptionElementBase deb: desc.getElements()){
 237                 if (hasCharacterData(deb)){
 238                     if (deb.isInstanceOf(CategoricalData.class)){
 239                         addToCategorical(CdmBase.deproxy(deb, CategoricalData.class), descriptiveResultHolder);
 240                         descriptionWasUsed = true;
 241                     }else if (deb.isInstanceOf(QuantitativeData.class)){
 242                         addToQuantitative(CdmBase.deproxy(deb, QuantitativeData.class), descriptiveResultHolder);
 243                         descriptionWasUsed = true;
 244                     }
 245                 }
 246             }
 247             if(descriptionWasUsed){
 248                 descriptiveResultHolder.sourceDescriptions.add(desc);
 249             }
 250         }
 251     }
 252
 253     private void addToQuantitative(QuantitativeData qd, StructuredDescriptionResultHolder resultHolder) {
 254         QuantitativeData aggregatedQuantitativeData = resultHolder.quantitativeMap.get(qd.getFeature());
 255         if(aggregatedQuantitativeData==null){
 256             // no QuantitativeData with this feature in aggregation
 257             aggregatedQuantitativeData = aggregateSingleQuantitativeData(qd);
 258         }
 259         else{
 260             aggregatedQuantitativeData = mergeQuantitativeData(aggregatedQuantitativeData, qd);
 261         }
 262         if (aggregatedQuantitativeData != null){
 263             resultHolder.quantitativeMap.put(qd.getFeature(), aggregatedQuantitativeData);
 264         }
 265     }
 266
 267     private void addToCategorical(CategoricalData cd, StructuredDescriptionResultHolder resultHolder) {
 268         CategoricalData aggregatedCategoricalData = resultHolder.categoricalMap.get(cd.getFeature());
 269         if(aggregatedCategoricalData==null){
 270             // no CategoricalData with this feature in aggregation
 271             aggregatedCategoricalData = cd.clone();
 272             // set count to 1 if not set
 273             aggregatedCategoricalData.getStateData().stream().filter(sd->sd.getCount()==null).forEach(sd->sd.incrementCount());
 274             resultHolder.categoricalMap.put(aggregatedCategoricalData.getFeature(), aggregatedCategoricalData);
 275         }
 276         else{
 277             // split all StateData into those where the state already exists and those where it doesn't
 278             List<State> statesOnly = aggregatedCategoricalData.getStatesOnly();
 279             List<StateData> sdWithExistingStateInAggregation = cd.getStateData().stream().filter(sd->statesOnly.contains(sd.getState())).collect(Collectors.toList());
 280             List<StateData> sdWithNoExistingStateInAggregation = cd.getStateData().stream().filter(sd->!statesOnly.contains(sd.getState())).collect(Collectors.toList());
 281
 282             for (StateData sd : sdWithNoExistingStateInAggregation) {
 283                 StateData clone = sd.clone();
 284                 // set count to 1 if not set
 285                 if(clone.getCount()==null){
 286                     clone.incrementCount();
 287                 }
 288                 aggregatedCategoricalData.addStateData(clone);
 289             }
 290
 291             for (StateData sdExist : sdWithExistingStateInAggregation) {
 292                 List<StateData> aggregatedSameStateData = aggregatedCategoricalData.getStateData().stream()
 293                 .filter(sd->hasSameState(sdExist, sd))
 294                 .collect(Collectors.toList());
 295                 for (StateData stateData : aggregatedSameStateData) {
 296                     if(sdExist.getCount()==null){
 297                         stateData.incrementCount();
 298                     }
 299                     else{
 300                         stateData.setCount(stateData.getCount()+sdExist.getCount());
 301                     }
 302                 }
 303             }
 304         }
 305     }
 306
 307     @Override
 308     protected StructuredDescriptionResultHolder createResultHolder() {
 309         return new StructuredDescriptionResultHolder();
 310     }
 311
 312     private class StructuredDescriptionResultHolder implements ResultHolder{
 313         Map<Feature, CategoricalData> categoricalMap = new HashMap<>();
 314         Map<Feature, QuantitativeData> quantitativeMap = new HashMap<>();
 315         Set<DescriptionBase<?>> sourceDescriptions = new HashSet<>();
 316         @Override
 317         public String toString() {
 318             return "SDResultHolder [categoricals=" + categoricalMap.size() + ", quantitatives="
 319                     + quantitativeMap.size() + ", sourceDescriptions=" + sourceDescriptions.size() + "]";
 320         }
 321     }
 322
 323     private Set<TaxonDescription> getChildTaxonDescriptions(TaxonNode taxonNode, DescriptiveDataSet dataSet) {
 324         Set<TaxonDescription> result = new HashSet<>();
 325         List<TaxonNode> childNodes = taxonNode.getChildNodes();
 326         for (TaxonNode childNode : childNodes) {
 327             Set<TaxonDescription> childDescriptions = childNode.getTaxon().getDescriptions();
 328             result.addAll(childDescriptions.stream()
 329                 .filter(desc->desc.getTypes().contains(DescriptionType.AGGREGATED_STRUC_DESC))
 330                 .filter(desc->dataSet.getDescriptions().contains(desc))
 331                 .collect(Collectors.toSet()));
 332         }
 333         return result;
 334     }
 335
 336     private Set<SpecimenDescription> getSpecimenDescriptions(Taxon taxon, DescriptiveDataSet dataSet) {
 337         Set<SpecimenDescription> result = new HashSet<>();
 338         //TODO performance: use DTO service to retrieve specimen descriptions without initializing all taxon descriptions
 339         for (TaxonDescription taxonDesc: taxon.getDescriptions()){
 340             for (DescriptionElementBase taxonDeb : taxonDesc.getElements()){
 341                 if (taxonDeb.isInstanceOf(IndividualsAssociation.class)){
 342                     IndividualsAssociation indAss = CdmBase.deproxy(taxonDeb, IndividualsAssociation.class);
 343                     SpecimenOrObservationBase<?> specimen = indAss.getAssociatedSpecimenOrObservation();
 344                     @SuppressWarnings({ "unchecked", "rawtypes" })
 345                     Set<SpecimenDescription> descriptions = (Set)specimen.getDescriptions();
 346                     for(SpecimenDescription specimenDescription : descriptions){
 347                         if(dataSet.getDescriptions().contains(specimenDescription) &&
 348                                 specimenDescription.getTypes().stream().noneMatch(type->type.equals(DescriptionType.CLONE_FOR_SOURCE))){
 349                             result.add(specimenDescription);
 350                         }
 351                     }
 352                 }
 353             }
 354         }
 355         return result;
 356     }
 357
 358     private Set<TaxonDescription> getLiteratureDescriptions(Taxon taxon, DescriptiveDataSet dataSet) {
 359         Set<TaxonDescription> result = new HashSet<>();
 360         //TODO performance: use DTO service to retrieve specimen descriptions without initializing all taxon descriptions
 361         for(TaxonDescription taxonDescription : taxon.getDescriptions()){
 362             if(dataSet.getDescriptions().contains(taxonDescription)
 363                     && taxonDescription.getTypes().stream().anyMatch(type->type.equals(DescriptionType.SECONDARY_DATA))
 364                     && taxonDescription.getTypes().stream().noneMatch(type->type.equals(DescriptionType.CLONE_FOR_SOURCE)) ){
 365                 result.add(taxonDescription);
 366             }
 367         }
 368         return result;
 369     }
 370
 371     private QuantitativeData aggregateSingleQuantitativeData(QuantitativeData sourceQd){
 372         QuantitativeData aggQD = QuantitativeData.NewInstance(sourceQd.getFeature());
 373         Set<BigDecimal> exactValues = sourceQd.getExactValues();
 374         if(!exactValues.isEmpty()){
 375             Comparator<BigDecimal> comp = Comparator.naturalOrder();
 376             // qd is not already aggregated
 377             int exactValueSampleSize = exactValues.size();
 378             BigDecimal exactValueMin = exactValues.stream().min(comp).get();
 379             BigDecimal exactValueMax = exactValues.stream().max(comp).get();
 380             BigDecimal exactValueAvg = BigDecimalUtil.average(exactValues);
 381             //TODO also check for typical boundary data
 382             if(sourceQd.getMin() == null && sourceQd.getMax() == null){
 383                 aggQD.setSampleSize(new BigDecimal(exactValueSampleSize), null);
 384                 aggQD.setAverage(exactValueAvg, null);
 385             }
 386             aggQD.setMinimum(sourceQd.getMin() == null ? exactValueMin: sourceQd.getMin().min(exactValueMin), null);
 387             aggQD.setMaximum(sourceQd.getMax() == null ? exactValueMax: sourceQd.getMax().max(exactValueMax), null);
 388         }
 389         else{
 390             // qd has only min, max, ... but no exact values
 391             aggQD = sourceQd.clone();
 392             aggQD = handleMissingValues(aggQD);
 393         }
 394         return aggQD;
 395     }
 396
 397     private QuantitativeData handleMissingValues(QuantitativeData qd) {
 398         //min max
 399         qd = handleMissingMinOrMax(qd);
 400         //average
 401         if (qd != null && qd.getAverage() == null){
 402             BigDecimal n = qd.getSampleSize();
 403             if(n != null && !n.equals(0f)){
 404                 BigDecimal average = (qd.getMax().add(qd.getMin())).divide(n);
 405                 qd.setAverage(average, null);
 406             }
 407         }
 408         return qd;
 409     }
 410
 411     private QuantitativeData handleMissingMinOrMax(QuantitativeData qd) {
 412         return handleMissingMinOrMax(qd, getConfig().getMissingMinimumMode(), getConfig().getMissingMaximumMode());
 413     }
 414
 415     public static QuantitativeData handleMissingMinOrMax(QuantitativeData aggQD, MissingMinimumMode missingMinMode,
 416             MissingMaximumMode missingMaxMode) {
 417         if(aggQD.getMin() == null && aggQD.getMax() != null){
 418             if (missingMinMode == MissingMinimumMode.MinToZero) {
 419                 aggQD.setMinimum(BigDecimal.valueOf(0f), null);
 420             }else if (missingMinMode == MissingMinimumMode.MinToMax){
 421                 aggQD.setMinimum(aggQD.getMax(), null);
 422             }else if (missingMinMode == MissingMinimumMode.SkipRecord){
 423                 return null;
 424             }
 425         }
 426         if(aggQD.getMax() == null && aggQD.getMin() != null){
 427             if (missingMaxMode == MissingMaximumMode.MaxToMin){
 428                 aggQD.setMaximum(aggQD.getMin(), null);
 429             }else if (missingMaxMode == MissingMaximumMode.SkipRecord){
 430                 return null;
 431             }
 432         }
 433         return aggQD;
 434     }
 435
 436     private QuantitativeData mergeQuantitativeData(QuantitativeData aggQd, QuantitativeData newQd) {
 437
 438         newQd = aggregateSingleQuantitativeData(newQd); //alternatively we could check, if newQd is already basically aggregated, but for this we need a clear definition what the minimum requirements are and how ExactValues and MinMax if existing in parallel should be handled.
 439
 440         BigDecimal min = null;
 441         BigDecimal max = null;
 442         BigDecimal average = null;
 443         BigDecimal sampleSize = null;
 444         newQd = handleMissingValues(newQd);
 445         if (newQd == null){
 446             return aggQd;
 447         }
 448         min = aggQd.getMin().min(newQd.getMin());
 449         max = aggQd.getMax().max(newQd.getMax());
 450         if (newQd.getSampleSize() != null && aggQd.getSampleSize() != null){
 451             sampleSize = newQd.getSampleSize().add(aggQd.getSampleSize());
 452         }
 453         if (sampleSize != null && !sampleSize.equals(0f) && aggQd.getAverage() != null && newQd.getAverage() != null){
 454             BigDecimal aggTotalSum = aggQd.getAverage().multiply(aggQd.getSampleSize(), MathContext.DECIMAL32);
 455             BigDecimal newTotalSum = newQd.getAverage().multiply(newQd.getSampleSize(), MathContext.DECIMAL32);
 456             BigDecimal totalSum = aggTotalSum.add(newTotalSum);
 457             average = totalSum.divide(sampleSize, MathContext.DECIMAL32).stripTrailingZeros();  //to be discussed if we really want to reduce precision here, however, due to the current way to compute average we do not have exact precision anyway
 458         }
 459         aggQd.setMinimum(min, null);
 460         aggQd.setMaximum(max, null);
 461         aggQd.setSampleSize(sampleSize, null);
 462         aggQd.setAverage(average, null);
 463         return aggQd;
 464     }
 465
 466     private static List<BigDecimal> getExactValues(QuantitativeData qd) {
 467         List<BigDecimal> exactValues = qd.getStatisticalValues().stream()
 468                 .filter(value->value.getType().equals(StatisticalMeasure.EXACT_VALUE()))
 469                 .map(exact->exact.getValue())
 470                 .collect(Collectors.toList());
 471         return exactValues;
 472     }
 473
 474     private static boolean hasSameState(StateData sd1, StateData sd2) {
 475         return sd2.getState().getUuid().equals(sd1.getState().getUuid());
 476     }
 477 }