08f11b8bc11067f819df630210415b66c606bfe3
[cdmlib.git] / cdmlib-model / src / main / java / eu / etaxonomy / cdm / strategy / generate / PolytomousKeyGenerator.java
1 package eu.etaxonomy.cdm.strategy.generate;
2
3 import java.math.BigDecimal;
4 import java.util.ArrayList;
5 import java.util.Arrays;
6 import java.util.Collection;
7 import java.util.Comparator;
8 import java.util.HashMap;
9 import java.util.HashSet;
10 import java.util.Iterator;
11 import java.util.List;
12 import java.util.Map;
13 import java.util.Map.Entry;
14 import java.util.Set;
15 import java.util.UUID;
16
17 import org.apache.logging.log4j.LogManager;
18 import org.apache.logging.log4j.Logger;
19
20 import eu.etaxonomy.cdm.common.BigDecimalUtil;
21 import eu.etaxonomy.cdm.model.common.CdmBase;
22 import eu.etaxonomy.cdm.model.common.Language;
23 import eu.etaxonomy.cdm.model.description.CategoricalData;
24 import eu.etaxonomy.cdm.model.description.DescriptionBase;
25 import eu.etaxonomy.cdm.model.description.DescriptionElementBase;
26 import eu.etaxonomy.cdm.model.description.Feature;
27 import eu.etaxonomy.cdm.model.description.FeatureState;
28 import eu.etaxonomy.cdm.model.description.KeyStatement;
29 import eu.etaxonomy.cdm.model.description.PolytomousKey;
30 import eu.etaxonomy.cdm.model.description.PolytomousKeyNode;
31 import eu.etaxonomy.cdm.model.description.QuantitativeData;
32 import eu.etaxonomy.cdm.model.description.SpecimenDescription;
33 import eu.etaxonomy.cdm.model.description.StateData;
34 import eu.etaxonomy.cdm.model.description.StatisticalMeasure;
35 import eu.etaxonomy.cdm.model.description.StatisticalMeasurementValue;
36 import eu.etaxonomy.cdm.model.description.TaxonDescription;
37 import eu.etaxonomy.cdm.model.occurrence.SpecimenOrObservationBase;
38 import eu.etaxonomy.cdm.model.taxon.Taxon;
39 import eu.etaxonomy.cdm.model.taxon.TaxonNode;
40 import eu.etaxonomy.cdm.model.term.DefinedTermBase;
41 import eu.etaxonomy.cdm.model.term.TermNode;
42
43 /**
44 * @author m.venin
45 * @author a.mueller
46 */
47 public class PolytomousKeyGenerator {
48
49 private static final Logger logger = LogManager.getLogger(PolytomousKeyGenerator.class);
50
51 /**
52 * Strings used for generating the statements of the key.
53 */
54 private static final String before="<";
55 private static final String after=">";
56 private static final String separator = " or ";
57
58
59 private PolytomousKeyGeneratorConfigurator config;
60
61 private Map<FeatureState,Set<Feature>> iAifDependencies = new HashMap<>(); // map of a set of Features (value) inapplicables if a State (key) is present
62 private Map<FeatureState,Set<Feature>> oAifDependencies = new HashMap<>(); // map of a set of Features (value) only applicables if a State (key) is present
63 private Map<Feature,Set<Feature>> featureDependencies = new HashMap<>(); // map of all the sets of features (values) which have dependencies with states of other features (keys)
64
65 private class KeyTaxon{
66 private UUID uuid;
67 private Taxon taxon;
68 private SpecimenOrObservationBase<?> specimen;
69 private Map<Feature,Set<CategoricalData>> categoricalData = new HashMap<>();
70 private Map<Feature,Set<QuantitativeData>> quantitativeData = new HashMap<>();
71 private Set<KeyTaxon> children = new HashSet<>();
72
73 private Set<CategoricalData> getCategoricalData(Feature feature){
74 return categoricalData.get(feature) == null? new HashSet<>():categoricalData.get(feature);
75 }
76 private Set<QuantitativeData> getQuantitativeData(Feature feature){
77 return quantitativeData.get(feature) == null? new HashSet<>():quantitativeData.get(feature);
78 }
79
80 private void addDescription(DescriptionBase<?> db) {
81 for (DescriptionElementBase deb : db.getElements()){
82 Feature feature = deb.getFeature();
83 if (deb.isCharacterData()){
84 if (deb.isInstanceOf(CategoricalData.class)){
85 CategoricalData cd = CdmBase.deproxy(deb, CategoricalData.class);
86 if (categoricalData.get(feature)== null){
87 categoricalData.put(feature, new HashSet<>());
88 }
89 categoricalData.get(feature).add(cd);
90 }else if (deb.isInstanceOf(QuantitativeData.class)){
91 QuantitativeData qd = CdmBase.deproxy(deb, QuantitativeData.class);
92 if (quantitativeData.get(feature)== null){
93 quantitativeData.put(feature, new HashSet<>());
94 }
95 quantitativeData.get(feature).add(qd);
96 }
97 }
98 }
99 }
100
101 @Override
102 public String toString() {
103 return taxon.getTitleCache(); // + "KeyTaxon [uuid=" + uuid + (taxon != null? ", " + taxon.getTitleCache():"") + "]"; // uuid=" + uuid +
104 }
105 }
106
107 // *************************** METHODS ***************************************/
108
109 /**
110 * Creates the key
111 */
112 public PolytomousKey invoke(PolytomousKeyGeneratorConfigurator config){
113 if (config == null){
114 throw new NullPointerException("PolytomousKeyGeneratorConfigurator must not be null");
115 }
116 this.config = config;
117 if (this.config.isUseDependencies()){
118 createDependencies(config.getDependenciesTree().getRoot());
119 }
120 PolytomousKey polytomousKey = PolytomousKey.NewInstance();
121 PolytomousKeyNode root = polytomousKey.getRoot();
122 @SuppressWarnings("unchecked")
123 Set<KeyTaxon> taxaCovered = makeKeyTaxa((Set)config.getTaxonDescriptions());
124 taxaCovered = replaceSingleRoot(taxaCovered);
125
126 //filter if a feature is available only for certain states in this branche
127 Map<Feature, Set<DefinedTermBase<?>>> featureStatesFilter = new HashMap<>();
128 //TODO what if size(taxaCovered) <= 1, is this covered by algo? Write test to check
129 buildBranches(root, config.getFeatures(), taxaCovered, featureStatesFilter);
130 return polytomousKey;
131 }
132
133 /**
134 * If the root taxon is a single taxon but has children
135 * it will be replaced by it's children.
136 */
137 private Set<KeyTaxon> replaceSingleRoot(Set<KeyTaxon> taxaCovered) {
138 if (this.config.isUseTaxonHierarchy() && taxaCovered.size()==1
139 && !taxaCovered.iterator().next().children.isEmpty()){
140 return replaceSingleRoot(taxaCovered.iterator().next().children);
141 }else{
142 return taxaCovered;
143 }
144 }
145
146 private Set<KeyTaxon> makeKeyTaxa(Set<DescriptionBase<?>> descriptions) {
147
148 Map<UUID,KeyTaxon> taxonMap = new HashMap<>();
149 for (DescriptionBase<?> db : descriptions){
150 KeyTaxon taxon = new KeyTaxon();
151 if (db.isInstanceOf(TaxonDescription.class)){
152 TaxonDescription td = CdmBase.deproxy(db, TaxonDescription.class);
153 taxon.uuid = td.getTaxon().getUuid();
154 taxon.taxon = td.getTaxon();
155 }else if (db.isInstanceOf(SpecimenDescription.class)){
156 SpecimenDescription sd = CdmBase.deproxy(db, SpecimenDescription.class);
157 taxon.uuid = sd.getDescribedSpecimenOrObservation().getUuid();
158 taxon.specimen = sd.getDescribedSpecimenOrObservation();
159 }else{
160 throw new RuntimeException("Unhandled entity type " + db.getClass().getName());
161 }
162 if (taxonMap.get(taxon.uuid)!= null){
163 taxon = taxonMap.get(taxon.uuid);
164 }else{
165 taxonMap.put(taxon.uuid, taxon);
166 }
167 taxon.addDescription(db);
168 }
169
170 createTaxonHierarchy(taxonMap);
171
172 return new HashSet<>(taxonMap.values());
173 }
174
175 private void createTaxonHierarchy(Map<UUID, KeyTaxon> taxonMap) {
176 if(config.isUseTaxonHierarchy()==false){
177 return;
178 }
179 Set<KeyTaxon> taxaToTest = new HashSet<>(taxonMap.values());
180 for(KeyTaxon taxon:taxaToTest){
181 KeyTaxon parent = getBestTaxonParent(taxon, taxaToTest);
182 if (parent != null){
183 parent.children.add(taxon);
184 taxonMap.remove(taxon.uuid);
185 }
186 }
187 }
188
189 private KeyTaxon getBestTaxonParent(KeyTaxon taxon, Collection<KeyTaxon> values) {
190 KeyTaxon parent = null;
191 String parentIndex = "";
192 String myTreeIndex = getTaxonTreeIndex(taxon);
193 if (myTreeIndex != null) {
194 for (KeyTaxon candidate:values){
195 String candidateIndex = getTaxonTreeIndex(candidate);
196 if (candidateIndex == null || myTreeIndex.equals(candidateIndex)){
197 continue;
198 }
199 if (myTreeIndex.startsWith(candidateIndex)){
200 if (candidateIndex.length()> parentIndex.length()){
201 parent = candidate;
202 parentIndex = candidateIndex;
203 }
204 }
205 }
206 }
207 return parent;
208 }
209
210 private String getTaxonTreeIndex(KeyTaxon taxon) {
211 if (taxon.taxon.getTaxonNodes().isEmpty()){
212 return null;
213 }
214 //TODOO size>1 or classification check
215 TaxonNode node = taxon.taxon.getTaxonNodes().iterator().next();
216 String treeIndex = node.treeIndex();
217 if (treeIndex == null){
218 //unpersisted, this should only happen during test, create provisional treeindex
219 treeIndex = getParentTreeIndex(node) + node.getUuid().toString() + "#" ;
220 }
221 return treeIndex;
222 }
223
224 private String getParentTreeIndex(TaxonNode node) {
225 TaxonNode parent = node.getParent();
226 if (parent == null ){
227 return "#";
228 }else{
229 return getParentTreeIndex(parent) + parent.getUuid().toString() + "#" ;
230 }
231 }
232
233 /**
234 * Recursive function that builds the branches of the identification key
235 *
236 * @param parent the node considered
237 * @param featuresLeft List of features that can be used at this point
238 * @param taxaCovered the taxa left at this point (i.e. that verify the description corresponding to the path leading to this node)
239 * @param featureStatesFilter
240 */
241 private void buildBranches(PolytomousKeyNode parent, List<Feature> featuresLeft, Set<KeyTaxon> taxaCovered,
242 Map<Feature, Set<DefinedTermBase<?>>> featureStatesFilter){
243
244 //handle all branches taxa =>
245 Set<KeyTaxon> allBranchesTaxa = getAllBranchesTaxa(featuresLeft, taxaCovered, featureStatesFilter);
246 if (allBranchesTaxa.size()>0){
247 if (allBranchesTaxa.size()>1){
248 //TODO test if this case in handled and displayed correctly
249 logger.warn(">1 final taxa in inner node");
250 }
251 taxaCovered.removeAll(allBranchesTaxa);
252 if(taxaCovered.size() != 1){
253 handleLeaf(parent, allBranchesTaxa);
254 }else{
255 //if only 1 is left it is better to handle all remaining in sub-branch to make difference clearer
256 taxaCovered.addAll(allBranchesTaxa);
257 }
258 }
259
260 //start real branching
261 if (taxaCovered.size()<=1){
262 //do nothing
263 logger.warn("Only 1 or no taxon covered. This should currently only be possible on top level and is not yet handled. ");
264 }else {
265 // this map stores the thresholds giving the best dichotomy of taxa for the corresponding feature supporting quantitative data
266 Map<Feature,BigDecimal> quantitativeFeaturesThresholds = new HashMap<>();
267 // the scores of the different features are calculated, the thresholds in the same time
268 if (config.isDebug()){
269 System.out.println("Feature left: " + featuresLeft + ", taxa: " + taxaCovered);
270 }
271 Feature winnerFeature = computeScores(featuresLeft, taxaCovered, quantitativeFeaturesThresholds, featureStatesFilter);
272
273 if(winnerFeature != null){
274 /************** either the feature supports quantitative data... **************/
275 // NB: in this version, "quantitative features" are dealt with in a dichotomous way
276 if (winnerFeature.isSupportsQuantitativeData()) {
277 featuresLeft.add(winnerFeature); //as quantitative data are currently only split in 2 parts there might be further splits possible and needed
278 handleQuantitativeData(parent, featuresLeft, taxaCovered,
279 quantitativeFeaturesThresholds, winnerFeature, featureStatesFilter);
280 }
281 /************** ...or it supports categorical data. **************/
282 else if (winnerFeature.isSupportsCategoricalData()) {
283 handleCategorialFeature(parent, featuresLeft, taxaCovered,
284 winnerFeature, featureStatesFilter);
285 }else{
286 throw new RuntimeException("Winner feature does not support character data.");
287 }
288 // the winner features are put back to the features left once the branch is done
289 if (!featuresLeft.contains(winnerFeature)){ //TODO why is a list and not a set?
290 featuresLeft.add(winnerFeature);
291 }
292 }else if (featuresLeft.isEmpty()){
293 handleLeaf(parent, taxaCovered);
294 }else{
295 throw new RuntimeException("No winner feature but features left to handle should not happen.");
296 }
297 }
298 return;
299 }
300
301 private Set<KeyTaxon> getAllBranchesTaxa(List<Feature> featuresLeft, Set<KeyTaxon> taxaCovered,
302 Map<Feature, Set<eu.etaxonomy.cdm.model.term.DefinedTermBase<?>>> featureStatesFilter) {
303
304 Set<KeyTaxon> candidates = new HashSet<>(taxaCovered);
305 List<Feature> dependendFeatures = new ArrayList<>();
306 for (Feature feature : featuresLeft){
307 if(feature.isSupportsCategoricalData()){
308 Set<DefinedTermBase<?>> allStates = getAllStates(feature, taxaCovered, featureStatesFilter.get(feature));
309 Iterator<KeyTaxon> it = candidates.iterator();
310 while (it.hasNext()){
311 Set<KeyTaxon> taxonSet = new HashSet<>(Arrays.asList(it.next()));
312 Set<DefinedTermBase<?>> taxonStates = getAllStates(feature, taxonSet, featureStatesFilter.get(feature));
313 if(allStates.size() > taxonStates.size()){
314 it.remove();
315 }
316 }
317 if(candidates.isEmpty()){
318 break;
319 }else{
320 addDependentFeatures(dependendFeatures, feature, new HashSet<>(), new ArrayList<>(allStates));
321 }
322 }else if (feature.isSupportsQuantitativeData()){
323 Iterator<KeyTaxon> it = candidates.iterator();
324 while (it.hasNext()){
325 BigDecimal min = BigDecimalUtil.MAX_BIGDECIMAL;
326 BigDecimal max = BigDecimalUtil.MIN_BIGDECIMAL;
327 Set<QuantitativeData> qds = it.next().quantitativeData.get(feature);
328 qds = qds == null? new HashSet<>(): qds;
329 for (QuantitativeData qd : qds){
330 BigDecimal qdMin = qd.getOverallMin();
331 if(qdMin != null){
332 min = min.min(qdMin);
333 }
334 BigDecimal qdMax = qd.getOverallMax();
335 if(qdMax != null){
336 max = max.max(qdMax);
337 }
338 }
339 boolean staysCandidate = true;
340 for(KeyTaxon taxon : taxaCovered){
341 Set<QuantitativeData> tqds = taxon.quantitativeData.get(feature);
342 tqds = tqds == null? new HashSet<>(): tqds;
343 for (QuantitativeData qd : tqds){
344 staysCandidate &= qd.getOverallMin() == null || qd.getOverallMin().compareTo(min) >= 0;
345 staysCandidate &= qd.getOverallMax() == null || qd.getOverallMax().compareTo(max) <= 0;
346 }
347 if (!staysCandidate){
348 break;
349 }
350 }
351 if (!staysCandidate){
352 it.remove();
353 }
354 }
355 }
356 }
357 if(config.isUseDependencies() && !dependendFeatures.isEmpty() && !candidates.isEmpty()){
358 Set<KeyTaxon> dependetCandidates = getAllBranchesTaxa(dependendFeatures, taxaCovered, featureStatesFilter);
359 candidates.retainAll(dependetCandidates);
360 }
361 return candidates;
362 }
363
364 /**
365 * Creates a leaf. It adds the taxa the parent taxon as linked taxa. Handles a
366 * list of multiple taxa and handles "specimen taxa" (not yet fully implemented)
367 * @return taxa which exist in ALL sub-branches and therefore can be linked on higher level
368 */
369 private Set<KeyTaxon> handleLeaf(PolytomousKeyNode parent, Set<KeyTaxon> taxaCovered) {
370 KeyStatement parentStatement = parent.getStatement();
371 for(KeyTaxon taxon: taxaCovered){
372 if (taxon.taxon != null){
373 parent.setOrAddTaxon(taxon.taxon);
374 }else{
375 //FIXME handle other descriptions like specimen descriptions better
376 if (parentStatement!=null){
377 String statementString = parentStatement.getLabelText(Language.DEFAULT());
378 if (statementString !=null && taxon.specimen != null){
379 String label = statementString + " --> " + taxon.specimen.getTitleCache();
380 parentStatement.putLabel(Language.DEFAULT(), label);
381 }
382 }
383 }
384 }
385 return taxaCovered;
386 }
387
388 /**
389 * "categorical features" may present several different states/branches,
390 * each one of these might correspond to one child.
391 */
392 private void handleCategorialFeature(PolytomousKeyNode parent, List<Feature> featuresLeft,
393 Set<KeyTaxon> taxaCovered,
394 Feature winnerFeature,
395 Map<Feature, Set<DefinedTermBase<?>>> featureStatesFilter) {
396
397 Map<Set<KeyTaxon>,Boolean> reuseWinner = new HashMap<>();
398
399 Set<DefinedTermBase<?>> allStates = getAllStates(winnerFeature, taxaCovered, featureStatesFilter.get(winnerFeature));
400 // a map is created, the key being the set of taxa that present the state(s) stored in the corresponding value
401 // this key represents a single branch in the decision tree
402 Map<Set<KeyTaxon>, List<DefinedTermBase<?>>> taxonStatesMap
403 = determineCategoricalStates(allStates, winnerFeature, taxaCovered, featureStatesFilter.get(winnerFeature));
404
405 if (taxonStatesMap.size()<=1){
406 if (notEmpty(featureDependencies.get(winnerFeature))){
407 //TODO is empty list correctly handled here? Seems to happen if incorrect data (e.g. only Max values) for quantdata exist
408 List<DefinedTermBase<?>> stateList = taxonStatesMap.isEmpty()? new ArrayList<>(): taxonStatesMap.values().iterator().next();
409 Set<Feature> featuresAdded = new HashSet<>();
410 addDependentFeatures(featuresLeft, winnerFeature, featuresAdded, stateList);
411 featuresLeft.remove(winnerFeature);
412 buildBranches(parent, featuresLeft, taxaCovered, featureStatesFilter);
413 removeAddedDependendFeatures(featuresLeft, featuresAdded);
414 }else{
415 //if only 1 branch is left we can handle this as a leaf, no matter how many taxa are left
416 handleLeaf(parent, taxaCovered);
417 }
418 }else {
419 // if the merge option is ON, branches with the same discriminative power will be merged (see Vignes & Lebbes, 1989)
420 if (config.isMerge()){
421 taxonStatesMap = handleMerge(taxaCovered, winnerFeature, reuseWinner,
422 taxonStatesMap, allStates, featureStatesFilter.get(winnerFeature));
423 }
424 List<Set<KeyTaxon>> sortedKeys = sortKeys(taxonStatesMap);
425 for (Set<KeyTaxon> newTaxaCovered : sortedKeys){
426 //handle each branch
427 handleCategoricalBranch(parent, featuresLeft,
428 taxaCovered.size(), winnerFeature, reuseWinner, taxonStatesMap, newTaxaCovered, featureStatesFilter);
429 }
430 }
431 return;
432 }
433
434 private Map<Set<KeyTaxon>, List<DefinedTermBase<?>>> handleMerge(Set<KeyTaxon> taxaCovered,
435 Feature winnerFeature, Map<Set<KeyTaxon>, Boolean> reuseWinner,
436 Map<Set<KeyTaxon>, List<DefinedTermBase<?>>> taxonStatesMap, Set<DefinedTermBase<?>> allStates, Set<DefinedTermBase<?>> filter) {
437
438 // creates a map between the different states of the winnerFeature and the sets of states "incompatible" with them
439 Map<DefinedTermBase<?>,Set<DefinedTermBase<?>>> exclusions = new HashMap<>();
440 computeExclusions(winnerFeature, taxaCovered, exclusions, filter);
441
442 while (!exclusions.isEmpty()){
443 // looks for the largest clique, i.e. the state with less exclusions
444 List<DefinedTermBase<?>> clique = returnBestClique(exclusions);
445 if(clique.containsAll(allStates)){
446 continue;
447 }
448 // then merges the corresponding branches
449 mergeBranches(clique, taxonStatesMap, reuseWinner, filter);
450 }
451 //during merge the keySet (set of taxa) may change, therefore they change their hashcode
452 //and can not be used as keys in the map anymore.
453 //Therefore we refill the map.
454 taxonStatesMap = renewTaxonStatesMap(taxonStatesMap);
455 return taxonStatesMap;
456 }
457
458 private void handleCategoricalBranch(PolytomousKeyNode parent, List<Feature> featuresLeft,
459 int taxaCoveredSize,
460 Feature winnerFeature, Map<Set<KeyTaxon>, Boolean> reuseWinner,
461 Map<Set<KeyTaxon>, List<DefinedTermBase<?>>> taxonStatesMap,
462 Set<KeyTaxon> newTaxaCovered,
463 Map<Feature,Set<DefinedTermBase<?>>> featureStatesFilter) {
464
465 //to restore old state
466 Set<DefinedTermBase<?>> oldFilterSet = featureStatesFilter.get(winnerFeature);
467 Set<Feature> featuresAdded = new HashSet<>();
468
469 boolean areTheTaxaDiscriminated = false;
470 PolytomousKeyNode childNode = PolytomousKeyNode.NewInstance();
471 parent.addChild(childNode);
472
473 List<DefinedTermBase<?>> listOfStates = taxonStatesMap.get(newTaxaCovered);
474 if ((newTaxaCovered.size() > 0)){ //old: if the taxa are discriminated compared to those of the parent node, a child is created
475 areTheTaxaDiscriminated = (newTaxaCovered.size() != taxaCoveredSize);
476
477 int numberOfStates = listOfStates.size()-1;
478 listOfStates.sort(stateComparator);
479
480 if (config.isUseDependencies()){
481 // old: if the dependencies are considered, removes and adds the right features from/to the list of features left
482 // these features are stored in order to be put back again when the current branch is finished
483
484 addDependentFeatures(featuresLeft, winnerFeature, featuresAdded, listOfStates);
485 }
486
487 String statementLabel = createStatement(listOfStates, numberOfStates);
488 KeyStatement statement = KeyStatement.NewInstance(statementLabel);
489 childNode.setStatement(statement);
490 parent.setFeature(winnerFeature);
491
492 if (reuseWinner.get(newTaxaCovered)== Boolean.TRUE){
493 featuresLeft.add(winnerFeature);
494 setStatesFilter(featureStatesFilter, winnerFeature, listOfStates);
495 }else{
496 featuresLeft.remove(winnerFeature);
497 }
498 }
499
500 boolean hasChildren = areTheTaxaDiscriminated && (newTaxaCovered.size() > 1);
501 if (hasChildren){
502 buildBranches(childNode, featuresLeft, newTaxaCovered, featureStatesFilter);
503 }else{
504 handleLeaf(childNode, newTaxaCovered);
505 Set<KeyTaxon> taxonChildren = getTaxonChildren(newTaxaCovered);
506 if(!taxonChildren.isEmpty()){
507 //TODO FIXME featuresLeft probably needs to include all features, similar for featureStatesFilter
508 buildBranches(childNode, featuresLeft, taxonChildren, featureStatesFilter);
509 }
510 }
511
512 //restore old state before returning to parent node
513 removeAddedDependendFeatures(featuresLeft, featuresAdded);
514 featureStatesFilter.put(winnerFeature, oldFilterSet);
515
516 return;
517 }
518
519 private Set<KeyTaxon> getTaxonChildren(Set<KeyTaxon> newTaxaCovered) {
520 Set<KeyTaxon> result = new HashSet<>();
521 for (KeyTaxon taxon:newTaxaCovered){
522 result.addAll(taxon.children);
523 }
524 return result;
525 }
526
527 private void setStatesFilter(Map<Feature, Set<DefinedTermBase<?>>> filter, Feature feature,
528 List<DefinedTermBase<?>> listOfStates) {
529 if (filter.get(feature)==null){
530 filter.put(feature, new HashSet<>(listOfStates));
531 }else{
532 Set<DefinedTermBase<?>> set = filter.get(feature);
533 set.retainAll(listOfStates);
534 }
535 }
536
537 private void removeAddedDependendFeatures(List<Feature> featuresLeft, Set<Feature> featuresAdded) {
538 for (Feature addedFeature : featuresAdded) {
539 featuresLeft.remove(addedFeature);
540 }
541 }
542
543 private void addDependentFeatures(List<Feature> featuresLeft, Feature baseFeature,
544 Set<Feature> featuresAdded, List<DefinedTermBase<?>> listOfStates) {
545
546 if(notEmpty(featureDependencies.get(baseFeature))){
547 Set<Feature> newFeatureCandidates = new HashSet<>(featureDependencies.get(baseFeature));
548 newFeatureCandidates.remove(null);
549 for (DefinedTermBase<?> state : listOfStates) {
550 //in-applicable
551 List<Feature> inapplicableFeatures = getApplicableFeatures(baseFeature, state, iAifDependencies);
552 newFeatureCandidates.removeAll(inapplicableFeatures);
553 //only-applicable
554 List<Feature> onlyApplicableFeatures = getApplicableFeatures(baseFeature, state, oAifDependencies);
555 if (!onlyApplicableFeatures.isEmpty()){
556 Iterator<Feature> it = newFeatureCandidates.iterator();
557 while (it.hasNext()){
558 Feature featureCandidate = it.next();
559 if (!onlyApplicableFeatures.contains(featureCandidate)){
560 it.remove();
561 }
562 }
563 }
564 }
565 featuresLeft.addAll(newFeatureCandidates);
566 featuresAdded.addAll(newFeatureCandidates);
567 }
568 }
569
570 private List<Feature> getApplicableFeatures(Feature feature, DefinedTermBase<?> state,
571 Map<FeatureState, Set<Feature>> applicabilityDependencies) {
572 List<Feature> result = new ArrayList<>();
573 for (FeatureState featureState : applicabilityDependencies.keySet()){
574 if(featureState.getFeature().equals(feature) && featureState.getState().equals(state)){
575 result.addAll(applicabilityDependencies.get(featureState));
576 }
577 }
578 return result;
579 }
580
581 private boolean notEmpty(Set<?> set) {
582 return (set != null) && !set.isEmpty();
583 }
584
585 private String createStatement(List<DefinedTermBase<?>> listOfStates, int numberOfStates) {
586 StringBuilder statementLabel = new StringBuilder();
587 for (DefinedTermBase<?> state : listOfStates) {
588 statementLabel.append(state.getLabel());
589 if (listOfStates.lastIndexOf(state)!=numberOfStates){
590 statementLabel.append(separator); // append a separator after each state except for the last one
591 }
592 }
593 return statementLabel.toString();
594 }
595
596 private Map<Set<KeyTaxon>, List<DefinedTermBase<?>>> renewTaxonStatesMap(Map<Set<KeyTaxon>, List<DefinedTermBase<?>>> taxonStatesMap) {
597 Map<Set<KeyTaxon>, List<DefinedTermBase<?>>> result = new HashMap<>();
598 for (Map.Entry<Set<KeyTaxon>, List<DefinedTermBase<?>>> entry : taxonStatesMap.entrySet()){
599 result.put(entry.getKey(), entry.getValue());
600 }
601 return result;
602 }
603
604 private List<Set<KeyTaxon>> sortKeys(Map<Set<KeyTaxon>, List<DefinedTermBase<?>>> taxonStatesMap) {
605 //for now this is a dummy sorting
606 List<Map.Entry<Set<KeyTaxon>, List<DefinedTermBase<?>>>> sortedEntries = new ArrayList<>();
607 sortedEntries.addAll(taxonStatesMap.entrySet());
608
609 sortedEntries.sort(entryComparator);
610 List<Set<KeyTaxon>> result = new ArrayList<>();
611 for (Map.Entry<Set<KeyTaxon>, List<DefinedTermBase<?>>> entry : sortedEntries){
612 result.add(entry.getKey());
613 }
614 return result;
615 }
616
617 //TODO use a general term comparator here
618 private static final Comparator<DefinedTermBase<?>> stateComparator = (a,b)-> {
619
620 //TODO use real Term Comparator
621 if (!a.getTitleCache().equals(b.getTitleCache())){
622 return a.getTitleCache().compareTo(b.getTitleCache());
623 }
624 if (a.getUuid()!= b.getUuid()){
625 return a.getUuid().compareTo(b.getUuid());
626 }
627 return 0;
628 };
629
630 private static final Comparator<? super Entry<Set<KeyTaxon>, List<DefinedTermBase<?>>>> entryComparator = (a,b)-> {
631 if (a.getKey().size()!=b.getKey().size()){
632 //order by number of taxa covered
633 return b.getKey().size() - a.getKey().size();
634 }else if (a.getValue().size()!= b.getValue().size()){
635 //order by number of states covered
636 return b.getValue().size() - a.getValue().size();
637 }else{
638 //order states alphabetically or by uuid
639 for (int i = 0; i < a.getValue().size(); i++){
640 DefinedTermBase<?> stateA = a.getValue().get(i);
641 DefinedTermBase<?> stateB = b.getValue().get(i);
642 int result = stateComparator.compare(stateA, stateB);
643 if (result != 0){
644 return result;
645 }
646 }
647 //TODO compare keys (sets of KeyTaxon)
648 // for (int i = 0; i < a.getKey().size(); i++){
649 // Object stateA = a.getKey().getUuid;
650 // State stateB = a.getKey().get(i);
651 // //TODO use real Term Comparator
652 // if (stateA.getUuid()!= stateB.getUuid()){
653 // return stateA.getUuid().compareTo(stateB.getUuid());
654 // }
655 // }
656 throw new RuntimeException("Compare for same state lists with different unit descriptions not yet implemented");
657 }
658 };
659
660 private Set<DefinedTermBase<?>> getAllStates(Feature feature, Set<KeyTaxon> taxaCovered, Set<DefinedTermBase<?>> filter) {
661 //TODO handle modifier
662 Set<DefinedTermBase<?>> states = new HashSet<>();
663 for (KeyTaxon taxon : taxaCovered){
664 Set<CategoricalData> cdSet = taxon.getCategoricalData(feature);
665 for (CategoricalData cd : cdSet){
666 List<StateData> stateDatas = cd.getStateData();
667 for (StateData sd : stateDatas){
668 DefinedTermBase<?> state = sd.getState();
669 if (filter != null && !filter.contains(state)) {
670 continue;
671 }
672 states.add(state);
673 }
674 }
675 }
676 return states;
677 }
678
679 private void handleQuantitativeData(PolytomousKeyNode parent, List<Feature> featuresLeft,
680 Set<KeyTaxon> taxaCovered, Map<Feature, BigDecimal> quantitativeFeaturesThresholds,
681 Feature winnerFeature, Map<Feature, Set<DefinedTermBase<?>>> featureStatesFilter) {
682
683 // first, get the threshold
684 BigDecimal threshold = quantitativeFeaturesThresholds.get(winnerFeature);
685 //TODO unit not seems to be used yet
686 StringBuilder unit = new StringBuilder();
687 // then determine which taxa are before and which are after this threshold (dichotomy)
688 //in order to create the children of the parent node
689 List<Set<KeyTaxon>> quantitativeStates = determineQuantitativeStates(threshold, winnerFeature, taxaCovered, unit);
690 // thus the list contains two sets of KeyTaxon, the first corresponding to
691 // those before, the second to those after the threshold
692 for (int i=0; i<2; i++) {
693 handleQuantitativeBranch(parent, featuresLeft, taxaCovered.size(), winnerFeature, threshold, unit,
694 quantitativeStates, featureStatesFilter, i);
695 }
696
697 return;
698 }
699
700 /**
701 * Creates the branch for a quantitative feature.
702 * TODO if the quantitative feature has dependent features they are not yet handled
703 */
704 private void handleQuantitativeBranch(PolytomousKeyNode parent, List<Feature> featuresLeft,
705 int parentTaxaCoveredSize, Feature winnerFeature, BigDecimal threshold, StringBuilder unit,
706 List<Set<KeyTaxon>> quantitativeStates, Map<Feature, Set<DefinedTermBase<?>>> featureStatesFilter,
707 int brunchNum) {
708
709 String sign;
710 Set<KeyTaxon> newTaxaCovered = quantitativeStates.get(brunchNum);
711 if (brunchNum==0){
712 sign = before; // the first element of the list corresponds to taxa before the threshold
713 } else {
714 sign = after; // the second to those after
715 }
716 if (newTaxaCovered.size()>0){
717 PolytomousKeyNode childNode = PolytomousKeyNode.NewInstance();
718 parent.setFeature(winnerFeature);
719 KeyStatement statement = KeyStatement.NewInstance( " " + sign + " " + threshold + unit);
720 childNode.setStatement(statement);
721 parent.addChild(childNode);
722 //TODO don't we need to check dependent features, they are not very likely for quantitative features, but still might exist as exception ...
723 boolean taxaAreDiscriminatedInThisStep = newTaxaCovered.size() < parentTaxaCoveredSize;
724 boolean childrenExist = taxaAreDiscriminatedInThisStep && (newTaxaCovered.size() > 1);
725 if (childrenExist){
726 buildBranches(childNode, featuresLeft, newTaxaCovered, featureStatesFilter);
727 }else{
728 handleLeaf(childNode, newTaxaCovered);
729 }
730 }else{
731 //TODO do we need to check the 0 case, can this happen at all, shouldn't we throw a warning instead?
732 throw new RuntimeException("No taxa left on branch. This should probably not happen.");
733 }
734 return;
735 }
736
737 private Feature computeScores(List<Feature> featuresLeft, Set<KeyTaxon> taxaCovered,
738 Map<Feature,BigDecimal> quantitativeFeaturesThresholds, Map<Feature, Set<DefinedTermBase<?>>> featureStatesFilter) {
739
740 Map<Feature,Float> scoreMap = featureScores(featuresLeft, taxaCovered, quantitativeFeaturesThresholds, featureStatesFilter);
741 dependenciesScores(scoreMap, featuresLeft, taxaCovered, quantitativeFeaturesThresholds, featureStatesFilter);
742 // the feature with the best score becomes the one corresponding to the current node
743 Feature winnerFeature = lessStatesWinner(scoreMap, taxaCovered);
744 // the feature is removed from the list of features available to build the next level of the tree
745 featuresLeft.remove(winnerFeature);
746 if (config.isDebug()){System.out.println(" ScoreMap: " + scoreMap);}
747 if (config.isDebug()){System.out.println(" quantitativeThreshold: " + quantitativeFeaturesThresholds);}
748 if (config.isDebug()){System.out.println(" winner: " + winnerFeature);}
749 return winnerFeature;
750 }
751
752 /**
753 * Dependencies can be seen as a hierarchy.
754 * If the dependencies are on, this function calculates the score of all children features and give the highest score
755 * of these to their parent (of course, if its score is lower). This way, if a feature has a good score but is
756 * "onlyApplicableIf" or "InapplicableIf", the feature it depends can be chosen in order to build a better key.
757 */
758 private void dependenciesScores(Map<Feature,Float> scoreMap, List<Feature> featuresLeft,
759 Set<KeyTaxon> coveredTaxa, Map<Feature,BigDecimal> quantitativeFeaturesThresholds, Map<Feature, Set<DefinedTermBase<?>>> featureStatesFilter){
760
761 //TODO maybe we need to do this recursive?
762
763 //old: but I don't understand why we should include the existing featureList, this is only necessary
764 //if the single cores depend on the set of features
765 //List<Feature> pseudoFeaturesLeft = new ArrayList<>(featuresLeft); //copies the list of features
766
767 // then adds all the features which depend on features contained in the list
768 List<Feature> pseudoFeatures = new ArrayList<>();
769 for (Feature feature : featuresLeft){
770 if (featureDependencies.containsKey(feature)){
771 for (Feature dependendFeature : featureDependencies.get(feature)){
772 if (!pseudoFeatures.contains(dependendFeature)){
773 pseudoFeatures.add(dependendFeature);
774 }
775 }
776 }
777 }
778
779 if (!pseudoFeatures.isEmpty()){
780 // then calculates the scores of all features that have been added
781 Map<Feature,Float> newScoreMap = featureScores(pseudoFeatures, coveredTaxa, quantitativeFeaturesThresholds, featureStatesFilter);
782 for (Feature parentFeature : featureDependencies.keySet()){
783 if (scoreMap.containsKey(parentFeature)){
784 for (Feature dependendFeature : featureDependencies.get(parentFeature)){
785 if (newScoreMap.containsKey(dependendFeature)) {
786 // if a feature has a better than the "parent" feature it depends on
787 if (newScoreMap.get(dependendFeature) > scoreMap.get(parentFeature)){
788 // the "parent" feature gets its score in the original score map
789 scoreMap.put(parentFeature, newScoreMap.get(dependendFeature));
790 }
791 }
792 }
793 }
794 }
795 }
796 }
797
798
799 /**
800 * This function merges branches of the key belonging to the same clique
801 * (http://en.wikipedia.org/wiki/Clique_%28graph_theory%29)
802 *
803 * @param clique the list of States linked together (i.e. if merged have the same score)
804 * @param taxonStatesMap the map between the taxa (keys) and the states (keys) leading to them
805 * @param reuseWinner
806 * @param filter
807 * @return <code>true</code>, if all taxa covered by the new branch include all states of the clique.
808 * <code>false</code> otherwise.
809 */
810 private void mergeBranches(List<DefinedTermBase<?>> clique, Map<Set<KeyTaxon>, List<DefinedTermBase<?>>> taxonStatesMap,
811 Map<Set<KeyTaxon>, Boolean> reuseWinner, Set<DefinedTermBase<?>> filter){
812
813 boolean isExact = true;
814 if (clique.size()<=1){
815 return;
816 }
817 Map.Entry<Set<KeyTaxon>,List<DefinedTermBase<?>>> firstBranch = null;
818 List<Set<KeyTaxon>> tdToDelete = new ArrayList<>();
819 //TODO is the stateFilter needed here somehow?
820 for (Map.Entry<Set<KeyTaxon>, List<DefinedTermBase<?>>> branch : taxonStatesMap.entrySet()){
821 boolean stateFound = false;
822 // looks for one state of the clique in this branch
823 for(DefinedTermBase<?> state : clique){
824 if (branch.getValue().contains(state)) {
825 stateFound = true;
826 break;
827 }
828 }
829 // if one state is found...
830 if (stateFound == true){
831 // ...for the first time, the branch becomes the one to which the others will be merged
832 if (firstBranch == null){
833 firstBranch = branch;
834 }
835 // ... else the branch is merged to the first one.
836 else {
837 int oldSize = firstBranch.getKey().size();
838 firstBranch.getKey().addAll(branch.getKey());
839 int newSize = firstBranch.getKey().size();
840 if (oldSize != newSize || newSize != branch.getKey().size()){
841 isExact = false;
842 }
843 firstBranch.getValue().addAll(branch.getValue());
844 tdToDelete.add(branch.getKey());
845 }
846 }
847 }
848 // once this is done, the branches merged to the first one are deleted
849 for (Set<KeyTaxon> td : tdToDelete){
850 taxonStatesMap.remove(td);
851 }
852 if (!isExact && firstBranch != null){
853 reuseWinner.put(firstBranch.getKey(), Boolean.TRUE);
854 }
855 }
856
857 /**
858 * This function looks for the largest clique of States
859 * (http://en.wikipedia.org/wiki/Clique_%28graph_theory%29)
860 * from the map of exclusions.
861 *
862 * @param exclusions map a state (key) to the set of states (value) which can not be merged with it without decreasing its score.
863 * @return
864 */
865 private List<DefinedTermBase<?>> returnBestClique (Map<DefinedTermBase<?>,Set<DefinedTermBase<?>>> exclusions){
866 int bestNumberOfExclusions=-1;
867 int numberOfExclusions;
868 List<DefinedTermBase<?>> clique = new ArrayList<>();
869
870 // looks for the largest clique, i.e. the state with less exclusions
871 DefinedTermBase<?> bestState=null;
872 for (Map.Entry<DefinedTermBase<?>,Set<DefinedTermBase<?>>> pair : exclusions.entrySet()){
873 numberOfExclusions = pair.getValue().size();
874 if ((bestNumberOfExclusions == -1) || numberOfExclusions < bestNumberOfExclusions) {
875 bestNumberOfExclusions = numberOfExclusions;
876 bestState = pair.getKey();
877 }
878 }
879
880 clique.add(bestState);
881 exclusions.remove(bestState);
882
883 boolean isNotExcluded;
884 // then starts building the clique by adding the states which do not exclude each other
885 for (Map.Entry<DefinedTermBase<?>,Set<DefinedTermBase<?>>> pair : exclusions.entrySet()){
886 isNotExcluded = true;
887 for (DefinedTermBase<?> state : clique) {
888 if (pair.getValue().contains(state)) {
889 isNotExcluded = false;
890 }
891 }
892 if (isNotExcluded){
893 clique.add(pair.getKey());
894 }
895 }
896 for (DefinedTermBase<?> state : clique) {
897 exclusions.remove(state);
898 }
899 return clique;
900 }
901
902
903 /**
904 * fills a map of the sets of taxa (key) presenting the different states (value) for the given feature.
905 *
906 * @param statesDone the list of states already done for this feature
907 * @param states2 the element from which the states are extracted
908 * @param feature the feature corresponding to the CategoricalData
909 * @param taxaCovered the base of taxa considered
910 * @param featureStatesFilter
911 * @return
912 */
913 private Map<Set<KeyTaxon>,List<DefinedTermBase<?>>> determineCategoricalStates(
914 Set<DefinedTermBase<?>> states, Feature feature, Set<KeyTaxon> taxaCovered, Set<DefinedTermBase<?>> filter){
915
916 Map<Set<KeyTaxon>, List<DefinedTermBase<?>>> childrenStatesMap = new HashMap<>();
917 //TODO needed
918 List<DefinedTermBase<?>> statesDone = new ArrayList<>(); // the list of states already used
919
920 for (DefinedTermBase<?> state : states){ // for each state
921 if (filter != null && !filter.contains(state)){
922 continue;
923 }
924 statesDone.add(state);
925 Set<KeyTaxon> newTaxaCovered = taxaByFeatureState(feature, state, taxaCovered); //gets which taxa present this state
926 List<DefinedTermBase<?>> statesOfTaxa = childrenStatesMap.get(newTaxaCovered);
927 if (statesOfTaxa == null) { // if no states are associated to these taxa, create a new list
928 statesOfTaxa = new ArrayList<>();
929 childrenStatesMap.put(newTaxaCovered, statesOfTaxa);
930 }
931 statesOfTaxa.add(state); // then add the state to the list
932 }
933 return childrenStatesMap;
934 }
935
936
937 /**
938 * Returns the list of taxa from previously covered taxa, which have the state featureState for the given feature
939 */
940 private Set<KeyTaxon> taxaByFeatureState(Feature feature, DefinedTermBase<?> featureState, Set<KeyTaxon> taxaCovered){
941 Set<KeyTaxon> newCoveredTaxa = new HashSet<>();
942 for (KeyTaxon td : taxaCovered){
943 for (CategoricalData cd : td.getCategoricalData(feature)){
944 List<StateData> stateDatas = cd.getStateData();
945 for (StateData sd : stateDatas) {
946 if (sd.getState().equals(featureState)){
947 newCoveredTaxa.add(td);
948 }
949 }
950 }
951 }
952 return newCoveredTaxa;
953 }
954
955 /**
956 * This function returns the feature with the highest score. However, if several features have the same score
957 * the one which leads to less options is chosen (this way, the key is easier to read).
958 */
959 private Feature lessStatesWinner(Map<Feature,Float> scores, Set<KeyTaxon> taxaCovered){
960 int nTaxa = taxaCovered.size();
961 if (nTaxa==1) {
962 return null;
963 }
964 float meanScore = defaultMeanScore(nTaxa);
965 float bestScore = nTaxa*nTaxa;
966 List<Feature> bestFeatures = new ArrayList<>(); // if ever different features have the best score, they are put in this list
967 Feature bestFeature = null;
968 float newScore;
969 for (Map.Entry<Feature,Float> pair : scores.entrySet()){
970 if (pair.getValue()!=null){
971 newScore = Math.abs(pair.getValue()-meanScore);// the best score is the closest to the score (meanScore here)
972 // a feature would have if it divided the taxa in two equal parts
973 if (newScore < bestScore){
974 bestFeatures.clear();
975 bestFeatures.add(pair.getKey());
976 bestScore = newScore;
977 }else if (newScore==bestScore){
978 bestFeatures.add(pair.getKey());
979 }
980 }
981 }
982 if (bestFeatures.size()==1) { // if there is only one feature with the best score, pick it
983 return bestFeatures.get(0);
984 }
985 else { // else choose the one with less states
986 int lessStates = -1;
987 int numberOfDifferentStates=-1;
988 for (Feature feature : bestFeatures){
989 if (feature.isSupportsCategoricalData()){
990 Set<DefinedTermBase<?>> differentStates = new HashSet<>();
991 for (KeyTaxon taxon : taxaCovered){
992 Set<CategoricalData> cds = taxon.getCategoricalData(feature);
993 Set<StateData> allStateData = getStateData(cds);
994 for (StateData sd : allStateData) {
995 differentStates.add(sd.getState());
996 }
997 }
998 numberOfDifferentStates=differentStates.size();
999 }else if (feature.isSupportsQuantitativeData()){
1000 numberOfDifferentStates=2;
1001 }
1002 if (lessStates == -1 || numberOfDifferentStates < lessStates){
1003 lessStates = numberOfDifferentStates;
1004 bestFeature = feature;
1005 }
1006 }
1007 return bestFeature;
1008 }
1009 }
1010
1011 /**
1012 * This function calculates the mean score, i.e. the score a feature dividing the taxa in two equal parts
1013 * would have.
1014 */
1015 private float defaultMeanScore(int nTaxa){
1016 int i;
1017 float score=0;
1018 for (i=1;i<nTaxa;i++){
1019 score = score + Math.round(i+1/2);
1020 }
1021 return score;
1022 }
1023
1024 /**
1025 * This function fills the map of features (keys) with their respecting scores (values)
1026 */
1027 private Map<Feature,Float> featureScores(List<Feature> featuresLeft, Set<KeyTaxon> coveredTaxa,
1028 Map<Feature,BigDecimal> quantitativeFeaturesThresholds, Map<Feature, Set<DefinedTermBase<?>>> featureStatesFilter){
1029
1030 Map<Feature,Float> scoreMap = new HashMap<>();
1031 for (Feature feature : featuresLeft){
1032 if (feature.isSupportsCategoricalData()) {
1033 scoreMap.put(feature, categoricalFeatureScore(feature, coveredTaxa, featureStatesFilter.get(feature)));
1034 }
1035 if (feature.isSupportsQuantitativeData()){
1036 scoreMap.put(feature, quantitativeFeatureScore(feature, coveredTaxa, quantitativeFeaturesThresholds));
1037 }
1038 }
1039 return scoreMap;
1040 }
1041
1042 /**
1043 * Since Quantitative features do not have states, unlike Categorical ones, this function determines which taxa,
1044 * for a given quantitative feature, present either a lower or higher value than a given threshold.
1045 * It returns two Sets of {@link KeyTaxon}, one with the taxa under this threshold (taxaBefore) and another one
1046 * with the taxa over (taxaAfter).
1047 */
1048 private List<Set<KeyTaxon>> determineQuantitativeStates (BigDecimal threshold, Feature feature,
1049 Set<KeyTaxon> taxa, StringBuilder unit){
1050
1051 List<Set<KeyTaxon>> list = new ArrayList<>();
1052 Set<KeyTaxon> taxaBefore = new HashSet<>();
1053 Set<KeyTaxon> taxaAfter = new HashSet<>();
1054 list.add(taxaBefore);
1055 list.add(taxaAfter);
1056 for (KeyTaxon td : taxa){
1057 for (QuantitativeData qd : td.getQuantitativeData(feature)){
1058 if (unit.toString().equals("") && qd.getUnit()!=null && qd.getUnit().getLabel()!=null){
1059 unit.append(" " + qd.getUnit().getLabel());
1060 }
1061 Set<StatisticalMeasurementValue> values = qd.getStatisticalValues();
1062 for (StatisticalMeasurementValue smv : values){
1063 StatisticalMeasure type = smv.getType();
1064 //TODO DONT FORGET sample size, MEAN etc
1065 if (type.isMax() || type.isTypicalUpperBoundary() || type.isAverage() || type.isExactValue()) {
1066 if (smv.getValue().compareTo(threshold) > 0){
1067 taxaAfter.add(td);
1068 }
1069 }
1070 if (type.isMin() || type.isTypicalLowerBoundary() || type.isAverage() || type.isExactValue()) {
1071 if (smv.getValue().compareTo(threshold) <= 0){
1072 taxaBefore.add(td);
1073 }
1074 }
1075 }
1076 }
1077 }
1078 return list;
1079 }
1080
1081 /**
1082 * This function returns the score of a quantitative feature.
1083 */
1084 private float quantitativeFeatureScore(Feature feature, Set<KeyTaxon> coveredTaxa,
1085 Map<Feature,BigDecimal> quantitativeFeaturesThresholds){
1086
1087 List<BigDecimal> allValues = new ArrayList<>();
1088 for (KeyTaxon td : coveredTaxa){
1089 for (QuantitativeData qd : td.getQuantitativeData(feature)){
1090 computeAllValues(allValues, qd);
1091 }
1092 }
1093 int i,j;
1094 BigDecimal threshold = BigDecimal.ZERO;
1095 BigDecimal bestThreshold = BigDecimal.ZERO;
1096 int difference = allValues.size();
1097 int differenceMin = difference;
1098 int bestTaxaBefore = 0;
1099 int bestTaxaAfter = 0;
1100 for (i=0; i<allValues.size()/2; i++) {
1101 threshold = allValues.get(i*2+1);
1102 int taxaBefore=0;
1103 int taxaAfter=0;
1104 for (j=0;j<allValues.size()/2;j++) {
1105 if (allValues.get(j*2+1).compareTo(threshold) <= 0){
1106 taxaBefore++;
1107 }
1108 if (allValues.get(j*2).compareTo(threshold) > 0){
1109 taxaAfter++;
1110 }
1111 }
1112 difference = Math.abs(taxaBefore-taxaAfter);
1113 if (difference < differenceMin){
1114 differenceMin = difference;
1115 bestThreshold = threshold;
1116 bestTaxaBefore = taxaBefore;
1117 bestTaxaAfter = taxaAfter;
1118 }
1119 }
1120 quantitativeFeaturesThresholds.put(feature, bestThreshold);
1121 int defaultQuantitativeScore=0;
1122 for (i=0; i<bestTaxaBefore; i++) {
1123 defaultQuantitativeScore += bestTaxaAfter;
1124 }
1125 return defaultQuantitativeScore;
1126 }
1127
1128 private void computeAllValues(List<BigDecimal> allValues, QuantitativeData qd) {
1129 Set<StatisticalMeasurementValue> values = qd.getStatisticalValues();
1130 BigDecimal lowerboundary = null;
1131 BigDecimal upperboundary = null;
1132 for (StatisticalMeasurementValue smv : values){
1133 StatisticalMeasure type = smv.getType();
1134 BigDecimal value = smv.getValue();
1135 // DONT FORGET sample size, MEAN etc
1136 if(type.isMin() || type.isTypicalLowerBoundary() || type.isAverage() || type.isExactValue()){
1137 if (lowerboundary == null){
1138 lowerboundary = value;
1139 }else{
1140 lowerboundary = lowerboundary.min(value);
1141 }
1142 }
1143 if(type.isMax() || type.isTypicalUpperBoundary() || type.isAverage() || type.isExactValue()){
1144 if (upperboundary == null){
1145 upperboundary = value;
1146 }else{
1147 upperboundary = upperboundary.max(value);
1148 }
1149 }
1150
1151 // if (type.isMax()) {
1152 // upperboundary = smv.getValue();
1153 // upperboundarypresent=true;
1154 // }else if (type.equals(StatisticalMeasure.MIN())) {
1155 // lowerboundary = smv.getValue();
1156 // lowerboundarypresent=true;
1157 // }else if (type.equals(StatisticalMeasure.TYPICAL_UPPER_BOUNDARY()) && upperboundarypresent==false) {
1158 // upperboundary = smv.getValue();
1159 // upperboundarypresent=true;
1160 // }else if (type.equals(StatisticalMeasure.TYPICAL_LOWER_BOUNDARY()) && lowerboundarypresent==false) {
1161 // lowerboundary = smv.getValue();
1162 // lowerboundarypresent=true;
1163 // }else if (type.equals(StatisticalMeasure.AVERAGE()) && upperboundarypresent==false && lowerboundarypresent==false) {
1164 // lowerboundary = smv.getValue();
1165 // upperboundary = lowerboundary;
1166 // lowerboundarypresent=true;
1167 // upperboundarypresent=true;
1168 // }
1169 }
1170 if (lowerboundary != null && upperboundary != null) {
1171 allValues.add(lowerboundary);
1172 allValues.add(upperboundary);
1173 }else if(lowerboundary != null || upperboundary != null){
1174 logger.warn("Only one of upper or lower boundary is defined. Statistical measurement value not used.");
1175 }
1176 }
1177
1178 /**
1179 * This function returns the score of a categorical feature
1180 * by comparing each taxon with each other. If the feature
1181 * discriminates a single pair of taxa the score is increased.
1182 */
1183 private float categoricalFeatureScore(Feature feature, Set<KeyTaxon> coveredTaxa, Set<DefinedTermBase<?>> filter){
1184 int i,j;
1185 float score =0;
1186 float power=0;
1187 KeyTaxon[] coveredTaxaArray = coveredTaxa.toArray(new KeyTaxon[coveredTaxa.size()]); // I did not figure a better way to do this
1188 for (i=0 ; i<coveredTaxaArray.length; i++){
1189 Set<CategoricalData> cd1 = coveredTaxaArray[i].getCategoricalData(feature);
1190 for (j=i+1 ; j< coveredTaxaArray.length ; j++){
1191 Set<CategoricalData> cd2 = coveredTaxaArray[j].getCategoricalData(feature);
1192 power = defaultCategoricalPower(cd1, cd2, filter);
1193 score = score + power;
1194 }
1195 }
1196 return score;
1197 }
1198
1199 /**
1200 * This recursive function fills the maps of dependencies by reading the tree containing the dependencies.
1201 */
1202 private void createDependencies(TermNode<Feature> node){
1203
1204 //the featureDependencies handling was originally in defaultCategoricalPower(cat, cat)
1205 //needs to be checked if it is OK to handle them here
1206 if (node.isDependent()){
1207 Feature parentFeature = node.getParent().getTerm();
1208 if (!featureDependencies.containsKey(parentFeature)){
1209 featureDependencies.put(parentFeature, new HashSet<>());
1210 }
1211 for (FeatureState featureState : node.getOnlyApplicableIf()){
1212 if (!oAifDependencies.containsKey(featureState)) {
1213 oAifDependencies.put(featureState, new HashSet<>());
1214 }
1215 oAifDependencies.get(featureState).add(node.getTerm());
1216 //TODO: we only guess that it is the state of the parent feature here
1217 //needs to be improved
1218 featureDependencies.get(node.getParent().getTerm()).add(node.getTerm());
1219 }
1220 for (FeatureState featureState : node.getInapplicableIf()){
1221 if (!iAifDependencies.containsKey(featureState)) {
1222 iAifDependencies.put(featureState, new HashSet<>());
1223 }
1224 iAifDependencies.get(featureState).add(node.getTerm());
1225 //TODO: we only guess that it is the state of the parent feature here
1226 //needs to be improved
1227 featureDependencies.get(node.getParent().getTerm()).add(node.getTerm());
1228 }
1229 }
1230
1231 for (TermNode<Feature> fn : node.getChildNodes()){
1232 createDependencies(fn);
1233 }
1234 // System.out.println(featureDependencies);
1235 }
1236
1237 /**
1238 * This function fills the exclusions map.
1239 */
1240 private float computeExclusions(Feature feature, Set<KeyTaxon> coveredTaxa, Map<DefinedTermBase<?>,Set<DefinedTermBase<?>>> exclusions, Set<DefinedTermBase<?>> filter){
1241 //unclear what the score is fore here
1242 float score =0;
1243 float power=0;
1244 KeyTaxon[] fixedOrderTaxa = coveredTaxa.toArray(new KeyTaxon[coveredTaxa.size()]); // I did not figure a better way to do this
1245 for (int i=0 ; i<fixedOrderTaxa.length; i++){
1246 Set<CategoricalData> cd1 = fixedOrderTaxa[i].getCategoricalData(feature);
1247
1248 for (int j=i+1 ; j< fixedOrderTaxa.length ; j++){
1249 Set<CategoricalData> cd2 = fixedOrderTaxa[j].getCategoricalData(feature);
1250
1251 // System.out.println(deb1 + "; " +deb2);
1252 power = defaultCategoricalPower(cd1, cd2, filter);
1253 score = score + power;
1254 if (power >= 1.0){ // if there is no state in common between deb1 and deb2
1255
1256 for (StateData statedata1 : getStateData(cd1)){
1257 DefinedTermBase<?> state1 = statedata1.getState();
1258 if (!exclusions.containsKey(state1)){
1259 exclusions.put(state1, new HashSet<>());
1260 }
1261 for (StateData statedata2 : getStateData(cd2)){
1262 DefinedTermBase<?> state2 = statedata2.getState();
1263 if (!exclusions.containsKey(state2)){
1264 exclusions.put(state2, new HashSet<>());
1265 }
1266 exclusions.get(state1).add(state2);
1267 exclusions.get(state2).add(state1);
1268 }
1269 }
1270 }
1271 }
1272 }
1273 return score;
1274 }
1275
1276 private Set<StateData> getStateData(Set<CategoricalData> cds) {
1277 Set<StateData> result = new HashSet<>();
1278 for (CategoricalData cd : cds){
1279 result.addAll(cd.getStateData());
1280 }
1281 return result;
1282 }
1283
1284 /**
1285 * Returns the score of a categorical feature.
1286 */
1287 private float defaultCategoricalPower(Set<CategoricalData> cd1, Set<CategoricalData> cd2, Set<DefinedTermBase<?>> filter){
1288 if (cd1 == null || cd2 == null ||cd1.isEmpty() || cd2.isEmpty()){
1289 return 0;
1290 }
1291
1292 //FIXME see defaultCategoricalPower_old for additional code on dependencies
1293 //which has been removed here for now but might be important
1294 //Now I moved it to #createDependencies. Therefore the below is maybe not needed
1295 //anymore but superfluent.
1296 //But the implementation at createDependencies is not fully correct yet
1297 //so I keep it here for now.
1298
1299 for (CategoricalData cd : cd1){
1300 if (!featureDependencies.containsKey(cd.getFeature())){
1301 featureDependencies.put(cd.getFeature(), new HashSet<>());
1302 }
1303 for (DefinedTermBase<?> state : getStates(cd, filter)){
1304 if (iAifDependencies.get(state)!=null) {
1305 featureDependencies.get(cd.getFeature()).addAll(iAifDependencies.get(state));
1306 }
1307 if (oAifDependencies.get(state)!=null) {
1308 featureDependencies.get(cd.getFeature()).addAll(oAifDependencies.get(state));
1309 }
1310 }
1311 }
1312
1313 //get all states of both categorical data
1314 Set<DefinedTermBase<?>> states = getStates(cd1, cd2, filter);
1315 if (states.size() == 0){
1316 return 0;
1317 }
1318
1319 int nDiscriminative = 0;
1320 for (DefinedTermBase<?> state : states){
1321 boolean hasState1 = hasState(state, cd1);
1322 boolean hasState2 = hasState(state, cd2);
1323 //if only 1 has the state than the state is discriminative
1324 if (! (hasState1&&hasState2)) {
1325 nDiscriminative++;
1326 }
1327 }
1328 float result = (float)nDiscriminative/states.size();
1329 return result;
1330 }
1331
1332 private boolean hasState(DefinedTermBase<?> state, Set<CategoricalData> cds) {
1333 boolean result = false;
1334 for (CategoricalData cd : cds){
1335 for (StateData stateData:cd.getStateData()){
1336 result |= state.equals(stateData.getState());
1337 }
1338 }
1339 return result;
1340 }
1341
1342 private Set<DefinedTermBase<?>> getStates(Set<CategoricalData> cdset1, Set<CategoricalData> cdset2, Set<DefinedTermBase<?>> filter) {
1343 Set<DefinedTermBase<?>> result = new HashSet<>();
1344 result.addAll(getStates(cdset1, filter));
1345 result.addAll(getStates(cdset2, filter));
1346 return result;
1347 }
1348
1349 private Set<DefinedTermBase<?>> getStates(Set<CategoricalData> cdset, Set<DefinedTermBase<?>> filter) {
1350 Set<DefinedTermBase<?>> result = new HashSet<>();
1351 for (CategoricalData cd : cdset){
1352 result.addAll(getStates(cd, filter));
1353 }
1354 return result;
1355 }
1356
1357 private Set<DefinedTermBase<?>> getStates(CategoricalData cd, Set<DefinedTermBase<?>> filter) {
1358 //TODO handle modifier
1359 Set<DefinedTermBase<?>> result = new HashSet<>();
1360 List<StateData> states = cd.getStateData();
1361 for (StateData stateData:states){
1362 DefinedTermBase<?> state = stateData.getState();
1363 if (filter != null && !filter.contains(state)){
1364 continue;
1365 }
1366 result.add(stateData.getState());
1367 }
1368 return result;
1369 }
1370
1371 //TODO keep as long as the handling of featureDependencies is not yet checked or handled in
1372 //the new method defaultCategoricalPower()
1373 private float defaultCategoricalPower_old(CategoricalData deb1, CategoricalData deb2){
1374 List<StateData> states1 = deb1.getStateData();
1375 List<StateData> states2 = deb2.getStateData();
1376 boolean bool = false;
1377 Iterator<StateData> stateData1Iterator = states1.iterator() ;
1378 // while (!bool && stateData1Iterator.hasNext()) {
1379 // Iterator<StateData> stateData2Iterator = states2.iterator() ;
1380 // StateData stateData1 = stateData1Iterator.next();
1381 // while (!bool && stateData2Iterator.hasNext()) {
1382 // bool = stateData1.getState().equals(stateData2Iterator.next().getState()); // checks if the states are the same
1383 // }
1384 // }
1385 // one point each time two taxa can be discriminated for a given feature
1386
1387 boolean checkFeature = false;
1388
1389 if (!featureDependencies.containsKey(deb1.getFeature())){
1390 featureDependencies.put(deb1.getFeature(), new HashSet<>());
1391 checkFeature = true;
1392 }
1393
1394 while (stateData1Iterator.hasNext()) {
1395 Iterator<StateData> stateData2Iterator = states2.iterator() ;
1396 StateData stateData1 = stateData1Iterator.next();
1397 DefinedTermBase<?> state1 = stateData1.getState();
1398 if (checkFeature){
1399 if (iAifDependencies.get(state1)!=null) {
1400 featureDependencies.get(deb1.getFeature()).addAll(iAifDependencies.get(state1));
1401 }
1402 if (oAifDependencies.get(state1)!=null) {
1403 featureDependencies.get(deb1.getFeature()).addAll(oAifDependencies.get(state1));
1404 }
1405 }
1406 while (stateData2Iterator.hasNext()) {
1407 StateData stateData2 = stateData2Iterator.next();
1408 DefinedTermBase<?> state2 = stateData2.getState();
1409 bool = bool || state1.equals(state2); // checks if the states are the same
1410 }
1411 }
1412
1413 if (bool) {
1414 return 0;
1415 } else {
1416 return 1;
1417 }
1418 }
1419 }