1 package eu
.etaxonomy
.cdm
.strategy
.generate
;
3 import java
.math
.BigDecimal
;
4 import java
.util
.ArrayList
;
5 import java
.util
.Arrays
;
6 import java
.util
.Collection
;
7 import java
.util
.Comparator
;
8 import java
.util
.HashMap
;
9 import java
.util
.HashSet
;
10 import java
.util
.Iterator
;
11 import java
.util
.List
;
13 import java
.util
.Map
.Entry
;
15 import java
.util
.UUID
;
17 import org
.apache
.logging
.log4j
.LogManager
;
18 import org
.apache
.logging
.log4j
.Logger
;
20 import eu
.etaxonomy
.cdm
.common
.BigDecimalUtil
;
21 import eu
.etaxonomy
.cdm
.model
.common
.CdmBase
;
22 import eu
.etaxonomy
.cdm
.model
.common
.Language
;
23 import eu
.etaxonomy
.cdm
.model
.description
.CategoricalData
;
24 import eu
.etaxonomy
.cdm
.model
.description
.DescriptionBase
;
25 import eu
.etaxonomy
.cdm
.model
.description
.DescriptionElementBase
;
26 import eu
.etaxonomy
.cdm
.model
.description
.Feature
;
27 import eu
.etaxonomy
.cdm
.model
.description
.FeatureState
;
28 import eu
.etaxonomy
.cdm
.model
.description
.KeyStatement
;
29 import eu
.etaxonomy
.cdm
.model
.description
.PolytomousKey
;
30 import eu
.etaxonomy
.cdm
.model
.description
.PolytomousKeyNode
;
31 import eu
.etaxonomy
.cdm
.model
.description
.QuantitativeData
;
32 import eu
.etaxonomy
.cdm
.model
.description
.SpecimenDescription
;
33 import eu
.etaxonomy
.cdm
.model
.description
.StateData
;
34 import eu
.etaxonomy
.cdm
.model
.description
.StatisticalMeasure
;
35 import eu
.etaxonomy
.cdm
.model
.description
.StatisticalMeasurementValue
;
36 import eu
.etaxonomy
.cdm
.model
.description
.TaxonDescription
;
37 import eu
.etaxonomy
.cdm
.model
.occurrence
.SpecimenOrObservationBase
;
38 import eu
.etaxonomy
.cdm
.model
.taxon
.Taxon
;
39 import eu
.etaxonomy
.cdm
.model
.taxon
.TaxonNode
;
40 import eu
.etaxonomy
.cdm
.model
.term
.DefinedTermBase
;
41 import eu
.etaxonomy
.cdm
.model
.term
.TermNode
;
47 public class PolytomousKeyGenerator
{
49 private static final Logger logger
= LogManager
.getLogger(PolytomousKeyGenerator
.class);
52 * Strings used for generating the statements of the key.
54 private static final String before
="<";
55 private static final String after
=">";
56 private static final String separator
= " or ";
59 private PolytomousKeyGeneratorConfigurator config
;
61 private Map
<FeatureState
,Set
<Feature
>> iAifDependencies
= new HashMap
<>(); // map of a set of Features (value) inapplicables if a State (key) is present
62 private Map
<FeatureState
,Set
<Feature
>> oAifDependencies
= new HashMap
<>(); // map of a set of Features (value) only applicables if a State (key) is present
63 private Map
<Feature
,Set
<Feature
>> featureDependencies
= new HashMap
<>(); // map of all the sets of features (values) which have dependencies with states of other features (keys)
65 private class KeyTaxon
{
68 private SpecimenOrObservationBase
<?
> specimen
;
69 private Map
<Feature
,Set
<CategoricalData
>> categoricalData
= new HashMap
<>();
70 private Map
<Feature
,Set
<QuantitativeData
>> quantitativeData
= new HashMap
<>();
71 private Set
<KeyTaxon
> children
= new HashSet
<>();
73 private Set
<CategoricalData
> getCategoricalData(Feature feature
){
74 return categoricalData
.get(feature
) == null?
new HashSet
<>():categoricalData
.get(feature
);
76 private Set
<QuantitativeData
> getQuantitativeData(Feature feature
){
77 return quantitativeData
.get(feature
) == null?
new HashSet
<>():quantitativeData
.get(feature
);
80 private void addDescription(DescriptionBase
<?
> db
) {
81 for (DescriptionElementBase deb
: db
.getElements()){
82 Feature feature
= deb
.getFeature();
83 if (deb
.isCharacterData()){
84 if (deb
.isInstanceOf(CategoricalData
.class)){
85 CategoricalData cd
= CdmBase
.deproxy(deb
, CategoricalData
.class);
86 if (categoricalData
.get(feature
)== null){
87 categoricalData
.put(feature
, new HashSet
<>());
89 categoricalData
.get(feature
).add(cd
);
90 }else if (deb
.isInstanceOf(QuantitativeData
.class)){
91 QuantitativeData qd
= CdmBase
.deproxy(deb
, QuantitativeData
.class);
92 if (quantitativeData
.get(feature
)== null){
93 quantitativeData
.put(feature
, new HashSet
<>());
95 quantitativeData
.get(feature
).add(qd
);
102 public String
toString() {
103 return taxon
.getTitleCache(); // + "KeyTaxon [uuid=" + uuid + (taxon != null? ", " + taxon.getTitleCache():"") + "]"; // uuid=" + uuid +
107 // *************************** METHODS ***************************************/
112 public PolytomousKey
invoke(PolytomousKeyGeneratorConfigurator config
){
114 throw new NullPointerException("PolytomousKeyGeneratorConfigurator must not be null");
116 this.config
= config
;
117 if (this.config
.isUseDependencies()){
118 createDependencies(config
.getDependenciesTree().getRoot());
120 PolytomousKey polytomousKey
= PolytomousKey
.NewInstance();
121 PolytomousKeyNode root
= polytomousKey
.getRoot();
122 @SuppressWarnings("unchecked")
123 Set
<KeyTaxon
> taxaCovered
= makeKeyTaxa((Set
)config
.getTaxonDescriptions());
124 taxaCovered
= replaceSingleRoot(taxaCovered
);
126 //filter if a feature is available only for certain states in this branche
127 Map
<Feature
, Set
<DefinedTermBase
<?
>>> featureStatesFilter
= new HashMap
<>();
128 //TODO what if size(taxaCovered) <= 1, is this covered by algo? Write test to check
129 buildBranches(root
, config
.getFeatures(), taxaCovered
, featureStatesFilter
);
130 return polytomousKey
;
134 * If the root taxon is a single taxon but has children
135 * it will be replaced by it's children.
137 private Set
<KeyTaxon
> replaceSingleRoot(Set
<KeyTaxon
> taxaCovered
) {
138 if (this.config
.isUseTaxonHierarchy() && taxaCovered
.size()==1
139 && !taxaCovered
.iterator().next().children
.isEmpty()){
140 return replaceSingleRoot(taxaCovered
.iterator().next().children
);
146 private Set
<KeyTaxon
> makeKeyTaxa(Set
<DescriptionBase
<?
>> descriptions
) {
148 Map
<UUID
,KeyTaxon
> taxonMap
= new HashMap
<>();
149 for (DescriptionBase
<?
> db
: descriptions
){
150 KeyTaxon taxon
= new KeyTaxon();
151 if (db
.isInstanceOf(TaxonDescription
.class)){
152 TaxonDescription td
= CdmBase
.deproxy(db
, TaxonDescription
.class);
153 taxon
.uuid
= td
.getTaxon().getUuid();
154 taxon
.taxon
= td
.getTaxon();
155 }else if (db
.isInstanceOf(SpecimenDescription
.class)){
156 SpecimenDescription sd
= CdmBase
.deproxy(db
, SpecimenDescription
.class);
157 taxon
.uuid
= sd
.getDescribedSpecimenOrObservation().getUuid();
158 taxon
.specimen
= sd
.getDescribedSpecimenOrObservation();
160 throw new RuntimeException("Unhandled entity type " + db
.getClass().getName());
162 if (taxonMap
.get(taxon
.uuid
)!= null){
163 taxon
= taxonMap
.get(taxon
.uuid
);
165 taxonMap
.put(taxon
.uuid
, taxon
);
167 taxon
.addDescription(db
);
170 createTaxonHierarchy(taxonMap
);
172 return new HashSet
<>(taxonMap
.values());
175 private void createTaxonHierarchy(Map
<UUID
, KeyTaxon
> taxonMap
) {
176 if(config
.isUseTaxonHierarchy()==false){
179 Set
<KeyTaxon
> taxaToTest
= new HashSet
<>(taxonMap
.values());
180 for(KeyTaxon taxon
:taxaToTest
){
181 KeyTaxon parent
= getBestTaxonParent(taxon
, taxaToTest
);
183 parent
.children
.add(taxon
);
184 taxonMap
.remove(taxon
.uuid
);
189 private KeyTaxon
getBestTaxonParent(KeyTaxon taxon
, Collection
<KeyTaxon
> values
) {
190 KeyTaxon parent
= null;
191 String parentIndex
= "";
192 String myTreeIndex
= getTaxonTreeIndex(taxon
);
193 if (myTreeIndex
!= null) {
194 for (KeyTaxon candidate
:values
){
195 String candidateIndex
= getTaxonTreeIndex(candidate
);
196 if (candidateIndex
== null || myTreeIndex
.equals(candidateIndex
)){
199 if (myTreeIndex
.startsWith(candidateIndex
)){
200 if (candidateIndex
.length()> parentIndex
.length()){
202 parentIndex
= candidateIndex
;
210 private String
getTaxonTreeIndex(KeyTaxon taxon
) {
211 if (taxon
.taxon
.getTaxonNodes().isEmpty()){
214 //TODOO size>1 or classification check
215 TaxonNode node
= taxon
.taxon
.getTaxonNodes().iterator().next();
216 String treeIndex
= node
.treeIndex();
217 if (treeIndex
== null){
218 //unpersisted, this should only happen during test, create provisional treeindex
219 treeIndex
= getParentTreeIndex(node
) + node
.getUuid().toString() + "#" ;
224 private String
getParentTreeIndex(TaxonNode node
) {
225 TaxonNode parent
= node
.getParent();
226 if (parent
== null ){
229 return getParentTreeIndex(parent
) + parent
.getUuid().toString() + "#" ;
234 * Recursive function that builds the branches of the identification key
236 * @param parent the node considered
237 * @param featuresLeft List of features that can be used at this point
238 * @param taxaCovered the taxa left at this point (i.e. that verify the description corresponding to the path leading to this node)
239 * @param featureStatesFilter
241 private void buildBranches(PolytomousKeyNode parent
, List
<Feature
> featuresLeft
, Set
<KeyTaxon
> taxaCovered
,
242 Map
<Feature
, Set
<DefinedTermBase
<?
>>> featureStatesFilter
){
244 //handle all branches taxa =>
245 Set
<KeyTaxon
> allBranchesTaxa
= getAllBranchesTaxa(featuresLeft
, taxaCovered
, featureStatesFilter
);
246 if (allBranchesTaxa
.size()>0){
247 if (allBranchesTaxa
.size()>1){
248 //TODO test if this case in handled and displayed correctly
249 logger
.warn(">1 final taxa in inner node");
251 taxaCovered
.removeAll(allBranchesTaxa
);
252 if(taxaCovered
.size() != 1){
253 handleLeaf(parent
, allBranchesTaxa
);
255 //if only 1 is left it is better to handle all remaining in sub-branch to make difference clearer
256 taxaCovered
.addAll(allBranchesTaxa
);
260 //start real branching
261 if (taxaCovered
.size()<=1){
263 logger
.warn("Only 1 or no taxon covered. This should currently only be possible on top level and is not yet handled. ");
265 // this map stores the thresholds giving the best dichotomy of taxa for the corresponding feature supporting quantitative data
266 Map
<Feature
,BigDecimal
> quantitativeFeaturesThresholds
= new HashMap
<>();
267 // the scores of the different features are calculated, the thresholds in the same time
268 if (config
.isDebug()){
269 System
.out
.println("Feature left: " + featuresLeft
+ ", taxa: " + taxaCovered
);
271 Feature winnerFeature
= computeScores(featuresLeft
, taxaCovered
, quantitativeFeaturesThresholds
, featureStatesFilter
);
273 if(winnerFeature
!= null){
274 /************** either the feature supports quantitative data... **************/
275 // NB: in this version, "quantitative features" are dealt with in a dichotomous way
276 if (winnerFeature
.isSupportsQuantitativeData()) {
277 featuresLeft
.add(winnerFeature
); //as quantitative data are currently only split in 2 parts there might be further splits possible and needed
278 handleQuantitativeData(parent
, featuresLeft
, taxaCovered
,
279 quantitativeFeaturesThresholds
, winnerFeature
, featureStatesFilter
);
281 /************** ...or it supports categorical data. **************/
282 else if (winnerFeature
.isSupportsCategoricalData()) {
283 handleCategorialFeature(parent
, featuresLeft
, taxaCovered
,
284 winnerFeature
, featureStatesFilter
);
286 throw new RuntimeException("Winner feature does not support character data.");
288 // the winner features are put back to the features left once the branch is done
289 if (!featuresLeft
.contains(winnerFeature
)){ //TODO why is a list and not a set?
290 featuresLeft
.add(winnerFeature
);
292 }else if (featuresLeft
.isEmpty()){
293 handleLeaf(parent
, taxaCovered
);
295 throw new RuntimeException("No winner feature but features left to handle should not happen.");
301 private Set
<KeyTaxon
> getAllBranchesTaxa(List
<Feature
> featuresLeft
, Set
<KeyTaxon
> taxaCovered
,
302 Map
<Feature
, Set
<eu
.etaxonomy
.cdm
.model
.term
.DefinedTermBase
<?
>>> featureStatesFilter
) {
304 Set
<KeyTaxon
> candidates
= new HashSet
<>(taxaCovered
);
305 List
<Feature
> dependendFeatures
= new ArrayList
<>();
306 for (Feature feature
: featuresLeft
){
307 if(feature
.isSupportsCategoricalData()){
308 Set
<DefinedTermBase
<?
>> allStates
= getAllStates(feature
, taxaCovered
, featureStatesFilter
.get(feature
));
309 Iterator
<KeyTaxon
> it
= candidates
.iterator();
310 while (it
.hasNext()){
311 Set
<KeyTaxon
> taxonSet
= new HashSet
<>(Arrays
.asList(it
.next()));
312 Set
<DefinedTermBase
<?
>> taxonStates
= getAllStates(feature
, taxonSet
, featureStatesFilter
.get(feature
));
313 if(allStates
.size() > taxonStates
.size()){
317 if(candidates
.isEmpty()){
320 addDependentFeatures(dependendFeatures
, feature
, new HashSet
<>(), new ArrayList
<>(allStates
));
322 }else if (feature
.isSupportsQuantitativeData()){
323 Iterator
<KeyTaxon
> it
= candidates
.iterator();
324 while (it
.hasNext()){
325 BigDecimal min
= BigDecimalUtil
.MAX_BIGDECIMAL
;
326 BigDecimal max
= BigDecimalUtil
.MIN_BIGDECIMAL
;
327 Set
<QuantitativeData
> qds
= it
.next().quantitativeData
.get(feature
);
328 qds
= qds
== null?
new HashSet
<>(): qds
;
329 for (QuantitativeData qd
: qds
){
330 BigDecimal qdMin
= qd
.getOverallMin();
332 min
= min
.min(qdMin
);
334 BigDecimal qdMax
= qd
.getOverallMax();
336 max
= max
.max(qdMax
);
339 boolean staysCandidate
= true;
340 for(KeyTaxon taxon
: taxaCovered
){
341 Set
<QuantitativeData
> tqds
= taxon
.quantitativeData
.get(feature
);
342 tqds
= tqds
== null?
new HashSet
<>(): tqds
;
343 for (QuantitativeData qd
: tqds
){
344 staysCandidate
&= qd
.getOverallMin() == null || qd
.getOverallMin().compareTo(min
) >= 0;
345 staysCandidate
&= qd
.getOverallMax() == null || qd
.getOverallMax().compareTo(max
) <= 0;
347 if (!staysCandidate
){
351 if (!staysCandidate
){
357 if(config
.isUseDependencies() && !dependendFeatures
.isEmpty() && !candidates
.isEmpty()){
358 Set
<KeyTaxon
> dependetCandidates
= getAllBranchesTaxa(dependendFeatures
, taxaCovered
, featureStatesFilter
);
359 candidates
.retainAll(dependetCandidates
);
365 * Creates a leaf. It adds the taxa the parent taxon as linked taxa. Handles a
366 * list of multiple taxa and handles "specimen taxa" (not yet fully implemented)
367 * @return taxa which exist in ALL sub-branches and therefore can be linked on higher level
369 private Set
<KeyTaxon
> handleLeaf(PolytomousKeyNode parent
, Set
<KeyTaxon
> taxaCovered
) {
370 KeyStatement parentStatement
= parent
.getStatement();
371 for(KeyTaxon taxon
: taxaCovered
){
372 if (taxon
.taxon
!= null){
373 parent
.setOrAddTaxon(taxon
.taxon
);
375 //FIXME handle other descriptions like specimen descriptions better
376 if (parentStatement
!=null){
377 String statementString
= parentStatement
.getLabelText(Language
.DEFAULT());
378 if (statementString
!=null && taxon
.specimen
!= null){
379 String label
= statementString
+ " --> " + taxon
.specimen
.getTitleCache();
380 parentStatement
.putLabel(Language
.DEFAULT(), label
);
389 * "categorical features" may present several different states/branches,
390 * each one of these might correspond to one child.
392 private void handleCategorialFeature(PolytomousKeyNode parent
, List
<Feature
> featuresLeft
,
393 Set
<KeyTaxon
> taxaCovered
,
394 Feature winnerFeature
,
395 Map
<Feature
, Set
<DefinedTermBase
<?
>>> featureStatesFilter
) {
397 Map
<Set
<KeyTaxon
>,Boolean
> reuseWinner
= new HashMap
<>();
399 Set
<DefinedTermBase
<?
>> allStates
= getAllStates(winnerFeature
, taxaCovered
, featureStatesFilter
.get(winnerFeature
));
400 // a map is created, the key being the set of taxa that present the state(s) stored in the corresponding value
401 // this key represents a single branch in the decision tree
402 Map
<Set
<KeyTaxon
>, List
<DefinedTermBase
<?
>>> taxonStatesMap
403 = determineCategoricalStates(allStates
, winnerFeature
, taxaCovered
, featureStatesFilter
.get(winnerFeature
));
405 if (taxonStatesMap
.size()<=1){
406 if (notEmpty(featureDependencies
.get(winnerFeature
))){
407 //TODO is empty list correctly handled here? Seems to happen if incorrect data (e.g. only Max values) for quantdata exist
408 List
<DefinedTermBase
<?
>> stateList
= taxonStatesMap
.isEmpty()?
new ArrayList
<>(): taxonStatesMap
.values().iterator().next();
409 Set
<Feature
> featuresAdded
= new HashSet
<>();
410 addDependentFeatures(featuresLeft
, winnerFeature
, featuresAdded
, stateList
);
411 featuresLeft
.remove(winnerFeature
);
412 buildBranches(parent
, featuresLeft
, taxaCovered
, featureStatesFilter
);
413 removeAddedDependendFeatures(featuresLeft
, featuresAdded
);
415 //if only 1 branch is left we can handle this as a leaf, no matter how many taxa are left
416 handleLeaf(parent
, taxaCovered
);
419 // if the merge option is ON, branches with the same discriminative power will be merged (see Vignes & Lebbes, 1989)
420 if (config
.isMerge()){
421 taxonStatesMap
= handleMerge(taxaCovered
, winnerFeature
, reuseWinner
,
422 taxonStatesMap
, allStates
, featureStatesFilter
.get(winnerFeature
));
424 List
<Set
<KeyTaxon
>> sortedKeys
= sortKeys(taxonStatesMap
);
425 for (Set
<KeyTaxon
> newTaxaCovered
: sortedKeys
){
427 handleCategoricalBranch(parent
, featuresLeft
,
428 taxaCovered
.size(), winnerFeature
, reuseWinner
, taxonStatesMap
, newTaxaCovered
, featureStatesFilter
);
434 private Map
<Set
<KeyTaxon
>, List
<DefinedTermBase
<?
>>> handleMerge(Set
<KeyTaxon
> taxaCovered
,
435 Feature winnerFeature
, Map
<Set
<KeyTaxon
>, Boolean
> reuseWinner
,
436 Map
<Set
<KeyTaxon
>, List
<DefinedTermBase
<?
>>> taxonStatesMap
, Set
<DefinedTermBase
<?
>> allStates
, Set
<DefinedTermBase
<?
>> filter
) {
438 // creates a map between the different states of the winnerFeature and the sets of states "incompatible" with them
439 Map
<DefinedTermBase
<?
>,Set
<DefinedTermBase
<?
>>> exclusions
= new HashMap
<>();
440 computeExclusions(winnerFeature
, taxaCovered
, exclusions
, filter
);
442 while (!exclusions
.isEmpty()){
443 // looks for the largest clique, i.e. the state with less exclusions
444 List
<DefinedTermBase
<?
>> clique
= returnBestClique(exclusions
);
445 if(clique
.containsAll(allStates
)){
448 // then merges the corresponding branches
449 mergeBranches(clique
, taxonStatesMap
, reuseWinner
, filter
);
451 //during merge the keySet (set of taxa) may change, therefore they change their hashcode
452 //and can not be used as keys in the map anymore.
453 //Therefore we refill the map.
454 taxonStatesMap
= renewTaxonStatesMap(taxonStatesMap
);
455 return taxonStatesMap
;
458 private void handleCategoricalBranch(PolytomousKeyNode parent
, List
<Feature
> featuresLeft
,
460 Feature winnerFeature
, Map
<Set
<KeyTaxon
>, Boolean
> reuseWinner
,
461 Map
<Set
<KeyTaxon
>, List
<DefinedTermBase
<?
>>> taxonStatesMap
,
462 Set
<KeyTaxon
> newTaxaCovered
,
463 Map
<Feature
,Set
<DefinedTermBase
<?
>>> featureStatesFilter
) {
465 //to restore old state
466 Set
<DefinedTermBase
<?
>> oldFilterSet
= featureStatesFilter
.get(winnerFeature
);
467 Set
<Feature
> featuresAdded
= new HashSet
<>();
469 boolean areTheTaxaDiscriminated
= false;
470 PolytomousKeyNode childNode
= PolytomousKeyNode
.NewInstance();
471 parent
.addChild(childNode
);
473 List
<DefinedTermBase
<?
>> listOfStates
= taxonStatesMap
.get(newTaxaCovered
);
474 if ((newTaxaCovered
.size() > 0)){ //old: if the taxa are discriminated compared to those of the parent node, a child is created
475 areTheTaxaDiscriminated
= (newTaxaCovered
.size() != taxaCoveredSize
);
477 int numberOfStates
= listOfStates
.size()-1;
478 listOfStates
.sort(stateComparator
);
480 if (config
.isUseDependencies()){
481 // old: if the dependencies are considered, removes and adds the right features from/to the list of features left
482 // these features are stored in order to be put back again when the current branch is finished
484 addDependentFeatures(featuresLeft
, winnerFeature
, featuresAdded
, listOfStates
);
487 String statementLabel
= createStatement(listOfStates
, numberOfStates
);
488 KeyStatement statement
= KeyStatement
.NewInstance(statementLabel
);
489 childNode
.setStatement(statement
);
490 parent
.setFeature(winnerFeature
);
492 if (reuseWinner
.get(newTaxaCovered
)== Boolean
.TRUE
){
493 featuresLeft
.add(winnerFeature
);
494 setStatesFilter(featureStatesFilter
, winnerFeature
, listOfStates
);
496 featuresLeft
.remove(winnerFeature
);
500 boolean hasChildren
= areTheTaxaDiscriminated
&& (newTaxaCovered
.size() > 1);
502 buildBranches(childNode
, featuresLeft
, newTaxaCovered
, featureStatesFilter
);
504 handleLeaf(childNode
, newTaxaCovered
);
505 Set
<KeyTaxon
> taxonChildren
= getTaxonChildren(newTaxaCovered
);
506 if(!taxonChildren
.isEmpty()){
507 //TODO FIXME featuresLeft probably needs to include all features, similar for featureStatesFilter
508 buildBranches(childNode
, featuresLeft
, taxonChildren
, featureStatesFilter
);
512 //restore old state before returning to parent node
513 removeAddedDependendFeatures(featuresLeft
, featuresAdded
);
514 featureStatesFilter
.put(winnerFeature
, oldFilterSet
);
519 private Set
<KeyTaxon
> getTaxonChildren(Set
<KeyTaxon
> newTaxaCovered
) {
520 Set
<KeyTaxon
> result
= new HashSet
<>();
521 for (KeyTaxon taxon
:newTaxaCovered
){
522 result
.addAll(taxon
.children
);
527 private void setStatesFilter(Map
<Feature
, Set
<DefinedTermBase
<?
>>> filter
, Feature feature
,
528 List
<DefinedTermBase
<?
>> listOfStates
) {
529 if (filter
.get(feature
)==null){
530 filter
.put(feature
, new HashSet
<>(listOfStates
));
532 Set
<DefinedTermBase
<?
>> set
= filter
.get(feature
);
533 set
.retainAll(listOfStates
);
537 private void removeAddedDependendFeatures(List
<Feature
> featuresLeft
, Set
<Feature
> featuresAdded
) {
538 for (Feature addedFeature
: featuresAdded
) {
539 featuresLeft
.remove(addedFeature
);
543 private void addDependentFeatures(List
<Feature
> featuresLeft
, Feature baseFeature
,
544 Set
<Feature
> featuresAdded
, List
<DefinedTermBase
<?
>> listOfStates
) {
546 if(notEmpty(featureDependencies
.get(baseFeature
))){
547 Set
<Feature
> newFeatureCandidates
= new HashSet
<>(featureDependencies
.get(baseFeature
));
548 newFeatureCandidates
.remove(null);
549 for (DefinedTermBase
<?
> state
: listOfStates
) {
551 List
<Feature
> inapplicableFeatures
= getApplicableFeatures(baseFeature
, state
, iAifDependencies
);
552 newFeatureCandidates
.removeAll(inapplicableFeatures
);
554 List
<Feature
> onlyApplicableFeatures
= getApplicableFeatures(baseFeature
, state
, oAifDependencies
);
555 if (!onlyApplicableFeatures
.isEmpty()){
556 Iterator
<Feature
> it
= newFeatureCandidates
.iterator();
557 while (it
.hasNext()){
558 Feature featureCandidate
= it
.next();
559 if (!onlyApplicableFeatures
.contains(featureCandidate
)){
565 featuresLeft
.addAll(newFeatureCandidates
);
566 featuresAdded
.addAll(newFeatureCandidates
);
570 private List
<Feature
> getApplicableFeatures(Feature feature
, DefinedTermBase
<?
> state
,
571 Map
<FeatureState
, Set
<Feature
>> applicabilityDependencies
) {
572 List
<Feature
> result
= new ArrayList
<>();
573 for (FeatureState featureState
: applicabilityDependencies
.keySet()){
574 if(featureState
.getFeature().equals(feature
) && featureState
.getState().equals(state
)){
575 result
.addAll(applicabilityDependencies
.get(featureState
));
581 private boolean notEmpty(Set
<?
> set
) {
582 return (set
!= null) && !set
.isEmpty();
585 private String
createStatement(List
<DefinedTermBase
<?
>> listOfStates
, int numberOfStates
) {
586 StringBuilder statementLabel
= new StringBuilder();
587 for (DefinedTermBase
<?
> state
: listOfStates
) {
588 statementLabel
.append(state
.getLabel());
589 if (listOfStates
.lastIndexOf(state
)!=numberOfStates
){
590 statementLabel
.append(separator
); // append a separator after each state except for the last one
593 return statementLabel
.toString();
596 private Map
<Set
<KeyTaxon
>, List
<DefinedTermBase
<?
>>> renewTaxonStatesMap(Map
<Set
<KeyTaxon
>, List
<DefinedTermBase
<?
>>> taxonStatesMap
) {
597 Map
<Set
<KeyTaxon
>, List
<DefinedTermBase
<?
>>> result
= new HashMap
<>();
598 for (Map
.Entry
<Set
<KeyTaxon
>, List
<DefinedTermBase
<?
>>> entry
: taxonStatesMap
.entrySet()){
599 result
.put(entry
.getKey(), entry
.getValue());
604 private List
<Set
<KeyTaxon
>> sortKeys(Map
<Set
<KeyTaxon
>, List
<DefinedTermBase
<?
>>> taxonStatesMap
) {
605 //for now this is a dummy sorting
606 List
<Map
.Entry
<Set
<KeyTaxon
>, List
<DefinedTermBase
<?
>>>> sortedEntries
= new ArrayList
<>();
607 sortedEntries
.addAll(taxonStatesMap
.entrySet());
609 sortedEntries
.sort(entryComparator
);
610 List
<Set
<KeyTaxon
>> result
= new ArrayList
<>();
611 for (Map
.Entry
<Set
<KeyTaxon
>, List
<DefinedTermBase
<?
>>> entry
: sortedEntries
){
612 result
.add(entry
.getKey());
617 //TODO use a general term comparator here
618 private static final Comparator
<DefinedTermBase
<?
>> stateComparator
= (a
,b
)-> {
620 //TODO use real Term Comparator
621 if (!a
.getTitleCache().equals(b
.getTitleCache())){
622 return a
.getTitleCache().compareTo(b
.getTitleCache());
624 if (a
.getUuid()!= b
.getUuid()){
625 return a
.getUuid().compareTo(b
.getUuid());
630 private static final Comparator
<?
super Entry
<Set
<KeyTaxon
>, List
<DefinedTermBase
<?
>>>> entryComparator
= (a
,b
)-> {
631 if (a
.getKey().size()!=b
.getKey().size()){
632 //order by number of taxa covered
633 return b
.getKey().size() - a
.getKey().size();
634 }else if (a
.getValue().size()!= b
.getValue().size()){
635 //order by number of states covered
636 return b
.getValue().size() - a
.getValue().size();
638 //order states alphabetically or by uuid
639 for (int i
= 0; i
< a
.getValue().size(); i
++){
640 DefinedTermBase
<?
> stateA
= a
.getValue().get(i
);
641 DefinedTermBase
<?
> stateB
= b
.getValue().get(i
);
642 int result
= stateComparator
.compare(stateA
, stateB
);
647 //TODO compare keys (sets of KeyTaxon)
648 // for (int i = 0; i < a.getKey().size(); i++){
649 // Object stateA = a.getKey().getUuid;
650 // State stateB = a.getKey().get(i);
651 // //TODO use real Term Comparator
652 // if (stateA.getUuid()!= stateB.getUuid()){
653 // return stateA.getUuid().compareTo(stateB.getUuid());
656 throw new RuntimeException("Compare for same state lists with different unit descriptions not yet implemented");
660 private Set
<DefinedTermBase
<?
>> getAllStates(Feature feature
, Set
<KeyTaxon
> taxaCovered
, Set
<DefinedTermBase
<?
>> filter
) {
661 //TODO handle modifier
662 Set
<DefinedTermBase
<?
>> states
= new HashSet
<>();
663 for (KeyTaxon taxon
: taxaCovered
){
664 Set
<CategoricalData
> cdSet
= taxon
.getCategoricalData(feature
);
665 for (CategoricalData cd
: cdSet
){
666 List
<StateData
> stateDatas
= cd
.getStateData();
667 for (StateData sd
: stateDatas
){
668 DefinedTermBase
<?
> state
= sd
.getState();
669 if (filter
!= null && !filter
.contains(state
)) {
679 private void handleQuantitativeData(PolytomousKeyNode parent
, List
<Feature
> featuresLeft
,
680 Set
<KeyTaxon
> taxaCovered
, Map
<Feature
, BigDecimal
> quantitativeFeaturesThresholds
,
681 Feature winnerFeature
, Map
<Feature
, Set
<DefinedTermBase
<?
>>> featureStatesFilter
) {
683 // first, get the threshold
684 BigDecimal threshold
= quantitativeFeaturesThresholds
.get(winnerFeature
);
685 //TODO unit not seems to be used yet
686 StringBuilder unit
= new StringBuilder();
687 // then determine which taxa are before and which are after this threshold (dichotomy)
688 //in order to create the children of the parent node
689 List
<Set
<KeyTaxon
>> quantitativeStates
= determineQuantitativeStates(threshold
, winnerFeature
, taxaCovered
, unit
);
690 // thus the list contains two sets of KeyTaxon, the first corresponding to
691 // those before, the second to those after the threshold
692 for (int i
=0; i
<2; i
++) {
693 handleQuantitativeBranch(parent
, featuresLeft
, taxaCovered
.size(), winnerFeature
, threshold
, unit
,
694 quantitativeStates
, featureStatesFilter
, i
);
701 * Creates the branch for a quantitative feature.
702 * TODO if the quantitative feature has dependent features they are not yet handled
704 private void handleQuantitativeBranch(PolytomousKeyNode parent
, List
<Feature
> featuresLeft
,
705 int parentTaxaCoveredSize
, Feature winnerFeature
, BigDecimal threshold
, StringBuilder unit
,
706 List
<Set
<KeyTaxon
>> quantitativeStates
, Map
<Feature
, Set
<DefinedTermBase
<?
>>> featureStatesFilter
,
710 Set
<KeyTaxon
> newTaxaCovered
= quantitativeStates
.get(brunchNum
);
712 sign
= before
; // the first element of the list corresponds to taxa before the threshold
714 sign
= after
; // the second to those after
716 if (newTaxaCovered
.size()>0){
717 PolytomousKeyNode childNode
= PolytomousKeyNode
.NewInstance();
718 parent
.setFeature(winnerFeature
);
719 KeyStatement statement
= KeyStatement
.NewInstance( " " + sign
+ " " + threshold
+ unit
);
720 childNode
.setStatement(statement
);
721 parent
.addChild(childNode
);
722 //TODO don't we need to check dependent features, they are not very likely for quantitative features, but still might exist as exception ...
723 boolean taxaAreDiscriminatedInThisStep
= newTaxaCovered
.size() < parentTaxaCoveredSize
;
724 boolean childrenExist
= taxaAreDiscriminatedInThisStep
&& (newTaxaCovered
.size() > 1);
726 buildBranches(childNode
, featuresLeft
, newTaxaCovered
, featureStatesFilter
);
728 handleLeaf(childNode
, newTaxaCovered
);
731 //TODO do we need to check the 0 case, can this happen at all, shouldn't we throw a warning instead?
732 throw new RuntimeException("No taxa left on branch. This should probably not happen.");
737 private Feature
computeScores(List
<Feature
> featuresLeft
, Set
<KeyTaxon
> taxaCovered
,
738 Map
<Feature
,BigDecimal
> quantitativeFeaturesThresholds
, Map
<Feature
, Set
<DefinedTermBase
<?
>>> featureStatesFilter
) {
740 Map
<Feature
,Float
> scoreMap
= featureScores(featuresLeft
, taxaCovered
, quantitativeFeaturesThresholds
, featureStatesFilter
);
741 dependenciesScores(scoreMap
, featuresLeft
, taxaCovered
, quantitativeFeaturesThresholds
, featureStatesFilter
);
742 // the feature with the best score becomes the one corresponding to the current node
743 Feature winnerFeature
= lessStatesWinner(scoreMap
, taxaCovered
);
744 // the feature is removed from the list of features available to build the next level of the tree
745 featuresLeft
.remove(winnerFeature
);
746 if (config
.isDebug()){System
.out
.println(" ScoreMap: " + scoreMap
);}
747 if (config
.isDebug()){System
.out
.println(" quantitativeThreshold: " + quantitativeFeaturesThresholds
);}
748 if (config
.isDebug()){System
.out
.println(" winner: " + winnerFeature
);}
749 return winnerFeature
;
753 * Dependencies can be seen as a hierarchy.
754 * If the dependencies are on, this function calculates the score of all children features and give the highest score
755 * of these to their parent (of course, if its score is lower). This way, if a feature has a good score but is
756 * "onlyApplicableIf" or "InapplicableIf", the feature it depends can be chosen in order to build a better key.
758 private void dependenciesScores(Map
<Feature
,Float
> scoreMap
, List
<Feature
> featuresLeft
,
759 Set
<KeyTaxon
> coveredTaxa
, Map
<Feature
,BigDecimal
> quantitativeFeaturesThresholds
, Map
<Feature
, Set
<DefinedTermBase
<?
>>> featureStatesFilter
){
761 //TODO maybe we need to do this recursive?
763 //old: but I don't understand why we should include the existing featureList, this is only necessary
764 //if the single cores depend on the set of features
765 //List<Feature> pseudoFeaturesLeft = new ArrayList<>(featuresLeft); //copies the list of features
767 // then adds all the features which depend on features contained in the list
768 List
<Feature
> pseudoFeatures
= new ArrayList
<>();
769 for (Feature feature
: featuresLeft
){
770 if (featureDependencies
.containsKey(feature
)){
771 for (Feature dependendFeature
: featureDependencies
.get(feature
)){
772 if (!pseudoFeatures
.contains(dependendFeature
)){
773 pseudoFeatures
.add(dependendFeature
);
779 if (!pseudoFeatures
.isEmpty()){
780 // then calculates the scores of all features that have been added
781 Map
<Feature
,Float
> newScoreMap
= featureScores(pseudoFeatures
, coveredTaxa
, quantitativeFeaturesThresholds
, featureStatesFilter
);
782 for (Feature parentFeature
: featureDependencies
.keySet()){
783 if (scoreMap
.containsKey(parentFeature
)){
784 for (Feature dependendFeature
: featureDependencies
.get(parentFeature
)){
785 if (newScoreMap
.containsKey(dependendFeature
)) {
786 // if a feature has a better than the "parent" feature it depends on
787 if (newScoreMap
.get(dependendFeature
) > scoreMap
.get(parentFeature
)){
788 // the "parent" feature gets its score in the original score map
789 scoreMap
.put(parentFeature
, newScoreMap
.get(dependendFeature
));
800 * This function merges branches of the key belonging to the same clique
801 * (http://en.wikipedia.org/wiki/Clique_%28graph_theory%29)
803 * @param clique the list of States linked together (i.e. if merged have the same score)
804 * @param taxonStatesMap the map between the taxa (keys) and the states (keys) leading to them
807 * @return <code>true</code>, if all taxa covered by the new branch include all states of the clique.
808 * <code>false</code> otherwise.
810 private void mergeBranches(List
<DefinedTermBase
<?
>> clique
, Map
<Set
<KeyTaxon
>, List
<DefinedTermBase
<?
>>> taxonStatesMap
,
811 Map
<Set
<KeyTaxon
>, Boolean
> reuseWinner
, Set
<DefinedTermBase
<?
>> filter
){
813 boolean isExact
= true;
814 if (clique
.size()<=1){
817 Map
.Entry
<Set
<KeyTaxon
>,List
<DefinedTermBase
<?
>>> firstBranch
= null;
818 List
<Set
<KeyTaxon
>> tdToDelete
= new ArrayList
<>();
819 //TODO is the stateFilter needed here somehow?
820 for (Map
.Entry
<Set
<KeyTaxon
>, List
<DefinedTermBase
<?
>>> branch
: taxonStatesMap
.entrySet()){
821 boolean stateFound
= false;
822 // looks for one state of the clique in this branch
823 for(DefinedTermBase
<?
> state
: clique
){
824 if (branch
.getValue().contains(state
)) {
829 // if one state is found...
830 if (stateFound
== true){
831 // ...for the first time, the branch becomes the one to which the others will be merged
832 if (firstBranch
== null){
833 firstBranch
= branch
;
835 // ... else the branch is merged to the first one.
837 int oldSize
= firstBranch
.getKey().size();
838 firstBranch
.getKey().addAll(branch
.getKey());
839 int newSize
= firstBranch
.getKey().size();
840 if (oldSize
!= newSize
|| newSize
!= branch
.getKey().size()){
843 firstBranch
.getValue().addAll(branch
.getValue());
844 tdToDelete
.add(branch
.getKey());
848 // once this is done, the branches merged to the first one are deleted
849 for (Set
<KeyTaxon
> td
: tdToDelete
){
850 taxonStatesMap
.remove(td
);
852 if (!isExact
&& firstBranch
!= null){
853 reuseWinner
.put(firstBranch
.getKey(), Boolean
.TRUE
);
858 * This function looks for the largest clique of States
859 * (http://en.wikipedia.org/wiki/Clique_%28graph_theory%29)
860 * from the map of exclusions.
862 * @param exclusions map a state (key) to the set of states (value) which can not be merged with it without decreasing its score.
865 private List
<DefinedTermBase
<?
>> returnBestClique (Map
<DefinedTermBase
<?
>,Set
<DefinedTermBase
<?
>>> exclusions
){
866 int bestNumberOfExclusions
=-1;
867 int numberOfExclusions
;
868 List
<DefinedTermBase
<?
>> clique
= new ArrayList
<>();
870 // looks for the largest clique, i.e. the state with less exclusions
871 DefinedTermBase
<?
> bestState
=null;
872 for (Map
.Entry
<DefinedTermBase
<?
>,Set
<DefinedTermBase
<?
>>> pair
: exclusions
.entrySet()){
873 numberOfExclusions
= pair
.getValue().size();
874 if ((bestNumberOfExclusions
== -1) || numberOfExclusions
< bestNumberOfExclusions
) {
875 bestNumberOfExclusions
= numberOfExclusions
;
876 bestState
= pair
.getKey();
880 clique
.add(bestState
);
881 exclusions
.remove(bestState
);
883 boolean isNotExcluded
;
884 // then starts building the clique by adding the states which do not exclude each other
885 for (Map
.Entry
<DefinedTermBase
<?
>,Set
<DefinedTermBase
<?
>>> pair
: exclusions
.entrySet()){
886 isNotExcluded
= true;
887 for (DefinedTermBase
<?
> state
: clique
) {
888 if (pair
.getValue().contains(state
)) {
889 isNotExcluded
= false;
893 clique
.add(pair
.getKey());
896 for (DefinedTermBase
<?
> state
: clique
) {
897 exclusions
.remove(state
);
904 * fills a map of the sets of taxa (key) presenting the different states (value) for the given feature.
906 * @param statesDone the list of states already done for this feature
907 * @param states2 the element from which the states are extracted
908 * @param feature the feature corresponding to the CategoricalData
909 * @param taxaCovered the base of taxa considered
910 * @param featureStatesFilter
913 private Map
<Set
<KeyTaxon
>,List
<DefinedTermBase
<?
>>> determineCategoricalStates(
914 Set
<DefinedTermBase
<?
>> states
, Feature feature
, Set
<KeyTaxon
> taxaCovered
, Set
<DefinedTermBase
<?
>> filter
){
916 Map
<Set
<KeyTaxon
>, List
<DefinedTermBase
<?
>>> childrenStatesMap
= new HashMap
<>();
918 List
<DefinedTermBase
<?
>> statesDone
= new ArrayList
<>(); // the list of states already used
920 for (DefinedTermBase
<?
> state
: states
){ // for each state
921 if (filter
!= null && !filter
.contains(state
)){
924 statesDone
.add(state
);
925 Set
<KeyTaxon
> newTaxaCovered
= taxaByFeatureState(feature
, state
, taxaCovered
); //gets which taxa present this state
926 List
<DefinedTermBase
<?
>> statesOfTaxa
= childrenStatesMap
.get(newTaxaCovered
);
927 if (statesOfTaxa
== null) { // if no states are associated to these taxa, create a new list
928 statesOfTaxa
= new ArrayList
<>();
929 childrenStatesMap
.put(newTaxaCovered
, statesOfTaxa
);
931 statesOfTaxa
.add(state
); // then add the state to the list
933 return childrenStatesMap
;
938 * Returns the list of taxa from previously covered taxa, which have the state featureState for the given feature
940 private Set
<KeyTaxon
> taxaByFeatureState(Feature feature
, DefinedTermBase
<?
> featureState
, Set
<KeyTaxon
> taxaCovered
){
941 Set
<KeyTaxon
> newCoveredTaxa
= new HashSet
<>();
942 for (KeyTaxon td
: taxaCovered
){
943 for (CategoricalData cd
: td
.getCategoricalData(feature
)){
944 List
<StateData
> stateDatas
= cd
.getStateData();
945 for (StateData sd
: stateDatas
) {
946 if (sd
.getState().equals(featureState
)){
947 newCoveredTaxa
.add(td
);
952 return newCoveredTaxa
;
956 * This function returns the feature with the highest score. However, if several features have the same score
957 * the one which leads to less options is chosen (this way, the key is easier to read).
959 private Feature
lessStatesWinner(Map
<Feature
,Float
> scores
, Set
<KeyTaxon
> taxaCovered
){
960 int nTaxa
= taxaCovered
.size();
964 float meanScore
= defaultMeanScore(nTaxa
);
965 float bestScore
= nTaxa
*nTaxa
;
966 List
<Feature
> bestFeatures
= new ArrayList
<>(); // if ever different features have the best score, they are put in this list
967 Feature bestFeature
= null;
969 for (Map
.Entry
<Feature
,Float
> pair
: scores
.entrySet()){
970 if (pair
.getValue()!=null){
971 newScore
= Math
.abs(pair
.getValue()-meanScore
);// the best score is the closest to the score (meanScore here)
972 // a feature would have if it divided the taxa in two equal parts
973 if (newScore
< bestScore
){
974 bestFeatures
.clear();
975 bestFeatures
.add(pair
.getKey());
976 bestScore
= newScore
;
977 }else if (newScore
==bestScore
){
978 bestFeatures
.add(pair
.getKey());
982 if (bestFeatures
.size()==1) { // if there is only one feature with the best score, pick it
983 return bestFeatures
.get(0);
985 else { // else choose the one with less states
987 int numberOfDifferentStates
=-1;
988 for (Feature feature
: bestFeatures
){
989 if (feature
.isSupportsCategoricalData()){
990 Set
<DefinedTermBase
<?
>> differentStates
= new HashSet
<>();
991 for (KeyTaxon taxon
: taxaCovered
){
992 Set
<CategoricalData
> cds
= taxon
.getCategoricalData(feature
);
993 Set
<StateData
> allStateData
= getStateData(cds
);
994 for (StateData sd
: allStateData
) {
995 differentStates
.add(sd
.getState());
998 numberOfDifferentStates
=differentStates
.size();
999 }else if (feature
.isSupportsQuantitativeData()){
1000 numberOfDifferentStates
=2;
1002 if (lessStates
== -1 || numberOfDifferentStates
< lessStates
){
1003 lessStates
= numberOfDifferentStates
;
1004 bestFeature
= feature
;
1012 * This function calculates the mean score, i.e. the score a feature dividing the taxa in two equal parts
1015 private float defaultMeanScore(int nTaxa
){
1018 for (i
=1;i
<nTaxa
;i
++){
1019 score
= score
+ Math
.round(i
+1/2);
1025 * This function fills the map of features (keys) with their respecting scores (values)
1027 private Map
<Feature
,Float
> featureScores(List
<Feature
> featuresLeft
, Set
<KeyTaxon
> coveredTaxa
,
1028 Map
<Feature
,BigDecimal
> quantitativeFeaturesThresholds
, Map
<Feature
, Set
<DefinedTermBase
<?
>>> featureStatesFilter
){
1030 Map
<Feature
,Float
> scoreMap
= new HashMap
<>();
1031 for (Feature feature
: featuresLeft
){
1032 if (feature
.isSupportsCategoricalData()) {
1033 scoreMap
.put(feature
, categoricalFeatureScore(feature
, coveredTaxa
, featureStatesFilter
.get(feature
)));
1035 if (feature
.isSupportsQuantitativeData()){
1036 scoreMap
.put(feature
, quantitativeFeatureScore(feature
, coveredTaxa
, quantitativeFeaturesThresholds
));
1043 * Since Quantitative features do not have states, unlike Categorical ones, this function determines which taxa,
1044 * for a given quantitative feature, present either a lower or higher value than a given threshold.
1045 * It returns two Sets of {@link KeyTaxon}, one with the taxa under this threshold (taxaBefore) and another one
1046 * with the taxa over (taxaAfter).
1048 private List
<Set
<KeyTaxon
>> determineQuantitativeStates (BigDecimal threshold
, Feature feature
,
1049 Set
<KeyTaxon
> taxa
, StringBuilder unit
){
1051 List
<Set
<KeyTaxon
>> list
= new ArrayList
<>();
1052 Set
<KeyTaxon
> taxaBefore
= new HashSet
<>();
1053 Set
<KeyTaxon
> taxaAfter
= new HashSet
<>();
1054 list
.add(taxaBefore
);
1055 list
.add(taxaAfter
);
1056 for (KeyTaxon td
: taxa
){
1057 for (QuantitativeData qd
: td
.getQuantitativeData(feature
)){
1058 if (unit
.toString().equals("") && qd
.getUnit()!=null && qd
.getUnit().getLabel()!=null){
1059 unit
.append(" " + qd
.getUnit().getLabel());
1061 Set
<StatisticalMeasurementValue
> values
= qd
.getStatisticalValues();
1062 for (StatisticalMeasurementValue smv
: values
){
1063 StatisticalMeasure type
= smv
.getType();
1064 //TODO DONT FORGET sample size, MEAN etc
1065 if (type
.isMax() || type
.isTypicalUpperBoundary() || type
.isAverage() || type
.isExactValue()) {
1066 if (smv
.getValue().compareTo(threshold
) > 0){
1070 if (type
.isMin() || type
.isTypicalLowerBoundary() || type
.isAverage() || type
.isExactValue()) {
1071 if (smv
.getValue().compareTo(threshold
) <= 0){
1082 * This function returns the score of a quantitative feature.
1084 private float quantitativeFeatureScore(Feature feature
, Set
<KeyTaxon
> coveredTaxa
,
1085 Map
<Feature
,BigDecimal
> quantitativeFeaturesThresholds
){
1087 List
<BigDecimal
> allValues
= new ArrayList
<>();
1088 for (KeyTaxon td
: coveredTaxa
){
1089 for (QuantitativeData qd
: td
.getQuantitativeData(feature
)){
1090 computeAllValues(allValues
, qd
);
1094 BigDecimal threshold
= BigDecimal
.ZERO
;
1095 BigDecimal bestThreshold
= BigDecimal
.ZERO
;
1096 int difference
= allValues
.size();
1097 int differenceMin
= difference
;
1098 int bestTaxaBefore
= 0;
1099 int bestTaxaAfter
= 0;
1100 for (i
=0; i
<allValues
.size()/2; i
++) {
1101 threshold
= allValues
.get(i
*2+1);
1104 for (j
=0;j
<allValues
.size()/2;j
++) {
1105 if (allValues
.get(j
*2+1).compareTo(threshold
) <= 0){
1108 if (allValues
.get(j
*2).compareTo(threshold
) > 0){
1112 difference
= Math
.abs(taxaBefore
-taxaAfter
);
1113 if (difference
< differenceMin
){
1114 differenceMin
= difference
;
1115 bestThreshold
= threshold
;
1116 bestTaxaBefore
= taxaBefore
;
1117 bestTaxaAfter
= taxaAfter
;
1120 quantitativeFeaturesThresholds
.put(feature
, bestThreshold
);
1121 int defaultQuantitativeScore
=0;
1122 for (i
=0; i
<bestTaxaBefore
; i
++) {
1123 defaultQuantitativeScore
+= bestTaxaAfter
;
1125 return defaultQuantitativeScore
;
1128 private void computeAllValues(List
<BigDecimal
> allValues
, QuantitativeData qd
) {
1129 Set
<StatisticalMeasurementValue
> values
= qd
.getStatisticalValues();
1130 BigDecimal lowerboundary
= null;
1131 BigDecimal upperboundary
= null;
1132 for (StatisticalMeasurementValue smv
: values
){
1133 StatisticalMeasure type
= smv
.getType();
1134 BigDecimal value
= smv
.getValue();
1135 // DONT FORGET sample size, MEAN etc
1136 if(type
.isMin() || type
.isTypicalLowerBoundary() || type
.isAverage() || type
.isExactValue()){
1137 if (lowerboundary
== null){
1138 lowerboundary
= value
;
1140 lowerboundary
= lowerboundary
.min(value
);
1143 if(type
.isMax() || type
.isTypicalUpperBoundary() || type
.isAverage() || type
.isExactValue()){
1144 if (upperboundary
== null){
1145 upperboundary
= value
;
1147 upperboundary
= upperboundary
.max(value
);
1151 // if (type.isMax()) {
1152 // upperboundary = smv.getValue();
1153 // upperboundarypresent=true;
1154 // }else if (type.equals(StatisticalMeasure.MIN())) {
1155 // lowerboundary = smv.getValue();
1156 // lowerboundarypresent=true;
1157 // }else if (type.equals(StatisticalMeasure.TYPICAL_UPPER_BOUNDARY()) && upperboundarypresent==false) {
1158 // upperboundary = smv.getValue();
1159 // upperboundarypresent=true;
1160 // }else if (type.equals(StatisticalMeasure.TYPICAL_LOWER_BOUNDARY()) && lowerboundarypresent==false) {
1161 // lowerboundary = smv.getValue();
1162 // lowerboundarypresent=true;
1163 // }else if (type.equals(StatisticalMeasure.AVERAGE()) && upperboundarypresent==false && lowerboundarypresent==false) {
1164 // lowerboundary = smv.getValue();
1165 // upperboundary = lowerboundary;
1166 // lowerboundarypresent=true;
1167 // upperboundarypresent=true;
1170 if (lowerboundary
!= null && upperboundary
!= null) {
1171 allValues
.add(lowerboundary
);
1172 allValues
.add(upperboundary
);
1173 }else if(lowerboundary
!= null || upperboundary
!= null){
1174 logger
.warn("Only one of upper or lower boundary is defined. Statistical measurement value not used.");
1179 * This function returns the score of a categorical feature
1180 * by comparing each taxon with each other. If the feature
1181 * discriminates a single pair of taxa the score is increased.
1183 private float categoricalFeatureScore(Feature feature
, Set
<KeyTaxon
> coveredTaxa
, Set
<DefinedTermBase
<?
>> filter
){
1187 KeyTaxon
[] coveredTaxaArray
= coveredTaxa
.toArray(new KeyTaxon
[coveredTaxa
.size()]); // I did not figure a better way to do this
1188 for (i
=0 ; i
<coveredTaxaArray
.length
; i
++){
1189 Set
<CategoricalData
> cd1
= coveredTaxaArray
[i
].getCategoricalData(feature
);
1190 for (j
=i
+1 ; j
< coveredTaxaArray
.length
; j
++){
1191 Set
<CategoricalData
> cd2
= coveredTaxaArray
[j
].getCategoricalData(feature
);
1192 power
= defaultCategoricalPower(cd1
, cd2
, filter
);
1193 score
= score
+ power
;
1200 * This recursive function fills the maps of dependencies by reading the tree containing the dependencies.
1202 private void createDependencies(TermNode
<Feature
> node
){
1204 //the featureDependencies handling was originally in defaultCategoricalPower(cat, cat)
1205 //needs to be checked if it is OK to handle them here
1206 if (node
.isDependent()){
1207 Feature parentFeature
= node
.getParent().getTerm();
1208 if (!featureDependencies
.containsKey(parentFeature
)){
1209 featureDependencies
.put(parentFeature
, new HashSet
<>());
1211 for (FeatureState featureState
: node
.getOnlyApplicableIf()){
1212 if (!oAifDependencies
.containsKey(featureState
)) {
1213 oAifDependencies
.put(featureState
, new HashSet
<>());
1215 oAifDependencies
.get(featureState
).add(node
.getTerm());
1216 //TODO: we only guess that it is the state of the parent feature here
1217 //needs to be improved
1218 featureDependencies
.get(node
.getParent().getTerm()).add(node
.getTerm());
1220 for (FeatureState featureState
: node
.getInapplicableIf()){
1221 if (!iAifDependencies
.containsKey(featureState
)) {
1222 iAifDependencies
.put(featureState
, new HashSet
<>());
1224 iAifDependencies
.get(featureState
).add(node
.getTerm());
1225 //TODO: we only guess that it is the state of the parent feature here
1226 //needs to be improved
1227 featureDependencies
.get(node
.getParent().getTerm()).add(node
.getTerm());
1231 for (TermNode
<Feature
> fn
: node
.getChildNodes()){
1232 createDependencies(fn
);
1234 // System.out.println(featureDependencies);
1238 * This function fills the exclusions map.
1240 private float computeExclusions(Feature feature
, Set
<KeyTaxon
> coveredTaxa
, Map
<DefinedTermBase
<?
>,Set
<DefinedTermBase
<?
>>> exclusions
, Set
<DefinedTermBase
<?
>> filter
){
1241 //unclear what the score is fore here
1244 KeyTaxon
[] fixedOrderTaxa
= coveredTaxa
.toArray(new KeyTaxon
[coveredTaxa
.size()]); // I did not figure a better way to do this
1245 for (int i
=0 ; i
<fixedOrderTaxa
.length
; i
++){
1246 Set
<CategoricalData
> cd1
= fixedOrderTaxa
[i
].getCategoricalData(feature
);
1248 for (int j
=i
+1 ; j
< fixedOrderTaxa
.length
; j
++){
1249 Set
<CategoricalData
> cd2
= fixedOrderTaxa
[j
].getCategoricalData(feature
);
1251 // System.out.println(deb1 + "; " +deb2);
1252 power
= defaultCategoricalPower(cd1
, cd2
, filter
);
1253 score
= score
+ power
;
1254 if (power
>= 1.0){ // if there is no state in common between deb1 and deb2
1256 for (StateData statedata1
: getStateData(cd1
)){
1257 DefinedTermBase
<?
> state1
= statedata1
.getState();
1258 if (!exclusions
.containsKey(state1
)){
1259 exclusions
.put(state1
, new HashSet
<>());
1261 for (StateData statedata2
: getStateData(cd2
)){
1262 DefinedTermBase
<?
> state2
= statedata2
.getState();
1263 if (!exclusions
.containsKey(state2
)){
1264 exclusions
.put(state2
, new HashSet
<>());
1266 exclusions
.get(state1
).add(state2
);
1267 exclusions
.get(state2
).add(state1
);
1276 private Set
<StateData
> getStateData(Set
<CategoricalData
> cds
) {
1277 Set
<StateData
> result
= new HashSet
<>();
1278 for (CategoricalData cd
: cds
){
1279 result
.addAll(cd
.getStateData());
1285 * Returns the score of a categorical feature.
1287 private float defaultCategoricalPower(Set
<CategoricalData
> cd1
, Set
<CategoricalData
> cd2
, Set
<DefinedTermBase
<?
>> filter
){
1288 if (cd1
== null || cd2
== null ||cd1
.isEmpty() || cd2
.isEmpty()){
1292 //FIXME see defaultCategoricalPower_old for additional code on dependencies
1293 //which has been removed here for now but might be important
1294 //Now I moved it to #createDependencies. Therefore the below is maybe not needed
1295 //anymore but superfluent.
1296 //But the implementation at createDependencies is not fully correct yet
1297 //so I keep it here for now.
1299 for (CategoricalData cd
: cd1
){
1300 if (!featureDependencies
.containsKey(cd
.getFeature())){
1301 featureDependencies
.put(cd
.getFeature(), new HashSet
<>());
1303 for (DefinedTermBase
<?
> state
: getStates(cd
, filter
)){
1304 if (iAifDependencies
.get(state
)!=null) {
1305 featureDependencies
.get(cd
.getFeature()).addAll(iAifDependencies
.get(state
));
1307 if (oAifDependencies
.get(state
)!=null) {
1308 featureDependencies
.get(cd
.getFeature()).addAll(oAifDependencies
.get(state
));
1313 //get all states of both categorical data
1314 Set
<DefinedTermBase
<?
>> states
= getStates(cd1
, cd2
, filter
);
1315 if (states
.size() == 0){
1319 int nDiscriminative
= 0;
1320 for (DefinedTermBase
<?
> state
: states
){
1321 boolean hasState1
= hasState(state
, cd1
);
1322 boolean hasState2
= hasState(state
, cd2
);
1323 //if only 1 has the state than the state is discriminative
1324 if (! (hasState1
&&hasState2
)) {
1328 float result
= (float)nDiscriminative
/states
.size();
1332 private boolean hasState(DefinedTermBase
<?
> state
, Set
<CategoricalData
> cds
) {
1333 boolean result
= false;
1334 for (CategoricalData cd
: cds
){
1335 for (StateData stateData
:cd
.getStateData()){
1336 result
|= state
.equals(stateData
.getState());
1342 private Set
<DefinedTermBase
<?
>> getStates(Set
<CategoricalData
> cdset1
, Set
<CategoricalData
> cdset2
, Set
<DefinedTermBase
<?
>> filter
) {
1343 Set
<DefinedTermBase
<?
>> result
= new HashSet
<>();
1344 result
.addAll(getStates(cdset1
, filter
));
1345 result
.addAll(getStates(cdset2
, filter
));
1349 private Set
<DefinedTermBase
<?
>> getStates(Set
<CategoricalData
> cdset
, Set
<DefinedTermBase
<?
>> filter
) {
1350 Set
<DefinedTermBase
<?
>> result
= new HashSet
<>();
1351 for (CategoricalData cd
: cdset
){
1352 result
.addAll(getStates(cd
, filter
));
1357 private Set
<DefinedTermBase
<?
>> getStates(CategoricalData cd
, Set
<DefinedTermBase
<?
>> filter
) {
1358 //TODO handle modifier
1359 Set
<DefinedTermBase
<?
>> result
= new HashSet
<>();
1360 List
<StateData
> states
= cd
.getStateData();
1361 for (StateData stateData
:states
){
1362 DefinedTermBase
<?
> state
= stateData
.getState();
1363 if (filter
!= null && !filter
.contains(state
)){
1366 result
.add(stateData
.getState());
1371 //TODO keep as long as the handling of featureDependencies is not yet checked or handled in
1372 //the new method defaultCategoricalPower()
1373 private float defaultCategoricalPower_old(CategoricalData deb1
, CategoricalData deb2
){
1374 List
<StateData
> states1
= deb1
.getStateData();
1375 List
<StateData
> states2
= deb2
.getStateData();
1376 boolean bool
= false;
1377 Iterator
<StateData
> stateData1Iterator
= states1
.iterator() ;
1378 // while (!bool && stateData1Iterator.hasNext()) {
1379 // Iterator<StateData> stateData2Iterator = states2.iterator() ;
1380 // StateData stateData1 = stateData1Iterator.next();
1381 // while (!bool && stateData2Iterator.hasNext()) {
1382 // bool = stateData1.getState().equals(stateData2Iterator.next().getState()); // checks if the states are the same
1385 // one point each time two taxa can be discriminated for a given feature
1387 boolean checkFeature
= false;
1389 if (!featureDependencies
.containsKey(deb1
.getFeature())){
1390 featureDependencies
.put(deb1
.getFeature(), new HashSet
<>());
1391 checkFeature
= true;
1394 while (stateData1Iterator
.hasNext()) {
1395 Iterator
<StateData
> stateData2Iterator
= states2
.iterator() ;
1396 StateData stateData1
= stateData1Iterator
.next();
1397 DefinedTermBase
<?
> state1
= stateData1
.getState();
1399 if (iAifDependencies
.get(state1
)!=null) {
1400 featureDependencies
.get(deb1
.getFeature()).addAll(iAifDependencies
.get(state1
));
1402 if (oAifDependencies
.get(state1
)!=null) {
1403 featureDependencies
.get(deb1
.getFeature()).addAll(oAifDependencies
.get(state1
));
1406 while (stateData2Iterator
.hasNext()) {
1407 StateData stateData2
= stateData2Iterator
.next();
1408 DefinedTermBase
<?
> state2
= stateData2
.getState();
1409 bool
= bool
|| state1
.equals(state2
); // checks if the states are the same