cdmlib-model/src/main/java/eu/etaxonomy/cdm/strategy/generate/PolytomousKeyGenerator.java

   1 package eu.etaxonomy.cdm.strategy.generate;
   2
   3 import java.math.BigDecimal;
   4 import java.util.ArrayList;
   5 import java.util.Arrays;
   6 import java.util.Collection;
   7 import java.util.Comparator;
   8 import java.util.HashMap;
   9 import java.util.HashSet;
  10 import java.util.Iterator;
  11 import java.util.List;
  12 import java.util.Map;
  13 import java.util.Map.Entry;
  14 import java.util.Set;
  15 import java.util.UUID;
  16
  17 import org.apache.logging.log4j.LogManager;
  18 import org.apache.logging.log4j.Logger;
  19
  20 import eu.etaxonomy.cdm.common.BigDecimalUtil;
  21 import eu.etaxonomy.cdm.model.common.CdmBase;
  22 import eu.etaxonomy.cdm.model.common.Language;
  23 import eu.etaxonomy.cdm.model.description.CategoricalData;
  24 import eu.etaxonomy.cdm.model.description.DescriptionBase;
  25 import eu.etaxonomy.cdm.model.description.DescriptionElementBase;
  26 import eu.etaxonomy.cdm.model.description.Feature;
  27 import eu.etaxonomy.cdm.model.description.FeatureState;
  28 import eu.etaxonomy.cdm.model.description.KeyStatement;
  29 import eu.etaxonomy.cdm.model.description.PolytomousKey;
  30 import eu.etaxonomy.cdm.model.description.PolytomousKeyNode;
  31 import eu.etaxonomy.cdm.model.description.QuantitativeData;
  32 import eu.etaxonomy.cdm.model.description.SpecimenDescription;
  33 import eu.etaxonomy.cdm.model.description.StateData;
  34 import eu.etaxonomy.cdm.model.description.StatisticalMeasure;
  35 import eu.etaxonomy.cdm.model.description.StatisticalMeasurementValue;
  36 import eu.etaxonomy.cdm.model.description.TaxonDescription;
  37 import eu.etaxonomy.cdm.model.occurrence.SpecimenOrObservationBase;
  38 import eu.etaxonomy.cdm.model.taxon.Taxon;
  39 import eu.etaxonomy.cdm.model.taxon.TaxonNode;
  40 import eu.etaxonomy.cdm.model.term.DefinedTermBase;
  41 import eu.etaxonomy.cdm.model.term.TermNode;
  42
  43 /**
  44  * @author m.venin
  45  * @author a.mueller
  46  */
  47 public class PolytomousKeyGenerator {
  48
  49     private static final Logger logger = LogManager.getLogger(PolytomousKeyGenerator.class);
  50
  51     /**
  52      * Strings used for generating the statements of the key.
  53      */
  54     private static final String before="<";
  55     private static final String after=">";
  56     private static final String separator = " or ";
  57
  58
  59     private PolytomousKeyGeneratorConfigurator config;
  60
  61         private Map<FeatureState,Set<Feature>> iAifDependencies = new HashMap<>(); // map of a set of Features (value) inapplicables if a State (key) is present
  62         private Map<FeatureState,Set<Feature>> oAifDependencies = new HashMap<>(); // map of a set of Features (value) only applicables if a State (key) is present
  63         private Map<Feature,Set<Feature>> featureDependencies = new HashMap<>(); // map of all the sets of features (values) which have dependencies with states of other features (keys)
  64
  65         private class KeyTaxon{
  66             private UUID uuid;
  67             private Taxon taxon;
  68             private SpecimenOrObservationBase<?> specimen;
  69             private Map<Feature,Set<CategoricalData>> categoricalData = new HashMap<>();
  70         private Map<Feature,Set<QuantitativeData>> quantitativeData = new HashMap<>();
  71         private Set<KeyTaxon> children = new HashSet<>();
  72
  73         private Set<CategoricalData> getCategoricalData(Feature feature){
  74             return categoricalData.get(feature) == null? new HashSet<>():categoricalData.get(feature);
  75         }
  76         private Set<QuantitativeData> getQuantitativeData(Feature feature){
  77             return quantitativeData.get(feature) == null? new HashSet<>():quantitativeData.get(feature);
  78         }
  79
  80         private void addDescription(DescriptionBase<?> db) {
  81             for (DescriptionElementBase deb : db.getElements()){
  82                 Feature feature = deb.getFeature();
  83                 if (deb.isCharacterData()){
  84                     if (deb.isInstanceOf(CategoricalData.class)){
  85                         CategoricalData cd = CdmBase.deproxy(deb, CategoricalData.class);
  86                         if (categoricalData.get(feature)== null){
  87                             categoricalData.put(feature, new HashSet<>());
  88                         }
  89                         categoricalData.get(feature).add(cd);
  90                     }else if (deb.isInstanceOf(QuantitativeData.class)){
  91                         QuantitativeData qd = CdmBase.deproxy(deb, QuantitativeData.class);
  92                         if (quantitativeData.get(feature)== null){
  93                             quantitativeData.put(feature, new HashSet<>());
  94                         }
  95                         quantitativeData.get(feature).add(qd);
  96                     }
  97                 }
  98             }
  99         }
 100
 101         @Override
 102         public String toString() {
 103             return taxon.getTitleCache(); // + "KeyTaxon [uuid=" + uuid + (taxon != null? ", " + taxon.getTitleCache():"") + "]";  // uuid=" + uuid +
 104         }
 105         }
 106
 107 // *************************** METHODS ***************************************/
 108
 109     /**
 110      * Creates the key
 111      */
 112     public PolytomousKey invoke(PolytomousKeyGeneratorConfigurator config){
 113         if (config == null){
 114             throw new NullPointerException("PolytomousKeyGeneratorConfigurator must not be null");
 115         }
 116         this.config = config;
 117         if (this.config.isUseDependencies()){
 118             createDependencies(config.getDependenciesTree().getRoot());
 119         }
 120         PolytomousKey polytomousKey = PolytomousKey.NewInstance();
 121         PolytomousKeyNode root = polytomousKey.getRoot();
 122         @SuppressWarnings("unchecked")
 123         Set<KeyTaxon> taxaCovered = makeKeyTaxa((Set)config.getTaxonDescriptions());
 124         taxaCovered = replaceSingleRoot(taxaCovered);
 125
 126         //filter if a feature is available only for certain states in this branche
 127         Map<Feature, Set<DefinedTermBase<?>>> featureStatesFilter = new HashMap<>();
 128         //TODO what if size(taxaCovered) <= 1, is this covered by algo? Write test to check
 129         buildBranches(root, config.getFeatures(), taxaCovered, featureStatesFilter);
 130         return polytomousKey;
 131     }
 132
 133     /**
 134      * If the root taxon is a single taxon but has children
 135      * it will be replaced by it's children.
 136      */
 137     private Set<KeyTaxon> replaceSingleRoot(Set<KeyTaxon> taxaCovered) {
 138         if (this.config.isUseTaxonHierarchy() && taxaCovered.size()==1
 139                 && !taxaCovered.iterator().next().children.isEmpty()){
 140             return replaceSingleRoot(taxaCovered.iterator().next().children);
 141         }else{
 142             return taxaCovered;
 143         }
 144     }
 145
 146     private Set<KeyTaxon> makeKeyTaxa(Set<DescriptionBase<?>> descriptions) {
 147
 148         Map<UUID,KeyTaxon> taxonMap = new HashMap<>();
 149         for (DescriptionBase<?> db : descriptions){
 150             KeyTaxon taxon = new KeyTaxon();
 151             if (db.isInstanceOf(TaxonDescription.class)){
 152                 TaxonDescription td = CdmBase.deproxy(db, TaxonDescription.class);
 153                 taxon.uuid = td.getTaxon().getUuid();
 154                 taxon.taxon = td.getTaxon();
 155             }else if (db.isInstanceOf(SpecimenDescription.class)){
 156                 SpecimenDescription sd = CdmBase.deproxy(db, SpecimenDescription.class);
 157                 taxon.uuid = sd.getDescribedSpecimenOrObservation().getUuid();
 158                 taxon.specimen = sd.getDescribedSpecimenOrObservation();
 159             }else{
 160                 throw new RuntimeException("Unhandled entity type " + db.getClass().getName());
 161             }
 162             if (taxonMap.get(taxon.uuid)!= null){
 163                 taxon = taxonMap.get(taxon.uuid);
 164             }else{
 165                 taxonMap.put(taxon.uuid, taxon);
 166             }
 167             taxon.addDescription(db);
 168         }
 169
 170         createTaxonHierarchy(taxonMap);
 171
 172         return new HashSet<>(taxonMap.values());
 173     }
 174
 175     private void createTaxonHierarchy(Map<UUID, KeyTaxon> taxonMap) {
 176         if(config.isUseTaxonHierarchy()==false){
 177             return;
 178         }
 179         Set<KeyTaxon> taxaToTest = new HashSet<>(taxonMap.values());
 180         for(KeyTaxon taxon:taxaToTest){
 181             KeyTaxon parent = getBestTaxonParent(taxon, taxaToTest);
 182             if (parent != null){
 183                 parent.children.add(taxon);
 184                 taxonMap.remove(taxon.uuid);
 185             }
 186         }
 187     }
 188
 189     private KeyTaxon getBestTaxonParent(KeyTaxon taxon, Collection<KeyTaxon> values) {
 190         KeyTaxon parent = null;
 191         String parentIndex = "";
 192         String myTreeIndex = getTaxonTreeIndex(taxon);
 193         if (myTreeIndex != null) {
 194             for (KeyTaxon candidate:values){
 195                 String candidateIndex = getTaxonTreeIndex(candidate);
 196                 if (candidateIndex == null || myTreeIndex.equals(candidateIndex)){
 197                     continue;
 198                 }
 199                 if (myTreeIndex.startsWith(candidateIndex)){
 200                     if (candidateIndex.length()> parentIndex.length()){
 201                         parent = candidate;
 202                         parentIndex = candidateIndex;
 203                     }
 204                 }
 205             }
 206         }
 207         return parent;
 208     }
 209
 210     private String getTaxonTreeIndex(KeyTaxon taxon) {
 211         if (taxon.taxon.getTaxonNodes().isEmpty()){
 212             return null;
 213         }
 214         //TODOO size>1  or classification check
 215         TaxonNode node = taxon.taxon.getTaxonNodes().iterator().next();
 216         String treeIndex = node.treeIndex();
 217         if (treeIndex == null){
 218             //unpersisted, this should only happen during test, create provisional treeindex
 219             treeIndex = getParentTreeIndex(node) + node.getUuid().toString() + "#" ;
 220         }
 221         return treeIndex;
 222     }
 223
 224     private String getParentTreeIndex(TaxonNode node) {
 225         TaxonNode parent = node.getParent();
 226         if (parent == null ){
 227             return "#";
 228         }else{
 229             return getParentTreeIndex(parent) + parent.getUuid().toString() + "#" ;
 230         }
 231     }
 232
 233     /**
 234          * Recursive function that builds the branches of the identification key
 235          *
 236          * @param parent the node considered
 237          * @param featuresLeft List of features that can be used at this point
 238          * @param taxaCovered the taxa left at this point (i.e. that verify the description corresponding to the path leading to this node)
 239      * @param featureStatesFilter
 240          */
 241         private void buildBranches(PolytomousKeyNode parent, List<Feature> featuresLeft, Set<KeyTaxon> taxaCovered,
 242                 Map<Feature, Set<DefinedTermBase<?>>> featureStatesFilter){
 243
 244             //handle all branches taxa =>
 245         Set<KeyTaxon> allBranchesTaxa = getAllBranchesTaxa(featuresLeft, taxaCovered, featureStatesFilter);
 246         if (allBranchesTaxa.size()>0){
 247             if (allBranchesTaxa.size()>1){
 248             //TODO test if this case in handled and displayed correctly
 249                 logger.warn(">1 final taxa in inner node");
 250             }
 251             taxaCovered.removeAll(allBranchesTaxa);
 252             if(taxaCovered.size() != 1){
 253                 handleLeaf(parent, allBranchesTaxa);
 254             }else{
 255                 //if only 1 is left it is better to handle all remaining in sub-branch to make difference clearer
 256                 taxaCovered.addAll(allBranchesTaxa);
 257             }
 258         }
 259
 260         //start real branching
 261             if (taxaCovered.size()<=1){
 262                     //do nothing
 263                 logger.warn("Only 1 or no taxon covered. This should currently only be possible on top level and is not yet handled. ");
 264                 }else {
 265                         // this map stores the thresholds giving the best dichotomy of taxa for the corresponding feature supporting quantitative data
 266                         Map<Feature,BigDecimal> quantitativeFeaturesThresholds = new HashMap<>();
 267                         // the scores of the different features are calculated, the thresholds in the same time
 268                         if (config.isDebug()){
 269                             System.out.println("Feature left: " + featuresLeft + ", taxa: " + taxaCovered);
 270                         }
 271                         Feature winnerFeature = computeScores(featuresLeft, taxaCovered, quantitativeFeaturesThresholds, featureStatesFilter);
 272
 273                         if(winnerFeature != null){
 274                             /************** either the feature supports quantitative data... **************/
 275                             // NB: in this version, "quantitative features" are dealt with in a dichotomous way
 276                             if (winnerFeature.isSupportsQuantitativeData()) {
 277                                 featuresLeft.add(winnerFeature);  //as quantitative data are currently only split in 2 parts there might be further splits possible and needed
 278                                 handleQuantitativeData(parent, featuresLeft, taxaCovered,
 279                                         quantitativeFeaturesThresholds, winnerFeature, featureStatesFilter);
 280                             }
 281                             /************** ...or it supports categorical data. **************/
 282                             else  if (winnerFeature.isSupportsCategoricalData()) {
 283                                 handleCategorialFeature(parent, featuresLeft, taxaCovered,
 284                                         winnerFeature, featureStatesFilter);
 285                             }else{
 286                         throw new RuntimeException("Winner feature does not support character data.");
 287                             }
 288                             // the winner features are put back to the features left once the branch is done
 289                             if (!featuresLeft.contains(winnerFeature)){  //TODO why is a list and not a set?
 290                                 featuresLeft.add(winnerFeature);
 291                             }
 292                         }else if (featuresLeft.isEmpty()){
 293                             handleLeaf(parent, taxaCovered);
 294                         }else{
 295                             throw new RuntimeException("No winner feature but features left to handle should not happen.");
 296                         }
 297                 }
 298         return;
 299         }
 300
 301     private Set<KeyTaxon> getAllBranchesTaxa(List<Feature> featuresLeft, Set<KeyTaxon> taxaCovered,
 302             Map<Feature, Set<eu.etaxonomy.cdm.model.term.DefinedTermBase<?>>> featureStatesFilter) {
 303
 304         Set<KeyTaxon> candidates = new HashSet<>(taxaCovered);
 305         List<Feature> dependendFeatures = new ArrayList<>();
 306         for (Feature feature : featuresLeft){
 307             if(feature.isSupportsCategoricalData()){
 308                 Set<DefinedTermBase<?>> allStates = getAllStates(feature, taxaCovered, featureStatesFilter.get(feature));
 309                 Iterator<KeyTaxon> it = candidates.iterator();
 310                 while (it.hasNext()){
 311                     Set<KeyTaxon> taxonSet = new HashSet<>(Arrays.asList(it.next()));
 312                     Set<DefinedTermBase<?>> taxonStates = getAllStates(feature, taxonSet, featureStatesFilter.get(feature));
 313                     if(allStates.size() > taxonStates.size()){
 314                         it.remove();
 315                     }
 316                 }
 317                 if(candidates.isEmpty()){
 318                     break;
 319                 }else{
 320                     addDependentFeatures(dependendFeatures, feature, new HashSet<>(), new ArrayList<>(allStates));
 321                 }
 322             }else if (feature.isSupportsQuantitativeData()){
 323                 Iterator<KeyTaxon> it = candidates.iterator();
 324                 while (it.hasNext()){
 325                     BigDecimal min = BigDecimalUtil.MAX_BIGDECIMAL;
 326                     BigDecimal max = BigDecimalUtil.MIN_BIGDECIMAL;
 327                     Set<QuantitativeData> qds = it.next().quantitativeData.get(feature);
 328                     qds = qds == null? new HashSet<>(): qds;
 329                     for (QuantitativeData qd : qds){
 330                         BigDecimal qdMin = qd.getOverallMin();
 331                         if(qdMin != null){
 332                             min = min.min(qdMin);
 333                         }
 334                         BigDecimal qdMax = qd.getOverallMax();
 335                         if(qdMax != null){
 336                             max = max.max(qdMax);
 337                         }
 338                     }
 339                     boolean staysCandidate = true;
 340                     for(KeyTaxon taxon : taxaCovered){
 341                         Set<QuantitativeData> tqds = taxon.quantitativeData.get(feature);
 342                         tqds = tqds == null? new HashSet<>(): tqds;
 343                         for (QuantitativeData qd : tqds){
 344                             staysCandidate &= qd.getOverallMin() == null || qd.getOverallMin().compareTo(min) >= 0;
 345                             staysCandidate &= qd.getOverallMax() == null || qd.getOverallMax().compareTo(max) <= 0;
 346                         }
 347                         if (!staysCandidate){
 348                             break;
 349                         }
 350                     }
 351                     if (!staysCandidate){
 352                         it.remove();
 353                     }
 354                 }
 355             }
 356         }
 357         if(config.isUseDependencies() && !dependendFeatures.isEmpty() && !candidates.isEmpty()){
 358             Set<KeyTaxon> dependetCandidates = getAllBranchesTaxa(dependendFeatures, taxaCovered, featureStatesFilter);
 359             candidates.retainAll(dependetCandidates);
 360         }
 361         return candidates;
 362     }
 363
 364     /**
 365      * Creates a leaf. It adds the taxa the parent taxon as linked taxa. Handles a
 366      * list of multiple taxa and handles "specimen taxa" (not yet fully implemented)
 367      * @return taxa which exist in ALL sub-branches and therefore can be linked on higher level
 368      */
 369         private Set<KeyTaxon> handleLeaf(PolytomousKeyNode parent, Set<KeyTaxon> taxaCovered) {
 370         KeyStatement parentStatement = parent.getStatement();
 371         for(KeyTaxon taxon: taxaCovered){
 372             if  (taxon.taxon != null){
 373                 parent.setOrAddTaxon(taxon.taxon);
 374             }else{
 375                 //FIXME handle other descriptions like specimen descriptions better
 376                 if (parentStatement!=null){
 377                     String statementString = parentStatement.getLabelText(Language.DEFAULT());
 378                     if (statementString !=null && taxon.specimen != null){
 379                         String label = statementString + " --> " + taxon.specimen.getTitleCache();
 380                         parentStatement.putLabel(Language.DEFAULT(), label);
 381                     }
 382                 }
 383             }
 384         }
 385         return taxaCovered;
 386     }
 387
 388     /**
 389      * "categorical features" may present several different states/branches,
 390      * each one of these might correspond to one child.
 391      */
 392     private void handleCategorialFeature(PolytomousKeyNode parent, List<Feature> featuresLeft,
 393             Set<KeyTaxon> taxaCovered,
 394             Feature winnerFeature,
 395             Map<Feature, Set<DefinedTermBase<?>>> featureStatesFilter) {
 396
 397         Map<Set<KeyTaxon>,Boolean> reuseWinner = new HashMap<>();
 398
 399         Set<DefinedTermBase<?>> allStates = getAllStates(winnerFeature, taxaCovered, featureStatesFilter.get(winnerFeature));
 400                 // a map is created, the key being the set of taxa that present the state(s) stored in the corresponding value
 401         // this key represents a single branch in the decision tree
 402                 Map<Set<KeyTaxon>, List<DefinedTermBase<?>>> taxonStatesMap
 403                         = determineCategoricalStates(allStates, winnerFeature, taxaCovered, featureStatesFilter.get(winnerFeature));
 404
 405                 if (taxonStatesMap.size()<=1){
 406                     if (notEmpty(featureDependencies.get(winnerFeature))){
 407                         //TODO is empty list correctly handled here? Seems to happen if incorrect data (e.g. only Max values) for quantdata exist
 408                         List<DefinedTermBase<?>> stateList = taxonStatesMap.isEmpty()? new ArrayList<>(): taxonStatesMap.values().iterator().next();
 409                         Set<Feature> featuresAdded = new HashSet<>();
 410                         addDependentFeatures(featuresLeft, winnerFeature, featuresAdded, stateList);
 411                         featuresLeft.remove(winnerFeature);
 412                         buildBranches(parent, featuresLeft, taxaCovered, featureStatesFilter);
 413                         removeAddedDependendFeatures(featuresLeft, featuresAdded);
 414                     }else{
 415                         //if only 1 branch is left we can handle this as a leaf, no matter how many taxa are left
 416                         handleLeaf(parent, taxaCovered);
 417                     }
 418                 }else {
 419                     // if the merge option is ON, branches with the same discriminative power will be merged (see Vignes & Lebbes, 1989)
 420                     if (config.isMerge()){
 421                         taxonStatesMap = handleMerge(taxaCovered, winnerFeature, reuseWinner,
 422                                 taxonStatesMap, allStates, featureStatesFilter.get(winnerFeature));
 423                     }
 424                     List<Set<KeyTaxon>> sortedKeys = sortKeys(taxonStatesMap);
 425             for (Set<KeyTaxon> newTaxaCovered : sortedKeys){
 426                         //handle each branch
 427                 handleCategoricalBranch(parent, featuresLeft,
 428                         taxaCovered.size(), winnerFeature, reuseWinner, taxonStatesMap, newTaxaCovered, featureStatesFilter);
 429             }
 430                 }
 431                 return;
 432     }
 433
 434     private Map<Set<KeyTaxon>, List<DefinedTermBase<?>>> handleMerge(Set<KeyTaxon> taxaCovered,
 435             Feature winnerFeature, Map<Set<KeyTaxon>, Boolean> reuseWinner,
 436             Map<Set<KeyTaxon>, List<DefinedTermBase<?>>> taxonStatesMap, Set<DefinedTermBase<?>> allStates, Set<DefinedTermBase<?>> filter) {
 437
 438         // creates a map between the different states of the winnerFeature and the sets of states "incompatible" with them
 439         Map<DefinedTermBase<?>,Set<DefinedTermBase<?>>> exclusions = new HashMap<>();
 440         computeExclusions(winnerFeature, taxaCovered, exclusions, filter);
 441
 442         while (!exclusions.isEmpty()){
 443                 // looks for the largest clique, i.e. the state with less exclusions
 444                 List<DefinedTermBase<?>> clique = returnBestClique(exclusions);
 445                 if(clique.containsAll(allStates)){
 446                     continue;
 447                 }
 448                 // then merges the corresponding branches
 449                 mergeBranches(clique, taxonStatesMap, reuseWinner, filter);
 450         }
 451         //during merge the keySet (set of taxa) may change, therefore they change their hashcode
 452         //and can not be used as keys in the map anymore.
 453         //Therefore we refill the map.
 454         taxonStatesMap = renewTaxonStatesMap(taxonStatesMap);
 455         return taxonStatesMap;
 456     }
 457
 458     private void handleCategoricalBranch(PolytomousKeyNode parent, List<Feature> featuresLeft,
 459             int taxaCoveredSize,
 460             Feature winnerFeature, Map<Set<KeyTaxon>, Boolean> reuseWinner,
 461             Map<Set<KeyTaxon>, List<DefinedTermBase<?>>> taxonStatesMap,
 462             Set<KeyTaxon> newTaxaCovered,
 463             Map<Feature,Set<DefinedTermBase<?>>> featureStatesFilter) {
 464
 465         //to restore old state
 466         Set<DefinedTermBase<?>> oldFilterSet = featureStatesFilter.get(winnerFeature);
 467         Set<Feature> featuresAdded = new HashSet<>();
 468
 469         boolean areTheTaxaDiscriminated = false;
 470         PolytomousKeyNode childNode = PolytomousKeyNode.NewInstance();
 471         parent.addChild(childNode);
 472
 473         List<DefinedTermBase<?>> listOfStates = taxonStatesMap.get(newTaxaCovered);
 474         if ((newTaxaCovered.size() > 0)){ //old: if the taxa are discriminated compared to those of the parent node, a child is created
 475                 areTheTaxaDiscriminated = (newTaxaCovered.size() != taxaCoveredSize);
 476
 477                 int numberOfStates = listOfStates.size()-1;
 478                 listOfStates.sort(stateComparator);
 479
 480                 if (config.isUseDependencies()){
 481                     // old: if the dependencies are considered, removes and adds the right features from/to the list of features left
 482                 // these features are stored in order to be put back again when the current branch is finished
 483
 484                     addDependentFeatures(featuresLeft, winnerFeature, featuresAdded, listOfStates);
 485                 }
 486
 487                 String statementLabel = createStatement(listOfStates, numberOfStates);
 488             KeyStatement statement = KeyStatement.NewInstance(statementLabel);
 489                 childNode.setStatement(statement);
 490                 parent.setFeature(winnerFeature);
 491
 492                 if (reuseWinner.get(newTaxaCovered)== Boolean.TRUE){
 493                     featuresLeft.add(winnerFeature);
 494                     setStatesFilter(featureStatesFilter, winnerFeature, listOfStates);
 495                 }else{
 496                     featuresLeft.remove(winnerFeature);
 497                 }
 498         }
 499
 500         boolean hasChildren = areTheTaxaDiscriminated && (newTaxaCovered.size() > 1);
 501         if (hasChildren){
 502             buildBranches(childNode, featuresLeft, newTaxaCovered, featureStatesFilter);
 503         }else{
 504             handleLeaf(childNode, newTaxaCovered);
 505             Set<KeyTaxon> taxonChildren = getTaxonChildren(newTaxaCovered);
 506             if(!taxonChildren.isEmpty()){
 507                 //TODO FIXME featuresLeft probably needs to include all features, similar for featureStatesFilter
 508                 buildBranches(childNode, featuresLeft, taxonChildren, featureStatesFilter);
 509             }
 510         }
 511
 512         //restore old state before returning to parent node
 513         removeAddedDependendFeatures(featuresLeft, featuresAdded);
 514         featureStatesFilter.put(winnerFeature, oldFilterSet);
 515
 516         return;
 517     }
 518
 519     private Set<KeyTaxon> getTaxonChildren(Set<KeyTaxon> newTaxaCovered) {
 520         Set<KeyTaxon> result = new HashSet<>();
 521         for (KeyTaxon taxon:newTaxaCovered){
 522             result.addAll(taxon.children);
 523         }
 524         return result;
 525     }
 526
 527     private void setStatesFilter(Map<Feature, Set<DefinedTermBase<?>>> filter, Feature feature,
 528             List<DefinedTermBase<?>> listOfStates) {
 529         if (filter.get(feature)==null){
 530             filter.put(feature, new HashSet<>(listOfStates));
 531         }else{
 532             Set<DefinedTermBase<?>> set = filter.get(feature);
 533             set.retainAll(listOfStates);
 534         }
 535     }
 536
 537     private void removeAddedDependendFeatures(List<Feature> featuresLeft, Set<Feature> featuresAdded) {
 538         for (Feature addedFeature : featuresAdded) {
 539             featuresLeft.remove(addedFeature);
 540         }
 541     }
 542
 543     private void addDependentFeatures(List<Feature> featuresLeft, Feature baseFeature,
 544             Set<Feature> featuresAdded, List<DefinedTermBase<?>> listOfStates) {
 545
 546         if(notEmpty(featureDependencies.get(baseFeature))){
 547             Set<Feature> newFeatureCandidates = new HashSet<>(featureDependencies.get(baseFeature));
 548             newFeatureCandidates.remove(null);
 549             for (DefinedTermBase<?> state : listOfStates) {
 550                 //in-applicable
 551                 List<Feature> inapplicableFeatures = getApplicableFeatures(baseFeature, state, iAifDependencies);
 552                 newFeatureCandidates.removeAll(inapplicableFeatures);
 553                 //only-applicable
 554                 List<Feature> onlyApplicableFeatures = getApplicableFeatures(baseFeature, state, oAifDependencies);
 555                 if (!onlyApplicableFeatures.isEmpty()){
 556                     Iterator<Feature> it = newFeatureCandidates.iterator();
 557                     while (it.hasNext()){
 558                         Feature featureCandidate = it.next();
 559                         if (!onlyApplicableFeatures.contains(featureCandidate)){
 560                             it.remove();
 561                         }
 562                     }
 563                 }
 564             }
 565             featuresLeft.addAll(newFeatureCandidates);
 566             featuresAdded.addAll(newFeatureCandidates);
 567         }
 568     }
 569
 570     private List<Feature> getApplicableFeatures(Feature feature, DefinedTermBase<?> state,
 571             Map<FeatureState, Set<Feature>> applicabilityDependencies) {
 572         List<Feature> result = new ArrayList<>();
 573         for (FeatureState featureState : applicabilityDependencies.keySet()){
 574             if(featureState.getFeature().equals(feature) && featureState.getState().equals(state)){
 575                 result.addAll(applicabilityDependencies.get(featureState));
 576             }
 577         }
 578         return result;
 579     }
 580
 581     private boolean notEmpty(Set<?> set) {
 582         return (set != null) && !set.isEmpty();
 583     }
 584
 585     private String createStatement(List<DefinedTermBase<?>> listOfStates, int numberOfStates) {
 586         StringBuilder statementLabel = new StringBuilder();
 587         for (DefinedTermBase<?> state : listOfStates) {
 588             statementLabel.append(state.getLabel());
 589             if (listOfStates.lastIndexOf(state)!=numberOfStates){
 590                 statementLabel.append(separator); // append a separator after each state except for the last one
 591             }
 592         }
 593         return statementLabel.toString();
 594     }
 595
 596     private Map<Set<KeyTaxon>, List<DefinedTermBase<?>>> renewTaxonStatesMap(Map<Set<KeyTaxon>, List<DefinedTermBase<?>>> taxonStatesMap) {
 597         Map<Set<KeyTaxon>, List<DefinedTermBase<?>>> result = new HashMap<>();
 598         for (Map.Entry<Set<KeyTaxon>, List<DefinedTermBase<?>>> entry : taxonStatesMap.entrySet()){
 599             result.put(entry.getKey(), entry.getValue());
 600         }
 601         return result;
 602     }
 603
 604     private List<Set<KeyTaxon>> sortKeys(Map<Set<KeyTaxon>, List<DefinedTermBase<?>>> taxonStatesMap) {
 605         //for now this is a dummy sorting
 606         List<Map.Entry<Set<KeyTaxon>, List<DefinedTermBase<?>>>> sortedEntries = new ArrayList<>();
 607         sortedEntries.addAll(taxonStatesMap.entrySet());
 608
 609         sortedEntries.sort(entryComparator);
 610         List<Set<KeyTaxon>> result = new ArrayList<>();
 611         for (Map.Entry<Set<KeyTaxon>, List<DefinedTermBase<?>>> entry : sortedEntries){
 612             result.add(entry.getKey());
 613         }
 614         return result;
 615     }
 616
 617     //TODO use a general term comparator here
 618     private static final Comparator<DefinedTermBase<?>> stateComparator = (a,b)-> {
 619
 620         //TODO use real Term Comparator
 621         if (!a.getTitleCache().equals(b.getTitleCache())){
 622             return a.getTitleCache().compareTo(b.getTitleCache());
 623         }
 624         if (a.getUuid()!= b.getUuid()){
 625             return a.getUuid().compareTo(b.getUuid());
 626         }
 627         return 0;
 628     };
 629
 630     private static final Comparator<? super Entry<Set<KeyTaxon>, List<DefinedTermBase<?>>>> entryComparator =  (a,b)-> {
 631         if (a.getKey().size()!=b.getKey().size()){
 632             //order by number of taxa covered
 633             return b.getKey().size() - a.getKey().size();
 634         }else if (a.getValue().size()!= b.getValue().size()){
 635             //order by number of states covered
 636             return b.getValue().size() - a.getValue().size();
 637         }else{
 638             //order states alphabetically or by uuid
 639             for (int i = 0; i < a.getValue().size(); i++){
 640                 DefinedTermBase<?> stateA = a.getValue().get(i);
 641                 DefinedTermBase<?> stateB = b.getValue().get(i);
 642                 int result = stateComparator.compare(stateA, stateB);
 643                 if (result != 0){
 644                     return result;
 645                 }
 646             }
 647             //TODO compare keys (sets of KeyTaxon)
 648 //            for (int i = 0; i < a.getKey().size(); i++){
 649 //                Object stateA = a.getKey().getUuid;
 650 //                State stateB = a.getKey().get(i);
 651 //                //TODO use real Term Comparator
 652 //                if (stateA.getUuid()!= stateB.getUuid()){
 653 //                    return stateA.getUuid().compareTo(stateB.getUuid());
 654 //                }
 655 //            }
 656             throw new RuntimeException("Compare for same state lists with different unit descriptions not yet implemented");
 657         }
 658     };
 659
 660     private Set<DefinedTermBase<?>> getAllStates(Feature feature, Set<KeyTaxon> taxaCovered, Set<DefinedTermBase<?>> filter) {
 661         //TODO handle modifier
 662         Set<DefinedTermBase<?>> states = new HashSet<>();
 663         for (KeyTaxon taxon : taxaCovered){
 664             Set<CategoricalData> cdSet = taxon.getCategoricalData(feature);
 665             for (CategoricalData cd : cdSet){
 666                 List<StateData> stateDatas = cd.getStateData();
 667                 for (StateData sd : stateDatas){
 668                     DefinedTermBase<?> state = sd.getState();
 669                     if (filter != null && !filter.contains(state)) {
 670                         continue;
 671                     }
 672                     states.add(state);
 673                 }
 674             }
 675         }
 676         return states;
 677     }
 678
 679     private void handleQuantitativeData(PolytomousKeyNode parent, List<Feature> featuresLeft,
 680             Set<KeyTaxon> taxaCovered, Map<Feature, BigDecimal> quantitativeFeaturesThresholds,
 681             Feature winnerFeature, Map<Feature, Set<DefinedTermBase<?>>> featureStatesFilter) {
 682
 683         // first, get the threshold
 684         BigDecimal threshold = quantitativeFeaturesThresholds.get(winnerFeature);
 685         //TODO unit not seems to be used yet
 686         StringBuilder unit = new StringBuilder();
 687         // then determine which taxa are before and which are after this threshold (dichotomy)
 688         //in order to create the children of the parent node
 689         List<Set<KeyTaxon>> quantitativeStates = determineQuantitativeStates(threshold, winnerFeature, taxaCovered, unit);
 690         // thus the list contains two sets of KeyTaxon, the first corresponding to
 691         // those before, the second to those after the threshold
 692         for (int i=0; i<2; i++) {
 693             handleQuantitativeBranch(parent, featuresLeft, taxaCovered.size(), winnerFeature, threshold, unit,
 694                     quantitativeStates, featureStatesFilter, i);
 695         }
 696
 697         return;
 698     }
 699
 700     /**
 701      * Creates the branch for a quantitative feature.
 702      * TODO if the quantitative feature has dependent features they are not yet handled
 703      */
 704     private void handleQuantitativeBranch(PolytomousKeyNode parent, List<Feature> featuresLeft,
 705             int parentTaxaCoveredSize, Feature winnerFeature, BigDecimal threshold, StringBuilder unit,
 706             List<Set<KeyTaxon>> quantitativeStates, Map<Feature, Set<DefinedTermBase<?>>> featureStatesFilter,
 707             int brunchNum) {
 708
 709         String sign;
 710         Set<KeyTaxon> newTaxaCovered = quantitativeStates.get(brunchNum);
 711         if (brunchNum==0){
 712                 sign = before; // the first element of the list corresponds to taxa before the threshold
 713         } else {
 714                 sign = after; // the second to those after
 715         }
 716         if (newTaxaCovered.size()>0){
 717                 PolytomousKeyNode childNode = PolytomousKeyNode.NewInstance();
 718                 parent.setFeature(winnerFeature);
 719                 KeyStatement statement = KeyStatement.NewInstance( " " + sign + " " + threshold + unit);
 720                 childNode.setStatement(statement);
 721                 parent.addChild(childNode);
 722                 //TODO don't we need to check dependent features, they are not very likely for quantitative features, but still might exist as exception ...
 723                 boolean taxaAreDiscriminatedInThisStep = newTaxaCovered.size() < parentTaxaCoveredSize;
 724                 boolean childrenExist = taxaAreDiscriminatedInThisStep && (newTaxaCovered.size() > 1);
 725                 if (childrenExist){
 726                     buildBranches(childNode, featuresLeft, newTaxaCovered, featureStatesFilter);
 727                 }else{
 728                     handleLeaf(childNode, newTaxaCovered);
 729                 }
 730         }else{
 731             //TODO do we need to check the 0 case, can this happen at all, shouldn't we throw a warning instead?
 732             throw new RuntimeException("No taxa left on branch. This should probably not happen.");
 733         }
 734         return;
 735     }
 736
 737     private Feature computeScores(List<Feature> featuresLeft, Set<KeyTaxon> taxaCovered,
 738             Map<Feature,BigDecimal> quantitativeFeaturesThresholds, Map<Feature, Set<DefinedTermBase<?>>> featureStatesFilter) {
 739
 740         Map<Feature,Float> scoreMap = featureScores(featuresLeft, taxaCovered, quantitativeFeaturesThresholds, featureStatesFilter);
 741         dependenciesScores(scoreMap, featuresLeft, taxaCovered, quantitativeFeaturesThresholds, featureStatesFilter);
 742         // the feature with the best score becomes the one corresponding to the current node
 743         Feature winnerFeature = lessStatesWinner(scoreMap, taxaCovered);
 744         // the feature is removed from the list of features available to build the next level of the tree
 745         featuresLeft.remove(winnerFeature);
 746         if (config.isDebug()){System.out.println("   ScoreMap: " + scoreMap);}
 747         if (config.isDebug()){System.out.println("   quantitativeThreshold: " + quantitativeFeaturesThresholds);}
 748         if (config.isDebug()){System.out.println("   winner: " + winnerFeature);}
 749         return winnerFeature;
 750     }
 751
 752         /**
 753          * Dependencies can be seen as a hierarchy.
 754          * If the dependencies are on, this function calculates the score of all children features and give the highest score
 755          * of these to their parent (of course, if its score is lower). This way, if a feature has a good score but is
 756          * "onlyApplicableIf" or "InapplicableIf", the feature it depends can be chosen in order to build a better key.
 757          */
 758         private void dependenciesScores(Map<Feature,Float> scoreMap, List<Feature> featuresLeft,
 759                 Set<KeyTaxon> coveredTaxa, Map<Feature,BigDecimal> quantitativeFeaturesThresholds, Map<Feature, Set<DefinedTermBase<?>>> featureStatesFilter){
 760
 761             //TODO maybe we need to do this recursive?
 762
 763             //old: but I don't understand why we should include the existing featureList, this is only necessary
 764             //if the single cores depend on the set of features
 765             //List<Feature> pseudoFeaturesLeft = new ArrayList<>(featuresLeft);  //copies the list of features
 766
 767             // then adds all the features which depend on features contained in the list
 768                 List<Feature> pseudoFeatures = new ArrayList<>();
 769                 for (Feature feature : featuresLeft){
 770                         if (featureDependencies.containsKey(feature)){
 771                                 for (Feature dependendFeature : featureDependencies.get(feature)){
 772                                     if (!pseudoFeatures.contains(dependendFeature)){
 773                                         pseudoFeatures.add(dependendFeature);
 774                                     }
 775                                 }
 776                         }
 777                 }
 778
 779                 if (!pseudoFeatures.isEmpty()){
 780                 // then calculates the scores of all features that have been added
 781                 Map<Feature,Float> newScoreMap = featureScores(pseudoFeatures, coveredTaxa, quantitativeFeaturesThresholds, featureStatesFilter);
 782                 for (Feature parentFeature : featureDependencies.keySet()){
 783                         if (scoreMap.containsKey(parentFeature)){
 784                                 for (Feature dependendFeature : featureDependencies.get(parentFeature)){
 785                                         if (newScoreMap.containsKey(dependendFeature)) {
 786                                                 // if a feature has a better than the "parent" feature it depends on
 787                                                 if (newScoreMap.get(dependendFeature) > scoreMap.get(parentFeature)){
 788                                                         // the "parent" feature gets its score in the original score map
 789                                                         scoreMap.put(parentFeature, newScoreMap.get(dependendFeature));
 790                                                 }
 791                                         }
 792                                 }
 793                         }
 794                 }
 795                 }
 796         }
 797
 798
 799         /**
 800          * This function merges branches of the key belonging to the same clique
 801          * (http://en.wikipedia.org/wiki/Clique_%28graph_theory%29)
 802          *
 803          * @param clique the list of States linked together (i.e. if merged have the same score)
 804          * @param taxonStatesMap the map between the taxa (keys) and the states (keys) leading to them
 805          * @param reuseWinner
 806          * @param filter
 807          * @return <code>true</code>, if all taxa covered by the new branch include all states of the clique.
 808          * <code>false</code> otherwise.
 809          */
 810         private void mergeBranches(List<DefinedTermBase<?>> clique, Map<Set<KeyTaxon>, List<DefinedTermBase<?>>> taxonStatesMap,
 811                 Map<Set<KeyTaxon>, Boolean> reuseWinner, Set<DefinedTermBase<?>> filter){
 812
 813             boolean isExact = true;
 814             if (clique.size()<=1){
 815                 return;
 816             }
 817             Map.Entry<Set<KeyTaxon>,List<DefinedTermBase<?>>> firstBranch = null;
 818             List<Set<KeyTaxon>> tdToDelete = new ArrayList<>();
 819             //TODO is the stateFilter needed here somehow?
 820             for (Map.Entry<Set<KeyTaxon>, List<DefinedTermBase<?>>> branch : taxonStatesMap.entrySet()){
 821                     boolean stateFound = false;
 822                         // looks for one state of the clique in this branch
 823                         for(DefinedTermBase<?> state : clique){
 824                             if (branch.getValue().contains(state)) {
 825                     stateFound = true;
 826                     break;
 827                 }
 828                         }
 829                         // if one state is found...
 830                         if (stateFound == true){
 831                                 // ...for the first time, the branch becomes the one to which the others will be merged
 832                                 if (firstBranch == null){
 833                                         firstBranch = branch;
 834                                 }
 835                                 // ... else the branch is merged to the first one.
 836                                 else {
 837                                         int oldSize = firstBranch.getKey().size();
 838                                     firstBranch.getKey().addAll(branch.getKey());
 839                                     int newSize = firstBranch.getKey().size();
 840                     if (oldSize != newSize || newSize != branch.getKey().size()){
 841                         isExact = false;
 842                     }
 843                                     firstBranch.getValue().addAll(branch.getValue());
 844                                         tdToDelete.add(branch.getKey());
 845                                 }
 846                         }
 847                 }
 848                 // once this is done, the branches merged to the first one are deleted
 849                 for (Set<KeyTaxon> td : tdToDelete){
 850                         taxonStatesMap.remove(td);
 851                 }
 852                 if (!isExact && firstBranch != null){
 853                     reuseWinner.put(firstBranch.getKey(), Boolean.TRUE);
 854                 }
 855         }
 856
 857         /**
 858          * This function looks for the largest clique of States
 859          * (http://en.wikipedia.org/wiki/Clique_%28graph_theory%29)
 860          * from the map of exclusions.
 861          *
 862          * @param exclusions map a state (key) to the set of states (value) which can not be merged with it without decreasing its score.
 863          * @return
 864          */
 865         private List<DefinedTermBase<?>> returnBestClique (Map<DefinedTermBase<?>,Set<DefinedTermBase<?>>> exclusions){
 866                 int bestNumberOfExclusions=-1;
 867                 int numberOfExclusions;
 868                 List<DefinedTermBase<?>> clique = new ArrayList<>();
 869
 870                 // looks for the largest clique, i.e. the state with less exclusions
 871                 DefinedTermBase<?> bestState=null;
 872                 for (Map.Entry<DefinedTermBase<?>,Set<DefinedTermBase<?>>> pair :  exclusions.entrySet()){
 873                         numberOfExclusions = pair.getValue().size();
 874                         if ((bestNumberOfExclusions == -1) || numberOfExclusions < bestNumberOfExclusions) {
 875                                 bestNumberOfExclusions = numberOfExclusions;
 876                                 bestState = pair.getKey();
 877                         }
 878                 }
 879
 880                 clique.add(bestState);
 881                 exclusions.remove(bestState);
 882
 883                 boolean isNotExcluded;
 884                 // then starts building the clique by adding the states which do not exclude each other
 885                 for (Map.Entry<DefinedTermBase<?>,Set<DefinedTermBase<?>>> pair : exclusions.entrySet()){
 886                         isNotExcluded = true;
 887                         for (DefinedTermBase<?> state : clique) {
 888                                 if (pair.getValue().contains(state)) {
 889                     isNotExcluded = false;
 890                 }
 891                         }
 892                         if (isNotExcluded){
 893                                 clique.add(pair.getKey());
 894                         }
 895                 }
 896                 for (DefinedTermBase<?> state : clique) {
 897                         exclusions.remove(state);
 898                 }
 899                 return clique;
 900         }
 901
 902
 903         /**
 904          * fills a map of the sets of taxa (key) presenting the different states (value) for the given feature.
 905          *
 906          * @param statesDone the list of states already done for this feature
 907          * @param states2 the element from which the states are extracted
 908          * @param feature the feature corresponding to the CategoricalData
 909          * @param taxaCovered the base of taxa considered
 910          * @param featureStatesFilter
 911          * @return
 912          */
 913         private Map<Set<KeyTaxon>,List<DefinedTermBase<?>>> determineCategoricalStates(
 914                 Set<DefinedTermBase<?>> states, Feature feature, Set<KeyTaxon> taxaCovered, Set<DefinedTermBase<?>> filter){
 915
 916             Map<Set<KeyTaxon>, List<DefinedTermBase<?>>> childrenStatesMap = new HashMap<>();
 917             //TODO needed
 918             List<DefinedTermBase<?>> statesDone = new ArrayList<>(); // the list of states already used
 919
 920                 for (DefinedTermBase<?> state : states){ // for each state
 921                         if (filter != null && !filter.contains(state)){
 922                             continue;
 923                         }
 924                     statesDone.add(state);
 925                         Set<KeyTaxon> newTaxaCovered = taxaByFeatureState(feature, state, taxaCovered); //gets which taxa present this state
 926                         List<DefinedTermBase<?>> statesOfTaxa = childrenStatesMap.get(newTaxaCovered);
 927                         if (statesOfTaxa == null) { // if no states are associated to these taxa, create a new list
 928                                 statesOfTaxa = new ArrayList<>();
 929                                 childrenStatesMap.put(newTaxaCovered, statesOfTaxa);
 930                         }
 931                         statesOfTaxa.add(state); // then add the state to the list
 932                 }
 933                 return childrenStatesMap;
 934         }
 935
 936
 937         /**
 938          * Returns the list of taxa from previously covered taxa, which have the state featureState for the given feature
 939          */
 940         private Set<KeyTaxon> taxaByFeatureState(Feature feature, DefinedTermBase<?> featureState, Set<KeyTaxon> taxaCovered){
 941                 Set<KeyTaxon> newCoveredTaxa = new HashSet<>();
 942                 for (KeyTaxon td : taxaCovered){
 943                         for (CategoricalData cd : td.getCategoricalData(feature)){
 944                             List<StateData> stateDatas = cd.getStateData();
 945                 for (StateData sd : stateDatas) {
 946                     if (sd.getState().equals(featureState)){
 947                         newCoveredTaxa.add(td);
 948                     }
 949                 }
 950                         }
 951                 }
 952                 return newCoveredTaxa;
 953         }
 954
 955         /**
 956          * This function returns the feature with the highest score. However, if several features have the same score
 957          * the one which leads to less options is chosen (this way, the key is easier to read).
 958          */
 959         private Feature lessStatesWinner(Map<Feature,Float> scores, Set<KeyTaxon> taxaCovered){
 960                 int nTaxa = taxaCovered.size();
 961                 if (nTaxa==1) {
 962             return null;
 963         }
 964                 float meanScore = defaultMeanScore(nTaxa);
 965                 float bestScore = nTaxa*nTaxa;
 966                 List<Feature> bestFeatures = new ArrayList<>(); // if ever different features have the best score, they are put in this list
 967                 Feature bestFeature = null;
 968                 float newScore;
 969                 for (Map.Entry<Feature,Float> pair : scores.entrySet()){
 970                         if (pair.getValue()!=null){
 971                                 newScore = Math.abs(pair.getValue()-meanScore);// the best score is the closest to the score (meanScore here)
 972                                 // a feature would have if it divided the taxa in two equal parts
 973                                 if (newScore < bestScore){
 974                                         bestFeatures.clear();
 975                                         bestFeatures.add(pair.getKey());
 976                                         bestScore = newScore;
 977                                 }else if (newScore==bestScore){
 978                                         bestFeatures.add(pair.getKey());
 979                                 }
 980                         }
 981                 }
 982                 if (bestFeatures.size()==1) { // if there is only one feature with the best score, pick it
 983                         return bestFeatures.get(0);
 984                 }
 985                 else { // else choose the one with less states
 986                         int lessStates = -1;
 987                         int numberOfDifferentStates=-1;
 988                         for (Feature feature : bestFeatures){
 989                                 if (feature.isSupportsCategoricalData()){
 990                                         Set<DefinedTermBase<?>> differentStates = new HashSet<>();
 991                                         for (KeyTaxon taxon : taxaCovered){
 992                                                 Set<CategoricalData> cds = taxon.getCategoricalData(feature);
 993                                                 Set<StateData> allStateData = getStateData(cds);
 994                                                 for (StateData sd : allStateData) {
 995                                                         differentStates.add(sd.getState());
 996                                                 }
 997                                         }
 998                                         numberOfDifferentStates=differentStates.size();
 999                                 }else if (feature.isSupportsQuantitativeData()){
1000                                         numberOfDifferentStates=2;
1001                                 }
1002                                 if (lessStates == -1 || numberOfDifferentStates < lessStates){
1003                                         lessStates = numberOfDifferentStates;
1004                                         bestFeature = feature;
1005                                 }
1006                         }
1007                         return bestFeature;
1008                 }
1009         }
1010
1011         /**
1012          * This function calculates the mean score, i.e. the score a feature dividing the taxa in two equal parts
1013          * would have.
1014          */
1015         private float defaultMeanScore(int nTaxa){
1016                 int i;
1017                 float score=0;
1018                 for (i=1;i<nTaxa;i++){
1019                         score = score + Math.round(i+1/2);
1020                 }
1021                 return score;
1022         }
1023
1024         /**
1025          * This function fills the map of features (keys) with their respecting scores (values)
1026          */
1027         private Map<Feature,Float> featureScores(List<Feature> featuresLeft, Set<KeyTaxon> coveredTaxa,
1028                 Map<Feature,BigDecimal> quantitativeFeaturesThresholds, Map<Feature, Set<DefinedTermBase<?>>> featureStatesFilter){
1029
1030                 Map<Feature,Float> scoreMap = new HashMap<>();
1031                 for (Feature feature : featuresLeft){
1032                         if (feature.isSupportsCategoricalData()) {
1033                                 scoreMap.put(feature, categoricalFeatureScore(feature, coveredTaxa, featureStatesFilter.get(feature)));
1034                         }
1035                         if (feature.isSupportsQuantitativeData()){
1036                                 scoreMap.put(feature, quantitativeFeatureScore(feature, coveredTaxa, quantitativeFeaturesThresholds));
1037                         }
1038                 }
1039                 return scoreMap;
1040         }
1041
1042         /**
1043          * Since Quantitative features do not have states, unlike Categorical ones, this function determines which taxa,
1044          * for a given quantitative feature, present either a lower or higher value than a given threshold.
1045          * It returns two Sets of {@link KeyTaxon}, one with the taxa under this threshold (taxaBefore) and another one
1046          * with the taxa over (taxaAfter).
1047          */
1048         private List<Set<KeyTaxon>> determineQuantitativeStates (BigDecimal threshold, Feature feature,
1049                 Set<KeyTaxon> taxa, StringBuilder unit){
1050
1051             List<Set<KeyTaxon>> list = new ArrayList<>();
1052                 Set<KeyTaxon> taxaBefore = new HashSet<>();
1053                 Set<KeyTaxon> taxaAfter = new HashSet<>();
1054                 list.add(taxaBefore);
1055                 list.add(taxaAfter);
1056                 for (KeyTaxon td : taxa){
1057                     for (QuantitativeData qd : td.getQuantitativeData(feature)){
1058                         if (unit.toString().equals("") && qd.getUnit()!=null && qd.getUnit().getLabel()!=null){
1059                     unit.append(" " + qd.getUnit().getLabel());
1060                 }
1061                 Set<StatisticalMeasurementValue> values = qd.getStatisticalValues();
1062                 for (StatisticalMeasurementValue smv : values){
1063                     StatisticalMeasure type = smv.getType();
1064                     //TODO DONT FORGET sample size, MEAN etc
1065                     if (type.isMax() || type.isTypicalUpperBoundary() || type.isAverage() || type.isExactValue()) {
1066                         if (smv.getValue().compareTo(threshold) > 0){
1067                             taxaAfter.add(td);
1068                         }
1069                     }
1070                     if (type.isMin() || type.isTypicalLowerBoundary() || type.isAverage() || type.isExactValue()) {
1071                         if (smv.getValue().compareTo(threshold) <= 0){
1072                             taxaBefore.add(td);
1073                         }
1074                     }
1075                 }
1076                     }
1077                 }
1078                 return list;
1079         }
1080
1081         /**
1082          * This function returns the score of a quantitative feature.
1083          */
1084         private float quantitativeFeatureScore(Feature feature, Set<KeyTaxon> coveredTaxa,
1085                 Map<Feature,BigDecimal> quantitativeFeaturesThresholds){
1086
1087             List<BigDecimal> allValues = new ArrayList<>();
1088                 for (KeyTaxon td : coveredTaxa){
1089                     for (QuantitativeData qd : td.getQuantitativeData(feature)){
1090                         computeAllValues(allValues, qd);
1091                     }
1092                 }
1093                 int i,j;
1094                 BigDecimal threshold = BigDecimal.ZERO;
1095                 BigDecimal bestThreshold = BigDecimal.ZERO;
1096                 int difference = allValues.size();
1097                 int differenceMin = difference;
1098                 int bestTaxaBefore = 0;
1099                 int bestTaxaAfter = 0;
1100                 for (i=0; i<allValues.size()/2; i++) {
1101                         threshold = allValues.get(i*2+1);
1102                         int taxaBefore=0;
1103                         int taxaAfter=0;
1104                         for (j=0;j<allValues.size()/2;j++) {
1105                                 if (allValues.get(j*2+1).compareTo(threshold) <= 0){
1106                                         taxaBefore++;
1107                                 }
1108                                 if (allValues.get(j*2).compareTo(threshold) > 0){
1109                                         taxaAfter++;
1110                                 }
1111                         }
1112                         difference = Math.abs(taxaBefore-taxaAfter);
1113                         if (difference < differenceMin){
1114                                 differenceMin = difference;
1115                                 bestThreshold = threshold;
1116                                 bestTaxaBefore = taxaBefore;
1117                                 bestTaxaAfter = taxaAfter;
1118                         }
1119                 }
1120                 quantitativeFeaturesThresholds.put(feature, bestThreshold);
1121                 int defaultQuantitativeScore=0;
1122                 for (i=0; i<bestTaxaBefore; i++) {
1123                         defaultQuantitativeScore += bestTaxaAfter;
1124                 }
1125                 return defaultQuantitativeScore;
1126         }
1127
1128     private void computeAllValues(List<BigDecimal> allValues, QuantitativeData qd) {
1129         Set<StatisticalMeasurementValue> values = qd.getStatisticalValues();
1130         BigDecimal lowerboundary = null;
1131         BigDecimal upperboundary = null;
1132         for (StatisticalMeasurementValue smv : values){
1133                 StatisticalMeasure type = smv.getType();
1134                 BigDecimal value = smv.getValue();
1135                 // DONT FORGET sample size, MEAN etc
1136                 if(type.isMin() || type.isTypicalLowerBoundary() || type.isAverage() || type.isExactValue()){
1137                     if (lowerboundary == null){
1138                         lowerboundary = value;
1139                     }else{
1140                         lowerboundary = lowerboundary.min(value);
1141                     }
1142                 }
1143                 if(type.isMax() || type.isTypicalUpperBoundary() || type.isAverage() || type.isExactValue()){
1144                 if (upperboundary == null){
1145                     upperboundary = value;
1146                 }else{
1147                     upperboundary = upperboundary.max(value);
1148                 }
1149             }
1150
1151 //              if (type.isMax()) {
1152 //                      upperboundary = smv.getValue();
1153 //                      upperboundarypresent=true;
1154 //              }else if (type.equals(StatisticalMeasure.MIN())) {
1155 //                      lowerboundary = smv.getValue();
1156 //                      lowerboundarypresent=true;
1157 //              }else if (type.equals(StatisticalMeasure.TYPICAL_UPPER_BOUNDARY()) && upperboundarypresent==false) {
1158 //                      upperboundary = smv.getValue();
1159 //                      upperboundarypresent=true;
1160 //              }else if (type.equals(StatisticalMeasure.TYPICAL_LOWER_BOUNDARY()) && lowerboundarypresent==false) {
1161 //                      lowerboundary = smv.getValue();
1162 //                      lowerboundarypresent=true;
1163 //              }else if (type.equals(StatisticalMeasure.AVERAGE()) && upperboundarypresent==false && lowerboundarypresent==false) {
1164 //                      lowerboundary = smv.getValue();
1165 //                      upperboundary = lowerboundary;
1166 //                      lowerboundarypresent=true;
1167 //                      upperboundarypresent=true;
1168 //              }
1169         }
1170         if (lowerboundary != null && upperboundary != null) {
1171                 allValues.add(lowerboundary);
1172                 allValues.add(upperboundary);
1173         }else if(lowerboundary != null || upperboundary != null){
1174             logger.warn("Only one of upper or lower boundary is defined. Statistical measurement value not used.");
1175         }
1176     }
1177
1178         /**
1179          * This function returns the score of a categorical feature
1180          * by comparing each taxon with each other. If the feature
1181          * discriminates a single pair of taxa the score is increased.
1182          */
1183         private float categoricalFeatureScore(Feature feature, Set<KeyTaxon> coveredTaxa, Set<DefinedTermBase<?>> filter){
1184                 int i,j;
1185                 float score =0;
1186                 float power=0;
1187                 KeyTaxon[] coveredTaxaArray = coveredTaxa.toArray(new KeyTaxon[coveredTaxa.size()]); // I did not figure a better way to do this
1188                 for (i=0 ; i<coveredTaxaArray.length; i++){
1189                         Set<CategoricalData> cd1 = coveredTaxaArray[i].getCategoricalData(feature);
1190                         for (j=i+1 ; j< coveredTaxaArray.length ; j++){
1191                             Set<CategoricalData> cd2 = coveredTaxaArray[j].getCategoricalData(feature);
1192                                 power = defaultCategoricalPower(cd1, cd2, filter);
1193                                 score = score + power;
1194                         }
1195                 }
1196                 return score;
1197         }
1198
1199         /**
1200          * This recursive function fills the maps of dependencies by reading the tree containing the dependencies.
1201          */
1202         private void createDependencies(TermNode<Feature> node){
1203
1204             //the featureDependencies handling was originally in defaultCategoricalPower(cat, cat)
1205             //needs to be checked if it is OK to handle them here
1206         if (node.isDependent()){
1207             Feature parentFeature = node.getParent().getTerm();
1208             if (!featureDependencies.containsKey(parentFeature)){
1209                 featureDependencies.put(parentFeature, new HashSet<>());
1210             }
1211             for (FeatureState featureState : node.getOnlyApplicableIf()){
1212                 if (!oAifDependencies.containsKey(featureState)) {
1213                     oAifDependencies.put(featureState, new HashSet<>());
1214                 }
1215                 oAifDependencies.get(featureState).add(node.getTerm());
1216                 //TODO: we only guess that it is the state of the parent feature here
1217                 //needs to be improved
1218                 featureDependencies.get(node.getParent().getTerm()).add(node.getTerm());
1219             }
1220             for (FeatureState featureState : node.getInapplicableIf()){
1221                 if (!iAifDependencies.containsKey(featureState)) {
1222                     iAifDependencies.put(featureState, new HashSet<>());
1223                 }
1224                 iAifDependencies.get(featureState).add(node.getTerm());
1225                 //TODO: we only guess that it is the state of the parent feature here
1226                 //needs to be improved
1227                 featureDependencies.get(node.getParent().getTerm()).add(node.getTerm());
1228             }
1229         }
1230
1231                 for (TermNode<Feature> fn : node.getChildNodes()){
1232                         createDependencies(fn);
1233                 }
1234 //              System.out.println(featureDependencies);
1235         }
1236
1237         /**
1238          * This function fills the exclusions map.
1239          */
1240         private float computeExclusions(Feature feature, Set<KeyTaxon> coveredTaxa, Map<DefinedTermBase<?>,Set<DefinedTermBase<?>>> exclusions, Set<DefinedTermBase<?>> filter){
1241                 //unclear what the score is fore here
1242                 float score =0;
1243                 float power=0;
1244                 KeyTaxon[] fixedOrderTaxa = coveredTaxa.toArray(new KeyTaxon[coveredTaxa.size()]); // I did not figure a better way to do this
1245                 for (int i=0 ; i<fixedOrderTaxa.length; i++){
1246                     Set<CategoricalData> cd1 = fixedOrderTaxa[i].getCategoricalData(feature);
1247
1248                         for (int j=i+1 ; j< fixedOrderTaxa.length ; j++){
1249                                 Set<CategoricalData> cd2 = fixedOrderTaxa[j].getCategoricalData(feature);
1250
1251 //                              System.out.println(deb1 + "; " +deb2);
1252                                 power = defaultCategoricalPower(cd1, cd2, filter);
1253                                 score = score + power;
1254                                 if (power >= 1.0){ // if there is no state in common between deb1 and deb2
1255
1256                                         for (StateData statedata1 : getStateData(cd1)){
1257                                             DefinedTermBase<?> state1 = statedata1.getState();
1258                                                 if (!exclusions.containsKey(state1)){
1259                                                         exclusions.put(state1, new HashSet<>());
1260                                                 }
1261                                                 for (StateData statedata2 : getStateData(cd2)){
1262                                                     DefinedTermBase<?> state2 = statedata2.getState();
1263                                                         if (!exclusions.containsKey(state2)){
1264                                                                 exclusions.put(state2, new HashSet<>());
1265                                                         }
1266                                                         exclusions.get(state1).add(state2);
1267                                                         exclusions.get(state2).add(state1);
1268                                                 }
1269                                         }
1270                                 }
1271                         }
1272                 }
1273                 return score;
1274         }
1275
1276     private Set<StateData> getStateData(Set<CategoricalData> cds) {
1277         Set<StateData> result = new HashSet<>();
1278         for (CategoricalData cd : cds){
1279             result.addAll(cd.getStateData());
1280         }
1281         return result;
1282     }
1283
1284     /**
1285          * Returns the score of a categorical feature.
1286          */
1287         private float defaultCategoricalPower(Set<CategoricalData> cd1, Set<CategoricalData> cd2, Set<DefinedTermBase<?>> filter){
1288             if (cd1 == null || cd2 == null ||cd1.isEmpty() || cd2.isEmpty()){
1289                 return 0;
1290             }
1291
1292             //FIXME see defaultCategoricalPower_old for additional code on dependencies
1293             //which has been removed here for now but might be important
1294         //Now I moved it to #createDependencies. Therefore the below is maybe not needed
1295             //anymore but superfluent.
1296             //But the implementation at createDependencies is not fully correct yet
1297             //so I keep it here for now.
1298
1299             for (CategoricalData cd : cd1){
1300                 if (!featureDependencies.containsKey(cd.getFeature())){
1301                     featureDependencies.put(cd.getFeature(), new HashSet<>());
1302                 }
1303                 for (DefinedTermBase<?> state : getStates(cd, filter)){
1304                     if (iAifDependencies.get(state)!=null) {
1305                         featureDependencies.get(cd.getFeature()).addAll(iAifDependencies.get(state));
1306                     }
1307                     if (oAifDependencies.get(state)!=null) {
1308                         featureDependencies.get(cd.getFeature()).addAll(oAifDependencies.get(state));
1309                     }
1310                 }
1311             }
1312
1313             //get all states of both categorical data
1314         Set<DefinedTermBase<?>> states = getStates(cd1, cd2, filter);
1315         if (states.size() == 0){
1316             return 0;
1317         }
1318
1319             int nDiscriminative = 0;
1320             for (DefinedTermBase<?> state : states){
1321                 boolean hasState1 = hasState(state, cd1);
1322                 boolean hasState2 = hasState(state, cd2);
1323                 //if only 1 has the state than the state is discriminative
1324                 if (! (hasState1&&hasState2)) {
1325                     nDiscriminative++;
1326             }
1327             }
1328             float result = (float)nDiscriminative/states.size();
1329             return result;
1330         }
1331
1332     private boolean hasState(DefinedTermBase<?> state, Set<CategoricalData> cds) {
1333         boolean result = false;
1334         for (CategoricalData cd : cds){
1335             for (StateData stateData:cd.getStateData()){
1336                 result |= state.equals(stateData.getState());
1337             }
1338         }
1339         return result;
1340     }
1341
1342     private Set<DefinedTermBase<?>> getStates(Set<CategoricalData> cdset1, Set<CategoricalData> cdset2, Set<DefinedTermBase<?>> filter) {
1343         Set<DefinedTermBase<?>> result = new HashSet<>();
1344         result.addAll(getStates(cdset1, filter));
1345         result.addAll(getStates(cdset2, filter));
1346         return result;
1347     }
1348
1349     private Set<DefinedTermBase<?>> getStates(Set<CategoricalData> cdset, Set<DefinedTermBase<?>> filter) {
1350         Set<DefinedTermBase<?>> result = new HashSet<>();
1351         for (CategoricalData cd : cdset){
1352             result.addAll(getStates(cd, filter));
1353         }
1354         return result;
1355     }
1356
1357     private Set<DefinedTermBase<?>> getStates(CategoricalData cd, Set<DefinedTermBase<?>> filter) {
1358         //TODO handle modifier
1359         Set<DefinedTermBase<?>> result = new HashSet<>();
1360         List<StateData> states = cd.getStateData();
1361         for (StateData stateData:states){
1362             DefinedTermBase<?> state = stateData.getState();
1363             if (filter != null && !filter.contains(state)){
1364                 continue;
1365             }
1366             result.add(stateData.getState());
1367         }
1368         return result;
1369     }
1370
1371     //TODO keep as long as the handling of featureDependencies is not yet checked or handled in
1372     //the new method defaultCategoricalPower()
1373     private float defaultCategoricalPower_old(CategoricalData deb1, CategoricalData deb2){
1374                 List<StateData> states1 = deb1.getStateData();
1375                 List<StateData> states2 = deb2.getStateData();
1376                 boolean bool = false;
1377                 Iterator<StateData> stateData1Iterator = states1.iterator() ;
1378                 //      while (!bool && stateData1Iterator.hasNext()) {
1379                 //          Iterator<StateData> stateData2Iterator = states2.iterator() ;
1380                 //          StateData stateData1 = stateData1Iterator.next();
1381                 //          while (!bool && stateData2Iterator.hasNext()) {
1382                 //              bool = stateData1.getState().equals(stateData2Iterator.next().getState()); // checks if the states are the same
1383                 //          }
1384                 //      }
1385                 // one point each time two taxa can be discriminated for a given feature
1386
1387                 boolean checkFeature = false;
1388
1389                 if (!featureDependencies.containsKey(deb1.getFeature())){
1390                     featureDependencies.put(deb1.getFeature(), new HashSet<>());
1391                     checkFeature = true;
1392                 }
1393
1394                 while (stateData1Iterator.hasNext()) {
1395                     Iterator<StateData> stateData2Iterator = states2.iterator() ;
1396                     StateData stateData1 = stateData1Iterator.next();
1397                     DefinedTermBase<?> state1 = stateData1.getState();
1398                     if (checkFeature){
1399                         if (iAifDependencies.get(state1)!=null) {
1400                             featureDependencies.get(deb1.getFeature()).addAll(iAifDependencies.get(state1));
1401                         }
1402                         if (oAifDependencies.get(state1)!=null) {
1403                             featureDependencies.get(deb1.getFeature()).addAll(oAifDependencies.get(state1));
1404                         }
1405                     }
1406                     while (stateData2Iterator.hasNext()) {
1407                         StateData stateData2 = stateData2Iterator.next();
1408                         DefinedTermBase<?> state2 = stateData2.getState();
1409                         bool = bool || state1.equals(state2); // checks if the states are the same
1410                     }
1411                 }
1412
1413                 if (bool) {
1414                     return 0;
1415                 } else {
1416                     return 1;
1417                 }
1418             }
1419 }