cdmlib-model/src/main/java/eu/etaxonomy/cdm/strategy/generate/PolytomousKeyGenerator.java

   1 package eu.etaxonomy.cdm.strategy.generate;
   2
   3 import java.util.ArrayList;
   4 import java.util.HashMap;
   5 import java.util.HashSet;
   6 import java.util.Iterator;
   7 import java.util.List;
   8 import java.util.Map;
   9 import java.util.Set;
  10
  11 import eu.etaxonomy.cdm.model.common.Language;
  12 import eu.etaxonomy.cdm.model.description.CategoricalData;
  13 import eu.etaxonomy.cdm.model.description.DescriptionElementBase;
  14 import eu.etaxonomy.cdm.model.description.Feature;
  15 import eu.etaxonomy.cdm.model.description.KeyStatement;
  16 import eu.etaxonomy.cdm.model.description.PolytomousKey;
  17 import eu.etaxonomy.cdm.model.description.PolytomousKeyNode;
  18 import eu.etaxonomy.cdm.model.description.QuantitativeData;
  19 import eu.etaxonomy.cdm.model.description.State;
  20 import eu.etaxonomy.cdm.model.description.StateData;
  21 import eu.etaxonomy.cdm.model.description.StatisticalMeasure;
  22 import eu.etaxonomy.cdm.model.description.StatisticalMeasurementValue;
  23 import eu.etaxonomy.cdm.model.description.TaxonDescription;
  24 import eu.etaxonomy.cdm.model.term.FeatureNode;
  25 import eu.etaxonomy.cdm.model.term.FeatureTree;
  26
  27 /**
  28  * @author m.venin
  29  *
  30  */
  31 public class PolytomousKeyGenerator {
  32
  33         static int level=-1; // global variable needed by the printTree function in order to store the level which is being printed
  34         private PolytomousKey polytomousKey; // the Identification Key
  35         private List<Feature> features; // the features used to generate the key
  36         private Set<TaxonDescription> taxa; // the base of taxa
  37
  38         private boolean merge=true; // if this boolean is set to true, branches of the tree will be merged if the corresponding states can be used together without decreasing their score
  39
  40         private FeatureTree dependenciesTree; // the tree containing the dependencies between states and features (InapplicableIf and OnlyApplicableIf)
  41         private Map<State,Set<Feature>> iIdependencies = new HashMap<>(); // map of a set of Features (value) inapplicables if a State (key) is present
  42         private Map<State,Set<Feature>> oAIdependencies = new HashMap<>(); // map of a set of Features (value) only applicables if a State (key) is present
  43         private Map<Feature,Set<Feature>> featureDependencies = new HashMap<>(); // map of all the sets of features (values) which have dependencies with states of other features (keys)
  44         private boolean dependenciesON = true; // if this boolean is true, the dependencies are taken into account
  45
  46
  47         /**
  48          * These strings are used for generating the statements of the key.
  49          */
  50         private String before="<";
  51         private String after=">";
  52         private String separator = " or ";
  53
  54         /**
  55          * Sets the features used to generate the key.
  56          *
  57          * @param featuresList
  58          */
  59         public void setFeatures(List<Feature> featuresList){
  60                 this.features = featuresList;
  61         }
  62
  63         /**
  64          * Sets the base of taxa.
  65          *
  66          * @param featuresList
  67          */
  68         public void setTaxa(Set<TaxonDescription> taxaSet){
  69                 this.taxa = taxaSet;
  70         }
  71
  72         /**
  73          * Sets the tree containing the dependencies between states and features.
  74          *
  75          * @param tree
  76          */
  77         public void setDependencies(FeatureTree tree){
  78                 this.dependenciesTree = tree;
  79         }
  80
  81         /**
  82          * Allows the generator to use the dependencies given by the function "setDependencies".
  83          */
  84         public void turnDependenciesON(){
  85                 this.dependenciesON = true;
  86         }
  87
  88         /**
  89          * Prevent the generator from using dependencies.
  90          */
  91         public void turnDependenciesOFF(){
  92                 this.dependenciesON = false;
  93         }
  94
  95         /**
  96          * Allows the generator to merge branches if the corresponding states can be used together without diminishing their score.
  97          */
  98         public void mergeModeON(){
  99                 this.merge = true;
 100         }
 101
 102         /**
 103          * Prevent the generator from merging branches (apart from those leading to the same set of taxa).
 104          */
 105         public void mergeModeOFF(){
 106                 this.merge = false;
 107         }
 108
 109
 110         /**
 111          * Initializes the function buildBranches() with the starting parameters in order to build the key
 112          */
 113         private void loop(){
 114                 polytomousKey = PolytomousKey.NewInstance();
 115                 PolytomousKeyNode root = polytomousKey.getRoot();
 116                 buildBranches(root,features,taxa,true);
 117         }
 118
 119
 120         /**
 121          * Creates the key and prints it
 122          */
 123         public PolytomousKey invoke(){
 124                 if (dependenciesON && dependenciesTree!=null){
 125                         checkDependencies(dependenciesTree.getRoot());
 126                 }
 127                 loop();
 128                 List<PolytomousKeyNode> rootlist = new ArrayList<PolytomousKeyNode>();
 129                 rootlist.add(polytomousKey.getRoot());
 130                 //              String spaces = new String();
 131                 //              printTree(rootlist,spaces);
 132                 //              System.out.println(paths.toString());
 133                 return polytomousKey;
 134         }
 135
 136
 137         /**
 138          * Recursive function that builds the branches of the identification key (FeatureTree)
 139          *
 140          * @param father the node considered
 141          * @param featuresLeft List of features that can be used at this point
 142          * @param taxaCovered the taxa left at this point (i.e. that verify the description corresponding to the path leading to this node)
 143          * @param taxaAreTheSame if in the previous level the taxa discriminated are the same, this boolean is set to true, thus if the taxa, again, are not discriminated at this level the function stops
 144          */
 145         private void buildBranches(PolytomousKeyNode father, List<Feature> featuresLeft, Set<TaxonDescription> taxaCovered, boolean taxaDiscriminated){
 146                 boolean childrenExist = false; // boolean indicating if this node has children, if not, it means we reached a leaf and instead of adding a
 147                 // question or statement to it, we must add the list of taxa discriminated at this point
 148
 149                 // These sets respectively store the features inapplicables and applicables at this point in the key
 150                 Set<Feature> innapplicables = new HashSet<Feature>();
 151                 Set<Feature> applicables = new HashSet<Feature>();
 152
 153                 if (taxaCovered.size()>1) {
 154                         // this map stores the thresholds giving the best dichotomy of taxa for the corresponding feature supporting quantitative data
 155                         Map<Feature,Float> quantitativeFeaturesThresholds = new HashMap<Feature,Float>();
 156                         // the scores of the different features are calculated, the thresholds in the same time
 157                         Map<Feature,Float> scoreMap = featureScores(featuresLeft, taxaCovered, quantitativeFeaturesThresholds);
 158                         dependenciesScores(scoreMap,featuresLeft, taxaCovered, quantitativeFeaturesThresholds);
 159                         // the feature with the best score becomes the one corresponding to the current node
 160                         Feature winnerFeature = lessStatesWinner(scoreMap, taxaCovered);
 161                         // the feature is removed from the list of features available to build the next level of the tree
 162                         featuresLeft.remove(winnerFeature);
 163
 164                         int i;
 165                         /************** either the feature supports quantitative data... **************/
 166                         // NB: in this version, "quantitative features" are dealt with in a dichotomous way
 167                         if (winnerFeature!=null && winnerFeature.isSupportsQuantitativeData()) {
 168                                 // first, get the threshold
 169                                 float threshold = quantitativeFeaturesThresholds.get(winnerFeature);
 170                                 String sign;
 171                                 StringBuilder unit= new StringBuilder("");
 172                                 // then determine which taxa are before and which are after this threshold (dichotomy) in order to create the children of the father node
 173                                 List<Set<TaxonDescription>> quantitativeStates = determineQuantitativeStates(threshold,winnerFeature,taxaCovered,unit);
 174                                 // thus the list contains two sets of TaxonDescription, the first corresponding to those before, the second to those after the threshold
 175                                 for (i=0;i<2;i++) {
 176                                         Set<TaxonDescription> newTaxaCovered = quantitativeStates.get(i);
 177                                         if (i==0){
 178                                                 sign = before; // the first element of the list corresponds to taxa before the threshold
 179                                         } else {
 180                                                 sign = after; // the second to those after
 181                                         }
 182                                         if (newTaxaCovered.size()>0 && !((newTaxaCovered.size()==taxaCovered.size()) && !taxaDiscriminated)){ // if the taxa are discriminated compared to those of the father node, a child is created
 183                                                 childrenExist = true;
 184                                                 PolytomousKeyNode son = PolytomousKeyNode.NewInstance();
 185                                                 son.setFeature(winnerFeature);
 186                                                 // old code used when PolytomousKey extended FeatureTree
 187                                                 //                                              Representation question = new Representation(null, " " + sign + " " + threshold +unit,null, Language.DEFAULT()); // the question attribute is used to store the state of the feature
 188                                                 //                                              son.addQuestion(question);
 189                                                 KeyStatement statement = KeyStatement.NewInstance( " " + sign + " " + threshold +unit); // the question attribute is used to store the state of the feature
 190                                                 son.setStatement(statement);
 191                                                 father.addChild(son);
 192                                                 boolean areTheTaxaDiscriminated;
 193                                                 if (newTaxaCovered.size()==taxaCovered.size()) {
 194                             areTheTaxaDiscriminated = false;
 195                         } else {
 196                             areTheTaxaDiscriminated = true;
 197                         }
 198                                                 buildBranches(son,featuresLeft, newTaxaCovered,areTheTaxaDiscriminated);
 199                                         }
 200                                 }
 201                         }
 202
 203                         /************** ...or it supports categorical data. **************/
 204                         // "categorical features" may present several different states, each one of these might correspond to one child
 205                         List<State> statesDone = new ArrayList<State>(); // the list of states already used
 206                         int numberOfStates;
 207                         if (winnerFeature!=null && winnerFeature.isSupportsCategoricalData()) {
 208                                 for (TaxonDescription td : taxaCovered){
 209                                         DescriptionElementBase debConcerned = null;
 210
 211                                         // first, get the right DescriptionElementBase
 212                                         for (DescriptionElementBase deb : td.getElements()) {
 213                                                 if (deb.getFeature().equals(winnerFeature)) {
 214                             debConcerned = deb;
 215                         }
 216                                         }
 217
 218                                         if (debConcerned!=null) {
 219                                                 // a map is created, the key being the set of taxa that present the state(s) stored in the corresponding value
 220                                                 Map<Set<TaxonDescription>,List<State>> taxonStatesMap = determineCategoricalStates(statesDone,(CategoricalData)debConcerned,winnerFeature,taxaCovered);
 221                                                 // if the merge option is ON, branches with the same discriminative power will be merged (see Vignes & Lebbes, 1989)
 222                                                 if (merge){
 223                                                         // creates a map between the different states of the winnerFeature and the sets of states "incompatible" with them
 224                                                         Map<State,Set<State>> exclusions = new HashMap<State,Set<State>>();
 225                                                         featureScoreAndMerge(winnerFeature,taxaCovered,exclusions);
 226
 227                                                         while (!exclusions.isEmpty()){
 228                                                                 // looks for the largest clique, i.e. the state with less exclusions
 229                                                                 List<State> clique = returnBestClique(exclusions);
 230                                                                 // then merges the corresponding branches
 231                                                                 mergeBranches(clique,taxonStatesMap);
 232                                                         }
 233                                                 }
 234                                                 if (taxonStatesMap!=null && !taxonStatesMap.isEmpty()) {
 235                                                         for (Map.Entry<Set<TaxonDescription>,List<State>> entry : taxonStatesMap.entrySet()){
 236                                                                 Set<TaxonDescription> newTaxaCovered = entry.getKey();
 237                                                                 List<State> listOfStates = entry.getValue();
 238                                                                 if ((newTaxaCovered.size()>0) && !((newTaxaCovered.size()==taxaCovered.size()) && !taxaDiscriminated)){ // if the taxa are discriminated compared to those of the father node, a child is created
 239                                                                         childrenExist = true;
 240                                                                         PolytomousKeyNode son = PolytomousKeyNode.NewInstance();
 241                                                                         StringBuilder questionLabel = new StringBuilder();
 242                                                                         numberOfStates = listOfStates.size()-1;
 243                                                                         for (State state : listOfStates) {
 244                                                                                 if (dependenciesON){
 245                                                                                         // if the dependencies are considered, removes and adds the right features from/to the list of features left
 246                                                                                         // these features are stored in order to be put back again when the current branch is finished
 247                                                                                         if (iIdependencies.get(state)!= null) {
 248                                                 innapplicables.addAll(iIdependencies.get(state));
 249                                             }
 250                                                                                         if (oAIdependencies.get(state)!= null) {
 251                                                 applicables.addAll(oAIdependencies.get(state));
 252                                             }
 253                                                                                         for (Feature feature : innapplicables) {
 254                                                 featuresLeft.remove(feature);
 255                                             }
 256                                                                                         for (Feature feature : applicables) {
 257                                                 featuresLeft.add(feature);
 258                                             }
 259                                                                                 }
 260                                                                                 questionLabel.append(state.getLabel());
 261                                                                                 if (listOfStates.lastIndexOf(state)!=numberOfStates)
 262                                          {
 263                                             questionLabel.append(separator); // append a separator after each state except for the last one
 264                                         }
 265                                                                         }
 266                                                                         // old code used when PolytomousKey extended FeatureTree
 267                                                                         //                                                                      Representation question = new Representation(null, questionLabel.toString(),null, Language.DEFAULT());
 268                                                                         //                                                                      son.addQuestion(question);
 269                                                                         KeyStatement statement = KeyStatement.NewInstance(questionLabel.toString());
 270                                                                         son.setStatement(statement);
 271                                                                         son.setFeature(winnerFeature);
 272                                                                         father.addChild(son);
 273                                                                         featuresLeft.remove(winnerFeature); // unlike for quantitative features, once a categorical one has been used, it cannot be reused in the same branch
 274                                                                         boolean areTheTaxaDiscriminated;
 275                                                                         if (newTaxaCovered.size()==taxaCovered.size()) {
 276                                         areTheTaxaDiscriminated = true;
 277                                     } else {
 278                                         areTheTaxaDiscriminated = false;
 279                                     }
 280                                                                         buildBranches(son,featuresLeft, newTaxaCovered,areTheTaxaDiscriminated);
 281                                                                 }
 282                                                         }
 283                                                 }
 284                                         }
 285                                 }
 286                         }
 287                         // the features depending on other features are added/removed to/from the features left once the branch is done
 288                         if (dependenciesON){
 289                                 for (Feature feature : innapplicables) {
 290                     featuresLeft.add(feature);
 291                 }
 292                                 for (Feature feature : applicables) {
 293                     featuresLeft.remove(feature);
 294                 }
 295                         }
 296                         // the winner features are put back to the features left once the branch is done
 297                         featuresLeft.add(winnerFeature);
 298                 }
 299                 // if the node is a leaf, the statement contains the list of taxa to which the current leaf leads.
 300                 if (!childrenExist){
 301                         KeyStatement fatherStatement = father.getStatement();
 302                         if (fatherStatement!=null){
 303                                 String statementString = fatherStatement.getLabelText(Language.DEFAULT());
 304                                 if (statementString !=null && taxaCovered != null){
 305                                         String label = statementString + " --> " + taxaCovered.toString();
 306                                         fatherStatement.putLabel(Language.DEFAULT(), label);
 307                                 }
 308                         }
 309                 }
 310         }
 311
 312
 313
 314         /**
 315          * Dependencies can be seen as a hierarchy.
 316          * If the dependencies are on, this function calculates the score of all children features and give the highest score
 317          * of these to their father (of course, if its score is lower). This way, if a feature has a good score but is
 318          * "onlyApplicableIf" or "InapplicableIf", the feature it depends can be chosen in order to build a better key.
 319          *
 320          * @param scoreMap
 321          * @param featuresLeft
 322          * @param coveredTaxa
 323          * @param quantitativeFeaturesThresholds
 324          */
 325         private void dependenciesScores(Map<Feature,Float> scoreMap, List<Feature> featuresLeft,Set<TaxonDescription> coveredTaxa, Map<Feature,Float> quantitativeFeaturesThresholds){
 326                 // first copies the list of features left
 327                 List<Feature> pseudoFeaturesLeft = featuresLeft.subList(0, featuresLeft.size()-1);
 328                 // then adds all the features which depend on features contained in the list
 329                 for (Feature feature : pseudoFeaturesLeft){
 330                         if (featureDependencies.containsKey(feature)){
 331                                 for (Feature feature2 : featureDependencies.get(feature)){
 332                                         if (!pseudoFeaturesLeft.contains(feature2)){
 333                                                 pseudoFeaturesLeft.add(feature2);
 334                                         }
 335                                 }
 336                         }
 337                 }
 338                 // then calculates the scores of all features that have been added
 339                 Map<Feature,Float> newScoreMap = featureScores(pseudoFeaturesLeft, coveredTaxa, quantitativeFeaturesThresholds);
 340                 for (Feature feature : featureDependencies.keySet()){
 341                         if (newScoreMap.containsKey(feature)){
 342                                 for (Feature feature2 : featureDependencies.get(feature)){
 343                                         if (newScoreMap.containsKey(feature2)) {
 344                                                 // if a feature has a better than the "father" feature it depends on
 345                                                 if (newScoreMap.get(feature2)>newScoreMap.get(feature)){
 346                                                         // the "father" feature gets its score
 347                                                         scoreMap.put(feature, newScoreMap.get(feature2));
 348                                                 }
 349                                         }
 350                                 }
 351                         }
 352                 }
 353         }
 354
 355
 356         /**
 357          * This function merges branches of the key belonging to the same clique
 358          * (http://en.wikipedia.org/wiki/Clique_%28graph_theory%29)
 359          *
 360          * @param clique the list of States linked together (i.e. if merged have the same score)
 361          * @param taxonStatesMap the map between the taxa (keys) and the states (keys) leading to them
 362          */
 363         private void mergeBranches(List<State> clique, Map<Set<TaxonDescription>, List<State>> taxonStatesMap){
 364                 boolean stateFound;
 365                 Map.Entry<Set<TaxonDescription>,List<State>> firstBranch=null;
 366                 List<Set<TaxonDescription>> tdToDelete = new ArrayList<Set<TaxonDescription>>();
 367
 368                 if (clique.size()>1){
 369                         Iterator<Map.Entry<Set<TaxonDescription>, List<State>>> it1 = taxonStatesMap.entrySet().iterator();
 370                         while (it1.hasNext()){
 371                                 Map.Entry<Set<TaxonDescription>,List<State>> branch = it1.next();
 372                                 Iterator<State> stateIterator = clique.iterator();
 373                                 stateFound=false;
 374                                 // looks for one state of the clique in this branch
 375                                 while(stateIterator.hasNext() && stateFound!=true) {
 376                                         State state = stateIterator.next();
 377                                         if (branch.getValue().contains(state)) {
 378                                                 stateFound=true;
 379                                         }
 380                                 }
 381                                 // if one state is found...
 382                                 if (stateFound==true){
 383                                         // ...for the first time, the branch becomes the one to which the others will be merged
 384                                         if (firstBranch==null){
 385                                                 firstBranch=branch;
 386                                         }
 387                                         // ... else the branch is merged to the first one.
 388                                         else {
 389                                                 firstBranch.getKey().addAll(branch.getKey());
 390                                                 firstBranch.getValue().addAll(branch.getValue());
 391                                                 tdToDelete.add(branch.getKey());
 392                                         }
 393                                 }
 394                         }
 395                         // once this is done, the branches merged to the first one are deleted
 396                         for (Set<TaxonDescription> td : tdToDelete){
 397                                 taxonStatesMap.remove(td);
 398                         }
 399                 }
 400         }
 401
 402
 403
 404         /**
 405          * This function looks for the largest clique of States
 406          * (http://en.wikipedia.org/wiki/Clique_%28graph_theory%29)
 407          * from the map of exclusions.
 408          *
 409          * @param exclusions map a state (key) to the set of states (value) which can not be merged with it without decreasing its score.
 410          * @return
 411          */
 412         private List<State> returnBestClique (Map<State,Set<State>> exclusions){
 413                 int bestNumberOfExclusions=-1;;
 414                 int numberOfExclusions;
 415                 List<State> clique = new ArrayList<State>();
 416
 417                 // looks for the largest clique, i.e. the state with less exclusions
 418                 State bestState=null;
 419                 for (Iterator<Map.Entry<State,Set<State>>> it1 = exclusions.entrySet().iterator() ; it1.hasNext();){
 420                         Map.Entry<State,Set<State>> pair = it1.next();
 421                         numberOfExclusions = pair.getValue().size();
 422                         if ((bestNumberOfExclusions==-1) || numberOfExclusions<bestNumberOfExclusions) {
 423                                 bestNumberOfExclusions=numberOfExclusions;
 424                                 bestState = pair.getKey();
 425                         }
 426                 }
 427
 428                 clique.add(bestState);
 429                 exclusions.remove(bestState);
 430
 431                 boolean isNotExcluded;
 432                 // then starts building the clique by adding the states which do not exclude each other
 433                 for (Iterator<Map.Entry<State,Set<State>>> iterator = exclusions.entrySet().iterator() ; iterator.hasNext();){
 434                         Map.Entry<State,Set<State>> pair = iterator.next();
 435                         isNotExcluded = true;
 436                         for (State state : clique) {
 437                                 if (pair.getValue().contains(state)) {
 438                     isNotExcluded = false;
 439                 }
 440                         }
 441                         if (isNotExcluded){
 442                                 clique.add(pair.getKey());
 443                         }
 444                 }
 445                 for (State state : clique) {
 446                         exclusions.remove(state);
 447                 }
 448                 return clique;
 449         }
 450
 451
 452         /**
 453          * fills a map of the sets of taxa (key) presenting the different states (value) for the given feature.
 454          *
 455          * @param statesDone the list of states already done for this feature
 456          * @param categoricalData the element from which the states are extracted
 457          * @param feature the feature corresponding to the CategoricalData
 458          * @param taxaCovered the base of taxa considered
 459          * @return
 460          */
 461         private Map<Set<TaxonDescription>,List<State>> determineCategoricalStates(List<State> statesDone, CategoricalData categoricalData, Feature feature, Set<TaxonDescription> taxaCovered){
 462                 Map<Set<TaxonDescription>,List<State>> childrenStatesMap = new HashMap<Set<TaxonDescription>,List<State>>();
 463
 464                 List<StateData> stateDatas = categoricalData.getStateData();
 465
 466                 List<State> states = new ArrayList<State>();
 467                 for (StateData sd : stateDatas){
 468                         states.add(sd.getState());
 469                 }
 470
 471                 for (State featureState : states){ // for each state
 472                         if(!statesDone.contains(featureState)){ // if the state hasn't already be considered
 473                                 statesDone.add(featureState);
 474                                 Set<TaxonDescription> newTaxaCovered = whichTaxa(feature,featureState,taxaCovered); //gets which taxa present this state
 475                                 List<State> newStates = childrenStatesMap.get(newTaxaCovered);
 476                                 if (newStates==null) { // if no states are associated to these taxa, create a new list
 477                                         newStates = new ArrayList<State>();
 478                                         childrenStatesMap.put(newTaxaCovered,newStates);
 479                                 }
 480                                 newStates.add(featureState); // then add the state to the list
 481                         }
 482                 }
 483                 return childrenStatesMap;
 484         }
 485
 486
 487         /**
 488          * returns the list of taxa from previously covered taxa, which have the state featureState for the given feature
 489          *
 490          * @param feature
 491          * @param featureState
 492          * @param taxaCovered
 493          * @return
 494          */
 495         private Set<TaxonDescription> whichTaxa(Feature feature, State featureState, Set<TaxonDescription> taxaCovered){
 496                 Set<TaxonDescription> newCoveredTaxa = new HashSet<TaxonDescription>();
 497                 for (TaxonDescription td : taxaCovered){
 498                         Set<DescriptionElementBase> elements = td.getElements();
 499                         for (DescriptionElementBase deb : elements){
 500                                 if (deb.isInstanceOf(CategoricalData.class)) {
 501                                         if (deb.getFeature().equals(feature)) {
 502                                                 List<StateData> stateDatas = ((CategoricalData)deb).getStateData();
 503                                                 for (StateData sd : stateDatas) {
 504                                                         if (sd.getState().equals(featureState)){
 505                                                                 newCoveredTaxa.add(td);
 506                                                         }
 507                                                 }
 508                                         }
 509                                 }
 510                         }
 511                 }
 512                 return newCoveredTaxa;
 513         }
 514
 515
 516         /**
 517          * This function returns the feature with the highest score. However, if several features have the same score
 518          * the one wich leads to less options is chosen (this way, the key is easier to read).
 519          *
 520          * @param nTaxa
 521          * @param scores
 522          * @param taxaCovered
 523          * @return
 524          */
 525         private Feature lessStatesWinner(Map<Feature,Float> scores, Set<TaxonDescription> taxaCovered){
 526                 int nTaxa = taxaCovered.size();
 527                 if (nTaxa==1) {
 528             return null;
 529         }
 530                 float meanScore = defaultMeanScore(nTaxa);
 531                 float bestScore = nTaxa*nTaxa;
 532                 List<Feature> bestFeatures = new ArrayList<Feature>(); // if ever different features have the best score, they are put in this list
 533                 Feature bestFeature = null;
 534                 Iterator<Map.Entry<Feature,Float>> it = scores.entrySet().iterator();
 535                 float newScore;
 536                 while (it.hasNext()){
 537                         Map.Entry<Feature,Float> pair = it.next();
 538                         if (pair.getValue()!=null){
 539                                 newScore = Math.abs(pair.getValue()-meanScore);// the best score is the closest to the score (meanScore here)
 540                                 // a feature would have if it divided the taxa in two equal parts
 541                                 if (newScore < bestScore){
 542                                         bestFeatures.clear();
 543                                         bestFeatures.add(pair.getKey());
 544                                         bestScore = newScore;
 545                                 }else if (newScore==bestScore){
 546                                         bestFeatures.add(pair.getKey());
 547                                 }
 548                         }
 549                 }
 550                 if (bestFeatures.size()==1) { // if there is only one feature with the best score, pick it
 551                         return bestFeatures.get(0);
 552                 }
 553                 else { // else choose the one with less states
 554                         int lessStates=-1;
 555                         int numberOfDifferentStates=-1;
 556                         for (Feature feature : bestFeatures){
 557                                 if (feature.isSupportsCategoricalData()){
 558                                         Set<State> differentStates = new HashSet<State>();
 559                                         for (TaxonDescription td : taxaCovered){
 560                                                 Set<DescriptionElementBase> elements = td.getElements();
 561                                                 for (DescriptionElementBase deb : elements){
 562                                                         if (deb.isInstanceOf(CategoricalData.class)) {
 563                                                                 CategoricalData catdat = (CategoricalData)deb;
 564                                                                 if (catdat.getFeature().equals(feature)) {
 565                                                                         List<StateData> stateDatas = catdat.getStateData();
 566                                                                         for (StateData sd : stateDatas) {
 567                                                                                 differentStates.add(sd.getState());
 568                                                                         }
 569                                                                 }
 570                                                         }
 571                                                 }
 572                                         }
 573                                         numberOfDifferentStates=differentStates.size();
 574                                 }else if (feature.isSupportsQuantitativeData()){
 575                                         numberOfDifferentStates=2;
 576                                 }
 577                                 if (lessStates==-1 || numberOfDifferentStates<lessStates){
 578                                         lessStates=numberOfDifferentStates;
 579                                         bestFeature = feature;
 580                                 }
 581                         }
 582                         return bestFeature;
 583                 }
 584         }
 585
 586
 587         /**
 588          * This function calculates the mean score, i.e. the score a feature dividing the taxa in two equal parts
 589          * would have.
 590          *
 591          * @param nTaxa
 592          * @return
 593          */
 594         private float defaultMeanScore(int nTaxa){
 595                 int i;
 596                 float score=0;
 597                 for (i=1;i<nTaxa;i++){
 598                         score = score + Math.round(i+1/2);
 599                 }
 600                 return score;
 601         }
 602
 603
 604         /**
 605          * This function fills the map of features (keys) with their respecting scores (values)
 606          *
 607          * @param featuresLeft
 608          * @param coveredTaxa
 609          * @param quantitativeFeaturesThresholds
 610          * @return
 611          */
 612         private Map<Feature,Float> featureScores(List<Feature> featuresLeft, Set<TaxonDescription> coveredTaxa, Map<Feature,Float> quantitativeFeaturesThresholds){
 613                 Map<Feature,Float> scoreMap = new HashMap<Feature,Float>();
 614                 for (Feature feature : featuresLeft){
 615                         if (feature.isSupportsCategoricalData()) {
 616                                 scoreMap.put(feature, categoricalFeatureScore(feature,coveredTaxa));
 617                         }
 618                         if (feature.isSupportsQuantitativeData()){
 619                                 scoreMap.put(feature, quantitativeFeatureScore(feature,coveredTaxa, quantitativeFeaturesThresholds));
 620                         }
 621                 }
 622                 return scoreMap;
 623         }
 624
 625         /**
 626          * Since Quantitative features do not have states, unlike Categorical ones, this function determines which taxa,
 627          * for a given quantitative feature, present either a lower or higher value than a given threshold.
 628          * It returns two Sets of TaxonDescription, one with the taxa under this threshold (taxaBefore) and another one
 629          * with the taxa over (taxaAfter).
 630          *
 631          * @param threshold
 632          * @param feature
 633          * @param taxa
 634          * @param unit
 635          * @return
 636          */
 637         private List<Set<TaxonDescription>> determineQuantitativeStates (Float threshold, Feature feature, Set<TaxonDescription> taxa, StringBuilder unit){
 638                 List<Set<TaxonDescription>> list = new ArrayList<Set<TaxonDescription>>();
 639                 Set<TaxonDescription> taxaBefore = new HashSet<TaxonDescription>();
 640                 Set<TaxonDescription> taxaAfter = new HashSet<TaxonDescription>();
 641                 list.add(taxaBefore);
 642                 list.add(taxaAfter);
 643                 for (TaxonDescription td : taxa){
 644                         Set<DescriptionElementBase> elements = td.getElements();
 645                         for (DescriptionElementBase deb : elements){
 646                                 if (deb.getFeature().equals(feature)) {
 647                                         if (deb.isInstanceOf(QuantitativeData.class)) {
 648                                                 QuantitativeData qd = (QuantitativeData)deb;
 649                                                 if (unit.toString().equals("") && qd.getUnit()!=null && qd.getUnit().getLabel()!=null){
 650                                                         unit.append(" " + qd.getUnit().getLabel());
 651                                                 }
 652                                                 Set<StatisticalMeasurementValue> values = qd.getStatisticalValues();
 653                                                 for (StatisticalMeasurementValue smv : values){
 654                                                         StatisticalMeasure type = smv.getType();
 655                                                         //TODO DONT FORGET sample size, MEAN etc
 656                                                         if (type.equals(StatisticalMeasure.MAX()) || type.equals(StatisticalMeasure.TYPICAL_UPPER_BOUNDARY()) || type.equals(StatisticalMeasure.AVERAGE())) {
 657                                                                 if (smv.getValue()>threshold){
 658                                                                         taxaAfter.add(td);
 659                                                                 }
 660                                                         }
 661                                                         if (type.equals(StatisticalMeasure.MIN()) || type.equals(StatisticalMeasure.TYPICAL_LOWER_BOUNDARY()) || type.equals(StatisticalMeasure.AVERAGE())) {
 662                                                                 if (smv.getValue()<=threshold){
 663                                                                         taxaBefore.add(td);
 664                                                                 }
 665                                                         }
 666                                                 }
 667                                         }
 668                                 }
 669                         }
 670                 }
 671                 return list;
 672         }
 673
 674
 675
 676         /**
 677          * This function returns the score of a quantitative feature.
 678          *
 679          * @param feature
 680          * @param coveredTaxa
 681          * @param quantitativeFeaturesThresholds
 682          * @return
 683          */
 684         private float quantitativeFeatureScore(Feature feature, Set<TaxonDescription> coveredTaxa, Map<Feature,Float> quantitativeFeaturesThresholds){
 685                 List<Float> allValues = new ArrayList<Float>();
 686                 boolean lowerboundarypresent;
 687                 boolean upperboundarypresent;
 688                 float lowerboundary=0;
 689                 float upperboundary=0;
 690                 for (TaxonDescription td : coveredTaxa){
 691                         Set<DescriptionElementBase> elements = td.getElements();
 692                         for (DescriptionElementBase deb : elements){
 693                                 if (deb.getFeature().equals(feature)) {
 694                                         if (deb.isInstanceOf(QuantitativeData.class)) {
 695                                                 QuantitativeData qd = (QuantitativeData)deb;
 696                                                 Set<StatisticalMeasurementValue> values = qd.getStatisticalValues();
 697                                                 lowerboundarypresent = false;
 698                                                 upperboundarypresent = false;
 699                                                 for (StatisticalMeasurementValue smv : values){
 700                                                         StatisticalMeasure type = smv.getType();
 701                                                         // DONT FORGET sample size, MEAN etc
 702                                                         if (type.equals(StatisticalMeasure.MAX())) {
 703                                                                 upperboundary = smv.getValue();
 704                                                                 upperboundarypresent=true;
 705                                                         }
 706                                                         if (type.equals(StatisticalMeasure.MIN())) {
 707                                                                 lowerboundary = smv.getValue();
 708                                                                 lowerboundarypresent=true;
 709                                                         }
 710                                                         if (type.equals(StatisticalMeasure.TYPICAL_UPPER_BOUNDARY()) && upperboundarypresent==false) {
 711                                                                 upperboundary = smv.getValue();
 712                                                                 upperboundarypresent=true;
 713                                                         }
 714                                                         if (type.equals(StatisticalMeasure.TYPICAL_LOWER_BOUNDARY()) && lowerboundarypresent==false) {
 715                                                                 lowerboundary = smv.getValue();
 716                                                                 lowerboundarypresent=true;
 717                                                         }
 718                                                         if (type.equals(StatisticalMeasure.AVERAGE()) && upperboundarypresent==false && lowerboundarypresent==false) {
 719                                                                 lowerboundary = smv.getValue();
 720                                                                 upperboundary = lowerboundary;
 721                                                                 lowerboundarypresent=true;
 722                                                                 upperboundarypresent=true;
 723                                                         }
 724                                                 }
 725                                                 if (lowerboundarypresent && upperboundarypresent) {
 726                                                         allValues.add(lowerboundary);
 727                                                         allValues.add(upperboundary);
 728                                                 }
 729                                         }
 730                                 }
 731                         }
 732                 }
 733                 int i,j;
 734                 float threshold=0;
 735                 float bestThreshold=0;
 736                 int difference=allValues.size();
 737                 int differenceMin = difference;
 738                 int taxaBefore=0;
 739                 int taxaAfter=0;
 740                 for (i=0;i<allValues.size()/2;i++) {
 741                         threshold = allValues.get(i*2+1);
 742                         taxaBefore=0;
 743                         taxaAfter=0;
 744                         for (j=0;j<allValues.size()/2;j++) {
 745                                 if (allValues.get(j*2+1)<=threshold){
 746                                         taxaBefore++;
 747                                 }
 748                                 if (allValues.get(j*2)>threshold){
 749                                         taxaAfter++;
 750                                 }
 751                         }
 752                         difference = Math.abs(taxaBefore-taxaAfter);
 753                         if (difference<differenceMin){
 754                                 differenceMin=difference;
 755                                 bestThreshold = threshold;
 756                         }
 757                 }
 758                 quantitativeFeaturesThresholds.put(feature, bestThreshold);
 759                 int defaultQuantitativeScore=0;
 760                 for (i=0;i<taxaBefore;i++) {
 761                         defaultQuantitativeScore += taxaAfter - i;
 762                 }
 763                 return (defaultQuantitativeScore);
 764         }
 765
 766
 767
 768         /**
 769          * This function returns the score of a categorical feature.
 770          *
 771          * @param feature
 772          * @param coveredTaxa
 773          * @return
 774          */
 775         private float categoricalFeatureScore(Feature feature, Set<TaxonDescription> coveredTaxa){
 776                 int i,j;
 777                 float score =0;
 778                 float power=0;
 779                 TaxonDescription[] coveredTaxaArray = coveredTaxa.toArray(new TaxonDescription[coveredTaxa.size()]); // I did not figure a better way to do this
 780                 for (i=0 ; i<coveredTaxaArray.length; i++){
 781                         Set<DescriptionElementBase> elements1 = coveredTaxaArray[i].getElements();
 782                         DescriptionElementBase deb1 = null;
 783                         for (DescriptionElementBase deb : elements1){
 784                                 if (deb.getFeature().equals(feature)){
 785                                         deb1 = deb; // finds the DescriptionElementBase corresponding to the concerned Feature
 786                                 }
 787                         }
 788                         for (j=i+1 ; j< coveredTaxaArray.length ; j++){
 789                                 Set<DescriptionElementBase> elements2 = coveredTaxaArray[j].getElements();
 790                                 DescriptionElementBase deb2 = null;
 791                                 for (DescriptionElementBase deb : elements2){
 792                                         if (deb.getFeature().equals(feature)){
 793                                                 deb2 = deb; // finds the DescriptionElementBase corresponding to the concerned Feature
 794                                         }
 795                                 }
 796                                 power = defaultPower(deb1,deb2);
 797                                 score = score + power;
 798                         }
 799                 }
 800                 return score;
 801         }
 802
 803
 804         /**
 805          * This recursive function fills the maps of dependencies by reading the tree containing the dependencies.
 806          *
 807          * @param node
 808          */
 809         private void checkDependencies(FeatureNode<Feature> node){
 810                 if (node.getOnlyApplicableIf()!=null){
 811                         Set<State> addToOAI = node.getOnlyApplicableIf();
 812                         for (State state : addToOAI){
 813                                 if (oAIdependencies.containsKey(state)) {
 814                     oAIdependencies.put(state, new HashSet<Feature>());
 815                 }
 816                                 oAIdependencies.get(state).add(node.getTerm());
 817                         }
 818                 }
 819                 if (node.getInapplicableIf()!=null){
 820                         Set<State> addToiI = node.getInapplicableIf();
 821                         for (State state : addToiI){
 822                                 if (iIdependencies.containsKey(state)) {
 823                     iIdependencies.put(state, new HashSet<Feature>());
 824                 }
 825                                 iIdependencies.get(state).add(node.getTerm());
 826                         }
 827                 }
 828                 if (node.getChildNodes()!=null) {
 829                         for (FeatureNode fn : node.getChildNodes()){
 830                                 checkDependencies(fn);
 831                         }
 832                 }
 833         }
 834
 835
 836
 837         /**
 838          * This function fills the exclusions map.
 839          *
 840          * @param feature
 841          * @param coveredTaxa
 842          * @param exclusions
 843          * @return
 844          */
 845         private float featureScoreAndMerge(Feature feature, Set<TaxonDescription> coveredTaxa, Map<State,Set<State>> exclusions){
 846                 int i,j;
 847                 float score =0;
 848                 float power=0;
 849                 TaxonDescription[] coveredTaxaArray = coveredTaxa.toArray(new TaxonDescription[coveredTaxa.size()]); // I did not figure a better way to do this
 850                 for (i=0 ; i<coveredTaxaArray.length; i++){
 851                         Set<DescriptionElementBase> elements1 = coveredTaxaArray[i].getElements();
 852                         DescriptionElementBase deb1 = null;
 853                         for (DescriptionElementBase deb : elements1){
 854                                 if (deb.getFeature().equals(feature)){
 855                                         deb1 = deb; // finds the DescriptionElementBase corresponding to the concerned Feature
 856                                 }
 857                         }
 858                         for (j=i+1 ; j< coveredTaxaArray.length ; j++){
 859                                 Set<DescriptionElementBase> elements2 = coveredTaxaArray[j].getElements();
 860                                 DescriptionElementBase deb2 = null;
 861                                 for (DescriptionElementBase deb : elements2){
 862                                         if (deb.getFeature().equals(feature)){
 863                                                 deb2 = deb; // finds the DescriptionElementBase corresponding to the concerned Feature
 864                                         }
 865                                 }
 866                                 power = defaultPower(deb1,deb2);
 867                                 score = score + power;
 868                                 if (power>0) // if there is no state in common between deb1 and deb2
 869                                 {
 870                                         CategoricalData cat1 = (CategoricalData)deb1;
 871                                         CategoricalData cat2 = (CategoricalData)deb2;
 872                                         for (StateData statedata1 : cat1.getStateData()){
 873                                                 State state1 = statedata1.getState();
 874                                                 if (!exclusions.containsKey(state1)){
 875                                                         exclusions.put(state1, new HashSet<State>());
 876                                                 }
 877                                                 for (StateData statedata2 : cat2.getStateData()){
 878                                                         State state2 = statedata2.getState();
 879                                                         if (!exclusions.containsKey(state2)){
 880                                                                 exclusions.put(state2, new HashSet<State>());
 881                                                         }
 882                                                         exclusions.get(state1).add(state2);
 883                                                         exclusions.get(state2).add(state1);
 884                                                 }
 885                                         }
 886                                 }
 887                         }
 888                 }
 889                 return score;
 890         }
 891
 892
 893
 894         /**
 895          * Returns the score between two DescriptionElementBase. If one of them is null, returns -1.
 896          * If they are not of the same type (Categorical) returns 0.
 897          *
 898          * @param deb1
 899          * @param deb2
 900          * @return
 901          */
 902         private float defaultPower(DescriptionElementBase deb1, DescriptionElementBase deb2){
 903                 if (deb1==null || deb2==null) {
 904                         return -1; //what if the two taxa don't have this feature in common ?
 905                 }
 906                 if ((deb1.isInstanceOf(CategoricalData.class))&&(deb2.isInstanceOf(CategoricalData.class))) {
 907                         return defaultCategoricalPower((CategoricalData)deb1, (CategoricalData)deb2);
 908                 } else {
 909             return 0;
 910         }
 911         }
 912
 913         /**
 914          * Returns the score of a categorical feature.
 915          *
 916          * @param deb1
 917          * @param deb2
 918          * @return
 919          */
 920         private float defaultCategoricalPower(CategoricalData deb1, CategoricalData deb2){
 921                 List<StateData> states1 = deb1.getStateData();
 922                 List<StateData> states2 = deb2.getStateData();
 923                 boolean bool = false;
 924                 Iterator<StateData> stateData1Iterator = states1.iterator() ;
 925                 //              while (!bool && stateData1Iterator.hasNext()) {
 926                 //                      Iterator<StateData> stateData2Iterator = states2.iterator() ;
 927                 //                      StateData stateData1 = stateData1Iterator.next();
 928                 //                      while (!bool && stateData2Iterator.hasNext()) {
 929                 //                              bool = stateData1.getState().equals(stateData2Iterator.next().getState()); // checks if the states are the same
 930                 //                      }
 931                 //              }
 932                 // one point each time two taxa can be discriminated for a given feature
 933
 934                 boolean checkFeature = false;
 935
 936                 if (!featureDependencies.containsKey(deb1.getFeature())){
 937                         featureDependencies.put(deb1.getFeature(), new HashSet<Feature>());
 938                         checkFeature = true;
 939                 }
 940
 941                 while (stateData1Iterator.hasNext()) {
 942                         Iterator<StateData> stateData2Iterator = states2.iterator() ;
 943                         StateData stateData1 = stateData1Iterator.next();
 944                         State state1 = stateData1.getState();
 945                         if (checkFeature){
 946                                 if (iIdependencies.get(state1)!=null) {
 947                                         featureDependencies.get(deb1.getFeature()).addAll(iIdependencies.get(state1));
 948                                 }
 949                                 if (oAIdependencies.get(state1)!=null) {
 950                                         featureDependencies.get(deb1.getFeature()).addAll(oAIdependencies.get(state1));
 951                                 }
 952                         }
 953                         while (stateData2Iterator.hasNext()) {
 954                                 State state2 = stateData1.getState();
 955                                 bool = bool || state1.equals(state2); // checks if the states are the same
 956                         }
 957                 }
 958
 959
 960                 if (bool) {
 961             return 0;
 962         } else {
 963             return 1;
 964         }
 965         }
 966
 967         // old code used when PolytomousKey extended FeatureTree
 968         //      private void printTree(List<PolytomousKeyNode> fnodes, String spaces){
 969         //              if (!fnodes.isEmpty()){
 970         //                      level++;
 971         //                      int levelcopy = level;
 972         //                      int j=1;
 973         //                      String delimiter;
 974         //                      String equals = " = ";
 975         //                      String quantitative = "";
 976         //                      String newspaces = spaces.concat("\t");
 977         //                      for (PolytomousKeyNode fnode : fnodes){
 978         //                              if (fnode.getFeature()!=null) {
 979         //                                      String state = null;
 980         //                                      if (fnode.getQuestion(Language.DEFAULT())!=null) state = fnode.getQuestion(Language.DEFAULT()).getLabel();
 981         //                                      if (fnode.getFeature().isSupportsQuantitativeData()) delimiter = quantitative;
 982         //                                      else delimiter = equals;
 983         //                                      System.out.println(newspaces + levelcopy + " : " + j + " " + fnode.getFeature().getLabel() + delimiter + state);
 984         //                                      j++;
 985         //                              }
 986         //                              else { // TODO never read ?
 987         //                                      if (fnode.getQuestion(Language.DEFAULT())!=null) System.out.println(newspaces + "-> " + fnode.getQuestion(Language.DEFAULT()).getLabel());
 988         //                              }
 989         //                              printTree(fnode.getChildren(),newspaces);
 990         //                      }
 991         //              }
 992         //      }
 993
 994 }
 995