ref #8162 move FeatureTree and FeatureNode to term package
[cdmlib.git] / cdmlib-model / src / main / java / eu / etaxonomy / cdm / strategy / generate / PolytomousKeyGenerator.java
1 package eu.etaxonomy.cdm.strategy.generate;
2
3 import java.util.ArrayList;
4 import java.util.HashMap;
5 import java.util.HashSet;
6 import java.util.Iterator;
7 import java.util.List;
8 import java.util.Map;
9 import java.util.Set;
10
11 import eu.etaxonomy.cdm.model.common.Language;
12 import eu.etaxonomy.cdm.model.description.CategoricalData;
13 import eu.etaxonomy.cdm.model.description.DescriptionElementBase;
14 import eu.etaxonomy.cdm.model.description.Feature;
15 import eu.etaxonomy.cdm.model.description.KeyStatement;
16 import eu.etaxonomy.cdm.model.description.PolytomousKey;
17 import eu.etaxonomy.cdm.model.description.PolytomousKeyNode;
18 import eu.etaxonomy.cdm.model.description.QuantitativeData;
19 import eu.etaxonomy.cdm.model.description.State;
20 import eu.etaxonomy.cdm.model.description.StateData;
21 import eu.etaxonomy.cdm.model.description.StatisticalMeasure;
22 import eu.etaxonomy.cdm.model.description.StatisticalMeasurementValue;
23 import eu.etaxonomy.cdm.model.description.TaxonDescription;
24 import eu.etaxonomy.cdm.model.term.FeatureNode;
25 import eu.etaxonomy.cdm.model.term.FeatureTree;
26
27 /**
28 * @author m.venin
29 *
30 */
31 public class PolytomousKeyGenerator {
32
33 static int level=-1; // global variable needed by the printTree function in order to store the level which is being printed
34 private PolytomousKey polytomousKey; // the Identification Key
35 private List<Feature> features; // the features used to generate the key
36 private Set<TaxonDescription> taxa; // the base of taxa
37
38 private boolean merge=true; // if this boolean is set to true, branches of the tree will be merged if the corresponding states can be used together without decreasing their score
39
40 private FeatureTree dependenciesTree; // the tree containing the dependencies between states and features (InapplicableIf and OnlyApplicableIf)
41 private Map<State,Set<Feature>> iIdependencies = new HashMap<>(); // map of a set of Features (value) inapplicables if a State (key) is present
42 private Map<State,Set<Feature>> oAIdependencies = new HashMap<>(); // map of a set of Features (value) only applicables if a State (key) is present
43 private Map<Feature,Set<Feature>> featureDependencies = new HashMap<>(); // map of all the sets of features (values) which have dependencies with states of other features (keys)
44 private boolean dependenciesON = true; // if this boolean is true, the dependencies are taken into account
45
46
47 /**
48 * These strings are used for generating the statements of the key.
49 */
50 private String before="<";
51 private String after=">";
52 private String separator = " or ";
53
54 /**
55 * Sets the features used to generate the key.
56 *
57 * @param featuresList
58 */
59 public void setFeatures(List<Feature> featuresList){
60 this.features = featuresList;
61 }
62
63 /**
64 * Sets the base of taxa.
65 *
66 * @param featuresList
67 */
68 public void setTaxa(Set<TaxonDescription> taxaSet){
69 this.taxa = taxaSet;
70 }
71
72 /**
73 * Sets the tree containing the dependencies between states and features.
74 *
75 * @param tree
76 */
77 public void setDependencies(FeatureTree tree){
78 this.dependenciesTree = tree;
79 }
80
81 /**
82 * Allows the generator to use the dependencies given by the function "setDependencies".
83 */
84 public void turnDependenciesON(){
85 this.dependenciesON = true;
86 }
87
88 /**
89 * Prevent the generator from using dependencies.
90 */
91 public void turnDependenciesOFF(){
92 this.dependenciesON = false;
93 }
94
95 /**
96 * Allows the generator to merge branches if the corresponding states can be used together without diminishing their score.
97 */
98 public void mergeModeON(){
99 this.merge = true;
100 }
101
102 /**
103 * Prevent the generator from merging branches (apart from those leading to the same set of taxa).
104 */
105 public void mergeModeOFF(){
106 this.merge = false;
107 }
108
109
110 /**
111 * Initializes the function buildBranches() with the starting parameters in order to build the key
112 */
113 private void loop(){
114 polytomousKey = PolytomousKey.NewInstance();
115 PolytomousKeyNode root = polytomousKey.getRoot();
116 buildBranches(root,features,taxa,true);
117 }
118
119
120 /**
121 * Creates the key and prints it
122 */
123 public PolytomousKey invoke(){
124 if (dependenciesON && dependenciesTree!=null){
125 checkDependencies(dependenciesTree.getRoot());
126 }
127 loop();
128 List<PolytomousKeyNode> rootlist = new ArrayList<PolytomousKeyNode>();
129 rootlist.add(polytomousKey.getRoot());
130 // String spaces = new String();
131 // printTree(rootlist,spaces);
132 // System.out.println(paths.toString());
133 return polytomousKey;
134 }
135
136
137 /**
138 * Recursive function that builds the branches of the identification key (FeatureTree)
139 *
140 * @param father the node considered
141 * @param featuresLeft List of features that can be used at this point
142 * @param taxaCovered the taxa left at this point (i.e. that verify the description corresponding to the path leading to this node)
143 * @param taxaAreTheSame if in the previous level the taxa discriminated are the same, this boolean is set to true, thus if the taxa, again, are not discriminated at this level the function stops
144 */
145 private void buildBranches(PolytomousKeyNode father, List<Feature> featuresLeft, Set<TaxonDescription> taxaCovered, boolean taxaDiscriminated){
146 boolean childrenExist = false; // boolean indicating if this node has children, if not, it means we reached a leaf and instead of adding a
147 // question or statement to it, we must add the list of taxa discriminated at this point
148
149 // These sets respectively store the features inapplicables and applicables at this point in the key
150 Set<Feature> innapplicables = new HashSet<Feature>();
151 Set<Feature> applicables = new HashSet<Feature>();
152
153 if (taxaCovered.size()>1) {
154 // this map stores the thresholds giving the best dichotomy of taxa for the corresponding feature supporting quantitative data
155 Map<Feature,Float> quantitativeFeaturesThresholds = new HashMap<Feature,Float>();
156 // the scores of the different features are calculated, the thresholds in the same time
157 Map<Feature,Float> scoreMap = featureScores(featuresLeft, taxaCovered, quantitativeFeaturesThresholds);
158 dependenciesScores(scoreMap,featuresLeft, taxaCovered, quantitativeFeaturesThresholds);
159 // the feature with the best score becomes the one corresponding to the current node
160 Feature winnerFeature = lessStatesWinner(scoreMap, taxaCovered);
161 // the feature is removed from the list of features available to build the next level of the tree
162 featuresLeft.remove(winnerFeature);
163
164 int i;
165 /************** either the feature supports quantitative data... **************/
166 // NB: in this version, "quantitative features" are dealt with in a dichotomous way
167 if (winnerFeature!=null && winnerFeature.isSupportsQuantitativeData()) {
168 // first, get the threshold
169 float threshold = quantitativeFeaturesThresholds.get(winnerFeature);
170 String sign;
171 StringBuilder unit= new StringBuilder("");
172 // then determine which taxa are before and which are after this threshold (dichotomy) in order to create the children of the father node
173 List<Set<TaxonDescription>> quantitativeStates = determineQuantitativeStates(threshold,winnerFeature,taxaCovered,unit);
174 // thus the list contains two sets of TaxonDescription, the first corresponding to those before, the second to those after the threshold
175 for (i=0;i<2;i++) {
176 Set<TaxonDescription> newTaxaCovered = quantitativeStates.get(i);
177 if (i==0){
178 sign = before; // the first element of the list corresponds to taxa before the threshold
179 } else {
180 sign = after; // the second to those after
181 }
182 if (newTaxaCovered.size()>0 && !((newTaxaCovered.size()==taxaCovered.size()) && !taxaDiscriminated)){ // if the taxa are discriminated compared to those of the father node, a child is created
183 childrenExist = true;
184 PolytomousKeyNode son = PolytomousKeyNode.NewInstance();
185 son.setFeature(winnerFeature);
186 // old code used when PolytomousKey extended FeatureTree
187 // Representation question = new Representation(null, " " + sign + " " + threshold +unit,null, Language.DEFAULT()); // the question attribute is used to store the state of the feature
188 // son.addQuestion(question);
189 KeyStatement statement = KeyStatement.NewInstance( " " + sign + " " + threshold +unit); // the question attribute is used to store the state of the feature
190 son.setStatement(statement);
191 father.addChild(son);
192 boolean areTheTaxaDiscriminated;
193 if (newTaxaCovered.size()==taxaCovered.size()) {
194 areTheTaxaDiscriminated = false;
195 } else {
196 areTheTaxaDiscriminated = true;
197 }
198 buildBranches(son,featuresLeft, newTaxaCovered,areTheTaxaDiscriminated);
199 }
200 }
201 }
202
203 /************** ...or it supports categorical data. **************/
204 // "categorical features" may present several different states, each one of these might correspond to one child
205 List<State> statesDone = new ArrayList<State>(); // the list of states already used
206 int numberOfStates;
207 if (winnerFeature!=null && winnerFeature.isSupportsCategoricalData()) {
208 for (TaxonDescription td : taxaCovered){
209 DescriptionElementBase debConcerned = null;
210
211 // first, get the right DescriptionElementBase
212 for (DescriptionElementBase deb : td.getElements()) {
213 if (deb.getFeature().equals(winnerFeature)) {
214 debConcerned = deb;
215 }
216 }
217
218 if (debConcerned!=null) {
219 // a map is created, the key being the set of taxa that present the state(s) stored in the corresponding value
220 Map<Set<TaxonDescription>,List<State>> taxonStatesMap = determineCategoricalStates(statesDone,(CategoricalData)debConcerned,winnerFeature,taxaCovered);
221 // if the merge option is ON, branches with the same discriminative power will be merged (see Vignes & Lebbes, 1989)
222 if (merge){
223 // creates a map between the different states of the winnerFeature and the sets of states "incompatible" with them
224 Map<State,Set<State>> exclusions = new HashMap<State,Set<State>>();
225 featureScoreAndMerge(winnerFeature,taxaCovered,exclusions);
226
227 while (!exclusions.isEmpty()){
228 // looks for the largest clique, i.e. the state with less exclusions
229 List<State> clique = returnBestClique(exclusions);
230 // then merges the corresponding branches
231 mergeBranches(clique,taxonStatesMap);
232 }
233 }
234 if (taxonStatesMap!=null && !taxonStatesMap.isEmpty()) {
235 for (Map.Entry<Set<TaxonDescription>,List<State>> entry : taxonStatesMap.entrySet()){
236 Set<TaxonDescription> newTaxaCovered = entry.getKey();
237 List<State> listOfStates = entry.getValue();
238 if ((newTaxaCovered.size()>0) && !((newTaxaCovered.size()==taxaCovered.size()) && !taxaDiscriminated)){ // if the taxa are discriminated compared to those of the father node, a child is created
239 childrenExist = true;
240 PolytomousKeyNode son = PolytomousKeyNode.NewInstance();
241 StringBuilder questionLabel = new StringBuilder();
242 numberOfStates = listOfStates.size()-1;
243 for (State state : listOfStates) {
244 if (dependenciesON){
245 // if the dependencies are considered, removes and adds the right features from/to the list of features left
246 // these features are stored in order to be put back again when the current branch is finished
247 if (iIdependencies.get(state)!= null) {
248 innapplicables.addAll(iIdependencies.get(state));
249 }
250 if (oAIdependencies.get(state)!= null) {
251 applicables.addAll(oAIdependencies.get(state));
252 }
253 for (Feature feature : innapplicables) {
254 featuresLeft.remove(feature);
255 }
256 for (Feature feature : applicables) {
257 featuresLeft.add(feature);
258 }
259 }
260 questionLabel.append(state.getLabel());
261 if (listOfStates.lastIndexOf(state)!=numberOfStates)
262 {
263 questionLabel.append(separator); // append a separator after each state except for the last one
264 }
265 }
266 // old code used when PolytomousKey extended FeatureTree
267 // Representation question = new Representation(null, questionLabel.toString(),null, Language.DEFAULT());
268 // son.addQuestion(question);
269 KeyStatement statement = KeyStatement.NewInstance(questionLabel.toString());
270 son.setStatement(statement);
271 son.setFeature(winnerFeature);
272 father.addChild(son);
273 featuresLeft.remove(winnerFeature); // unlike for quantitative features, once a categorical one has been used, it cannot be reused in the same branch
274 boolean areTheTaxaDiscriminated;
275 if (newTaxaCovered.size()==taxaCovered.size()) {
276 areTheTaxaDiscriminated = true;
277 } else {
278 areTheTaxaDiscriminated = false;
279 }
280 buildBranches(son,featuresLeft, newTaxaCovered,areTheTaxaDiscriminated);
281 }
282 }
283 }
284 }
285 }
286 }
287 // the features depending on other features are added/removed to/from the features left once the branch is done
288 if (dependenciesON){
289 for (Feature feature : innapplicables) {
290 featuresLeft.add(feature);
291 }
292 for (Feature feature : applicables) {
293 featuresLeft.remove(feature);
294 }
295 }
296 // the winner features are put back to the features left once the branch is done
297 featuresLeft.add(winnerFeature);
298 }
299 // if the node is a leaf, the statement contains the list of taxa to which the current leaf leads.
300 if (!childrenExist){
301 KeyStatement fatherStatement = father.getStatement();
302 if (fatherStatement!=null){
303 String statementString = fatherStatement.getLabelText(Language.DEFAULT());
304 if (statementString !=null && taxaCovered != null){
305 String label = statementString + " --> " + taxaCovered.toString();
306 fatherStatement.putLabel(Language.DEFAULT(), label);
307 }
308 }
309 }
310 }
311
312
313
314 /**
315 * Dependencies can be seen as a hierarchy.
316 * If the dependencies are on, this function calculates the score of all children features and give the highest score
317 * of these to their father (of course, if its score is lower). This way, if a feature has a good score but is
318 * "onlyApplicableIf" or "InapplicableIf", the feature it depends can be chosen in order to build a better key.
319 *
320 * @param scoreMap
321 * @param featuresLeft
322 * @param coveredTaxa
323 * @param quantitativeFeaturesThresholds
324 */
325 private void dependenciesScores(Map<Feature,Float> scoreMap, List<Feature> featuresLeft,Set<TaxonDescription> coveredTaxa, Map<Feature,Float> quantitativeFeaturesThresholds){
326 // first copies the list of features left
327 List<Feature> pseudoFeaturesLeft = featuresLeft.subList(0, featuresLeft.size()-1);
328 // then adds all the features which depend on features contained in the list
329 for (Feature feature : pseudoFeaturesLeft){
330 if (featureDependencies.containsKey(feature)){
331 for (Feature feature2 : featureDependencies.get(feature)){
332 if (!pseudoFeaturesLeft.contains(feature2)){
333 pseudoFeaturesLeft.add(feature2);
334 }
335 }
336 }
337 }
338 // then calculates the scores of all features that have been added
339 Map<Feature,Float> newScoreMap = featureScores(pseudoFeaturesLeft, coveredTaxa, quantitativeFeaturesThresholds);
340 for (Feature feature : featureDependencies.keySet()){
341 if (newScoreMap.containsKey(feature)){
342 for (Feature feature2 : featureDependencies.get(feature)){
343 if (newScoreMap.containsKey(feature2)) {
344 // if a feature has a better than the "father" feature it depends on
345 if (newScoreMap.get(feature2)>newScoreMap.get(feature)){
346 // the "father" feature gets its score
347 scoreMap.put(feature, newScoreMap.get(feature2));
348 }
349 }
350 }
351 }
352 }
353 }
354
355
356 /**
357 * This function merges branches of the key belonging to the same clique
358 * (http://en.wikipedia.org/wiki/Clique_%28graph_theory%29)
359 *
360 * @param clique the list of States linked together (i.e. if merged have the same score)
361 * @param taxonStatesMap the map between the taxa (keys) and the states (keys) leading to them
362 */
363 private void mergeBranches(List<State> clique, Map<Set<TaxonDescription>, List<State>> taxonStatesMap){
364 boolean stateFound;
365 Map.Entry<Set<TaxonDescription>,List<State>> firstBranch=null;
366 List<Set<TaxonDescription>> tdToDelete = new ArrayList<Set<TaxonDescription>>();
367
368 if (clique.size()>1){
369 Iterator<Map.Entry<Set<TaxonDescription>, List<State>>> it1 = taxonStatesMap.entrySet().iterator();
370 while (it1.hasNext()){
371 Map.Entry<Set<TaxonDescription>,List<State>> branch = it1.next();
372 Iterator<State> stateIterator = clique.iterator();
373 stateFound=false;
374 // looks for one state of the clique in this branch
375 while(stateIterator.hasNext() && stateFound!=true) {
376 State state = stateIterator.next();
377 if (branch.getValue().contains(state)) {
378 stateFound=true;
379 }
380 }
381 // if one state is found...
382 if (stateFound==true){
383 // ...for the first time, the branch becomes the one to which the others will be merged
384 if (firstBranch==null){
385 firstBranch=branch;
386 }
387 // ... else the branch is merged to the first one.
388 else {
389 firstBranch.getKey().addAll(branch.getKey());
390 firstBranch.getValue().addAll(branch.getValue());
391 tdToDelete.add(branch.getKey());
392 }
393 }
394 }
395 // once this is done, the branches merged to the first one are deleted
396 for (Set<TaxonDescription> td : tdToDelete){
397 taxonStatesMap.remove(td);
398 }
399 }
400 }
401
402
403
404 /**
405 * This function looks for the largest clique of States
406 * (http://en.wikipedia.org/wiki/Clique_%28graph_theory%29)
407 * from the map of exclusions.
408 *
409 * @param exclusions map a state (key) to the set of states (value) which can not be merged with it without decreasing its score.
410 * @return
411 */
412 private List<State> returnBestClique (Map<State,Set<State>> exclusions){
413 int bestNumberOfExclusions=-1;;
414 int numberOfExclusions;
415 List<State> clique = new ArrayList<State>();
416
417 // looks for the largest clique, i.e. the state with less exclusions
418 State bestState=null;
419 for (Iterator<Map.Entry<State,Set<State>>> it1 = exclusions.entrySet().iterator() ; it1.hasNext();){
420 Map.Entry<State,Set<State>> pair = it1.next();
421 numberOfExclusions = pair.getValue().size();
422 if ((bestNumberOfExclusions==-1) || numberOfExclusions<bestNumberOfExclusions) {
423 bestNumberOfExclusions=numberOfExclusions;
424 bestState = pair.getKey();
425 }
426 }
427
428 clique.add(bestState);
429 exclusions.remove(bestState);
430
431 boolean isNotExcluded;
432 // then starts building the clique by adding the states which do not exclude each other
433 for (Iterator<Map.Entry<State,Set<State>>> iterator = exclusions.entrySet().iterator() ; iterator.hasNext();){
434 Map.Entry<State,Set<State>> pair = iterator.next();
435 isNotExcluded = true;
436 for (State state : clique) {
437 if (pair.getValue().contains(state)) {
438 isNotExcluded = false;
439 }
440 }
441 if (isNotExcluded){
442 clique.add(pair.getKey());
443 }
444 }
445 for (State state : clique) {
446 exclusions.remove(state);
447 }
448 return clique;
449 }
450
451
452 /**
453 * fills a map of the sets of taxa (key) presenting the different states (value) for the given feature.
454 *
455 * @param statesDone the list of states already done for this feature
456 * @param categoricalData the element from which the states are extracted
457 * @param feature the feature corresponding to the CategoricalData
458 * @param taxaCovered the base of taxa considered
459 * @return
460 */
461 private Map<Set<TaxonDescription>,List<State>> determineCategoricalStates(List<State> statesDone, CategoricalData categoricalData, Feature feature, Set<TaxonDescription> taxaCovered){
462 Map<Set<TaxonDescription>,List<State>> childrenStatesMap = new HashMap<Set<TaxonDescription>,List<State>>();
463
464 List<StateData> stateDatas = categoricalData.getStateData();
465
466 List<State> states = new ArrayList<State>();
467 for (StateData sd : stateDatas){
468 states.add(sd.getState());
469 }
470
471 for (State featureState : states){ // for each state
472 if(!statesDone.contains(featureState)){ // if the state hasn't already be considered
473 statesDone.add(featureState);
474 Set<TaxonDescription> newTaxaCovered = whichTaxa(feature,featureState,taxaCovered); //gets which taxa present this state
475 List<State> newStates = childrenStatesMap.get(newTaxaCovered);
476 if (newStates==null) { // if no states are associated to these taxa, create a new list
477 newStates = new ArrayList<State>();
478 childrenStatesMap.put(newTaxaCovered,newStates);
479 }
480 newStates.add(featureState); // then add the state to the list
481 }
482 }
483 return childrenStatesMap;
484 }
485
486
487 /**
488 * returns the list of taxa from previously covered taxa, which have the state featureState for the given feature
489 *
490 * @param feature
491 * @param featureState
492 * @param taxaCovered
493 * @return
494 */
495 private Set<TaxonDescription> whichTaxa(Feature feature, State featureState, Set<TaxonDescription> taxaCovered){
496 Set<TaxonDescription> newCoveredTaxa = new HashSet<TaxonDescription>();
497 for (TaxonDescription td : taxaCovered){
498 Set<DescriptionElementBase> elements = td.getElements();
499 for (DescriptionElementBase deb : elements){
500 if (deb.isInstanceOf(CategoricalData.class)) {
501 if (deb.getFeature().equals(feature)) {
502 List<StateData> stateDatas = ((CategoricalData)deb).getStateData();
503 for (StateData sd : stateDatas) {
504 if (sd.getState().equals(featureState)){
505 newCoveredTaxa.add(td);
506 }
507 }
508 }
509 }
510 }
511 }
512 return newCoveredTaxa;
513 }
514
515
516 /**
517 * This function returns the feature with the highest score. However, if several features have the same score
518 * the one wich leads to less options is chosen (this way, the key is easier to read).
519 *
520 * @param nTaxa
521 * @param scores
522 * @param taxaCovered
523 * @return
524 */
525 private Feature lessStatesWinner(Map<Feature,Float> scores, Set<TaxonDescription> taxaCovered){
526 int nTaxa = taxaCovered.size();
527 if (nTaxa==1) {
528 return null;
529 }
530 float meanScore = defaultMeanScore(nTaxa);
531 float bestScore = nTaxa*nTaxa;
532 List<Feature> bestFeatures = new ArrayList<Feature>(); // if ever different features have the best score, they are put in this list
533 Feature bestFeature = null;
534 Iterator<Map.Entry<Feature,Float>> it = scores.entrySet().iterator();
535 float newScore;
536 while (it.hasNext()){
537 Map.Entry<Feature,Float> pair = it.next();
538 if (pair.getValue()!=null){
539 newScore = Math.abs(pair.getValue()-meanScore);// the best score is the closest to the score (meanScore here)
540 // a feature would have if it divided the taxa in two equal parts
541 if (newScore < bestScore){
542 bestFeatures.clear();
543 bestFeatures.add(pair.getKey());
544 bestScore = newScore;
545 }else if (newScore==bestScore){
546 bestFeatures.add(pair.getKey());
547 }
548 }
549 }
550 if (bestFeatures.size()==1) { // if there is only one feature with the best score, pick it
551 return bestFeatures.get(0);
552 }
553 else { // else choose the one with less states
554 int lessStates=-1;
555 int numberOfDifferentStates=-1;
556 for (Feature feature : bestFeatures){
557 if (feature.isSupportsCategoricalData()){
558 Set<State> differentStates = new HashSet<State>();
559 for (TaxonDescription td : taxaCovered){
560 Set<DescriptionElementBase> elements = td.getElements();
561 for (DescriptionElementBase deb : elements){
562 if (deb.isInstanceOf(CategoricalData.class)) {
563 CategoricalData catdat = (CategoricalData)deb;
564 if (catdat.getFeature().equals(feature)) {
565 List<StateData> stateDatas = catdat.getStateData();
566 for (StateData sd : stateDatas) {
567 differentStates.add(sd.getState());
568 }
569 }
570 }
571 }
572 }
573 numberOfDifferentStates=differentStates.size();
574 }else if (feature.isSupportsQuantitativeData()){
575 numberOfDifferentStates=2;
576 }
577 if (lessStates==-1 || numberOfDifferentStates<lessStates){
578 lessStates=numberOfDifferentStates;
579 bestFeature = feature;
580 }
581 }
582 return bestFeature;
583 }
584 }
585
586
587 /**
588 * This function calculates the mean score, i.e. the score a feature dividing the taxa in two equal parts
589 * would have.
590 *
591 * @param nTaxa
592 * @return
593 */
594 private float defaultMeanScore(int nTaxa){
595 int i;
596 float score=0;
597 for (i=1;i<nTaxa;i++){
598 score = score + Math.round(i+1/2);
599 }
600 return score;
601 }
602
603
604 /**
605 * This function fills the map of features (keys) with their respecting scores (values)
606 *
607 * @param featuresLeft
608 * @param coveredTaxa
609 * @param quantitativeFeaturesThresholds
610 * @return
611 */
612 private Map<Feature,Float> featureScores(List<Feature> featuresLeft, Set<TaxonDescription> coveredTaxa, Map<Feature,Float> quantitativeFeaturesThresholds){
613 Map<Feature,Float> scoreMap = new HashMap<Feature,Float>();
614 for (Feature feature : featuresLeft){
615 if (feature.isSupportsCategoricalData()) {
616 scoreMap.put(feature, categoricalFeatureScore(feature,coveredTaxa));
617 }
618 if (feature.isSupportsQuantitativeData()){
619 scoreMap.put(feature, quantitativeFeatureScore(feature,coveredTaxa, quantitativeFeaturesThresholds));
620 }
621 }
622 return scoreMap;
623 }
624
625 /**
626 * Since Quantitative features do not have states, unlike Categorical ones, this function determines which taxa,
627 * for a given quantitative feature, present either a lower or higher value than a given threshold.
628 * It returns two Sets of TaxonDescription, one with the taxa under this threshold (taxaBefore) and another one
629 * with the taxa over (taxaAfter).
630 *
631 * @param threshold
632 * @param feature
633 * @param taxa
634 * @param unit
635 * @return
636 */
637 private List<Set<TaxonDescription>> determineQuantitativeStates (Float threshold, Feature feature, Set<TaxonDescription> taxa, StringBuilder unit){
638 List<Set<TaxonDescription>> list = new ArrayList<Set<TaxonDescription>>();
639 Set<TaxonDescription> taxaBefore = new HashSet<TaxonDescription>();
640 Set<TaxonDescription> taxaAfter = new HashSet<TaxonDescription>();
641 list.add(taxaBefore);
642 list.add(taxaAfter);
643 for (TaxonDescription td : taxa){
644 Set<DescriptionElementBase> elements = td.getElements();
645 for (DescriptionElementBase deb : elements){
646 if (deb.getFeature().equals(feature)) {
647 if (deb.isInstanceOf(QuantitativeData.class)) {
648 QuantitativeData qd = (QuantitativeData)deb;
649 if (unit.toString().equals("") && qd.getUnit()!=null && qd.getUnit().getLabel()!=null){
650 unit.append(" " + qd.getUnit().getLabel());
651 }
652 Set<StatisticalMeasurementValue> values = qd.getStatisticalValues();
653 for (StatisticalMeasurementValue smv : values){
654 StatisticalMeasure type = smv.getType();
655 //TODO DONT FORGET sample size, MEAN etc
656 if (type.equals(StatisticalMeasure.MAX()) || type.equals(StatisticalMeasure.TYPICAL_UPPER_BOUNDARY()) || type.equals(StatisticalMeasure.AVERAGE())) {
657 if (smv.getValue()>threshold){
658 taxaAfter.add(td);
659 }
660 }
661 if (type.equals(StatisticalMeasure.MIN()) || type.equals(StatisticalMeasure.TYPICAL_LOWER_BOUNDARY()) || type.equals(StatisticalMeasure.AVERAGE())) {
662 if (smv.getValue()<=threshold){
663 taxaBefore.add(td);
664 }
665 }
666 }
667 }
668 }
669 }
670 }
671 return list;
672 }
673
674
675
676 /**
677 * This function returns the score of a quantitative feature.
678 *
679 * @param feature
680 * @param coveredTaxa
681 * @param quantitativeFeaturesThresholds
682 * @return
683 */
684 private float quantitativeFeatureScore(Feature feature, Set<TaxonDescription> coveredTaxa, Map<Feature,Float> quantitativeFeaturesThresholds){
685 List<Float> allValues = new ArrayList<Float>();
686 boolean lowerboundarypresent;
687 boolean upperboundarypresent;
688 float lowerboundary=0;
689 float upperboundary=0;
690 for (TaxonDescription td : coveredTaxa){
691 Set<DescriptionElementBase> elements = td.getElements();
692 for (DescriptionElementBase deb : elements){
693 if (deb.getFeature().equals(feature)) {
694 if (deb.isInstanceOf(QuantitativeData.class)) {
695 QuantitativeData qd = (QuantitativeData)deb;
696 Set<StatisticalMeasurementValue> values = qd.getStatisticalValues();
697 lowerboundarypresent = false;
698 upperboundarypresent = false;
699 for (StatisticalMeasurementValue smv : values){
700 StatisticalMeasure type = smv.getType();
701 // DONT FORGET sample size, MEAN etc
702 if (type.equals(StatisticalMeasure.MAX())) {
703 upperboundary = smv.getValue();
704 upperboundarypresent=true;
705 }
706 if (type.equals(StatisticalMeasure.MIN())) {
707 lowerboundary = smv.getValue();
708 lowerboundarypresent=true;
709 }
710 if (type.equals(StatisticalMeasure.TYPICAL_UPPER_BOUNDARY()) && upperboundarypresent==false) {
711 upperboundary = smv.getValue();
712 upperboundarypresent=true;
713 }
714 if (type.equals(StatisticalMeasure.TYPICAL_LOWER_BOUNDARY()) && lowerboundarypresent==false) {
715 lowerboundary = smv.getValue();
716 lowerboundarypresent=true;
717 }
718 if (type.equals(StatisticalMeasure.AVERAGE()) && upperboundarypresent==false && lowerboundarypresent==false) {
719 lowerboundary = smv.getValue();
720 upperboundary = lowerboundary;
721 lowerboundarypresent=true;
722 upperboundarypresent=true;
723 }
724 }
725 if (lowerboundarypresent && upperboundarypresent) {
726 allValues.add(lowerboundary);
727 allValues.add(upperboundary);
728 }
729 }
730 }
731 }
732 }
733 int i,j;
734 float threshold=0;
735 float bestThreshold=0;
736 int difference=allValues.size();
737 int differenceMin = difference;
738 int taxaBefore=0;
739 int taxaAfter=0;
740 for (i=0;i<allValues.size()/2;i++) {
741 threshold = allValues.get(i*2+1);
742 taxaBefore=0;
743 taxaAfter=0;
744 for (j=0;j<allValues.size()/2;j++) {
745 if (allValues.get(j*2+1)<=threshold){
746 taxaBefore++;
747 }
748 if (allValues.get(j*2)>threshold){
749 taxaAfter++;
750 }
751 }
752 difference = Math.abs(taxaBefore-taxaAfter);
753 if (difference<differenceMin){
754 differenceMin=difference;
755 bestThreshold = threshold;
756 }
757 }
758 quantitativeFeaturesThresholds.put(feature, bestThreshold);
759 int defaultQuantitativeScore=0;
760 for (i=0;i<taxaBefore;i++) {
761 defaultQuantitativeScore += taxaAfter - i;
762 }
763 return (defaultQuantitativeScore);
764 }
765
766
767
768 /**
769 * This function returns the score of a categorical feature.
770 *
771 * @param feature
772 * @param coveredTaxa
773 * @return
774 */
775 private float categoricalFeatureScore(Feature feature, Set<TaxonDescription> coveredTaxa){
776 int i,j;
777 float score =0;
778 float power=0;
779 TaxonDescription[] coveredTaxaArray = coveredTaxa.toArray(new TaxonDescription[coveredTaxa.size()]); // I did not figure a better way to do this
780 for (i=0 ; i<coveredTaxaArray.length; i++){
781 Set<DescriptionElementBase> elements1 = coveredTaxaArray[i].getElements();
782 DescriptionElementBase deb1 = null;
783 for (DescriptionElementBase deb : elements1){
784 if (deb.getFeature().equals(feature)){
785 deb1 = deb; // finds the DescriptionElementBase corresponding to the concerned Feature
786 }
787 }
788 for (j=i+1 ; j< coveredTaxaArray.length ; j++){
789 Set<DescriptionElementBase> elements2 = coveredTaxaArray[j].getElements();
790 DescriptionElementBase deb2 = null;
791 for (DescriptionElementBase deb : elements2){
792 if (deb.getFeature().equals(feature)){
793 deb2 = deb; // finds the DescriptionElementBase corresponding to the concerned Feature
794 }
795 }
796 power = defaultPower(deb1,deb2);
797 score = score + power;
798 }
799 }
800 return score;
801 }
802
803
804 /**
805 * This recursive function fills the maps of dependencies by reading the tree containing the dependencies.
806 *
807 * @param node
808 */
809 private void checkDependencies(FeatureNode<Feature> node){
810 if (node.getOnlyApplicableIf()!=null){
811 Set<State> addToOAI = node.getOnlyApplicableIf();
812 for (State state : addToOAI){
813 if (oAIdependencies.containsKey(state)) {
814 oAIdependencies.put(state, new HashSet<Feature>());
815 }
816 oAIdependencies.get(state).add(node.getTerm());
817 }
818 }
819 if (node.getInapplicableIf()!=null){
820 Set<State> addToiI = node.getInapplicableIf();
821 for (State state : addToiI){
822 if (iIdependencies.containsKey(state)) {
823 iIdependencies.put(state, new HashSet<Feature>());
824 }
825 iIdependencies.get(state).add(node.getTerm());
826 }
827 }
828 if (node.getChildNodes()!=null) {
829 for (FeatureNode fn : node.getChildNodes()){
830 checkDependencies(fn);
831 }
832 }
833 }
834
835
836
837 /**
838 * This function fills the exclusions map.
839 *
840 * @param feature
841 * @param coveredTaxa
842 * @param exclusions
843 * @return
844 */
845 private float featureScoreAndMerge(Feature feature, Set<TaxonDescription> coveredTaxa, Map<State,Set<State>> exclusions){
846 int i,j;
847 float score =0;
848 float power=0;
849 TaxonDescription[] coveredTaxaArray = coveredTaxa.toArray(new TaxonDescription[coveredTaxa.size()]); // I did not figure a better way to do this
850 for (i=0 ; i<coveredTaxaArray.length; i++){
851 Set<DescriptionElementBase> elements1 = coveredTaxaArray[i].getElements();
852 DescriptionElementBase deb1 = null;
853 for (DescriptionElementBase deb : elements1){
854 if (deb.getFeature().equals(feature)){
855 deb1 = deb; // finds the DescriptionElementBase corresponding to the concerned Feature
856 }
857 }
858 for (j=i+1 ; j< coveredTaxaArray.length ; j++){
859 Set<DescriptionElementBase> elements2 = coveredTaxaArray[j].getElements();
860 DescriptionElementBase deb2 = null;
861 for (DescriptionElementBase deb : elements2){
862 if (deb.getFeature().equals(feature)){
863 deb2 = deb; // finds the DescriptionElementBase corresponding to the concerned Feature
864 }
865 }
866 power = defaultPower(deb1,deb2);
867 score = score + power;
868 if (power>0) // if there is no state in common between deb1 and deb2
869 {
870 CategoricalData cat1 = (CategoricalData)deb1;
871 CategoricalData cat2 = (CategoricalData)deb2;
872 for (StateData statedata1 : cat1.getStateData()){
873 State state1 = statedata1.getState();
874 if (!exclusions.containsKey(state1)){
875 exclusions.put(state1, new HashSet<State>());
876 }
877 for (StateData statedata2 : cat2.getStateData()){
878 State state2 = statedata2.getState();
879 if (!exclusions.containsKey(state2)){
880 exclusions.put(state2, new HashSet<State>());
881 }
882 exclusions.get(state1).add(state2);
883 exclusions.get(state2).add(state1);
884 }
885 }
886 }
887 }
888 }
889 return score;
890 }
891
892
893
894 /**
895 * Returns the score between two DescriptionElementBase. If one of them is null, returns -1.
896 * If they are not of the same type (Categorical) returns 0.
897 *
898 * @param deb1
899 * @param deb2
900 * @return
901 */
902 private float defaultPower(DescriptionElementBase deb1, DescriptionElementBase deb2){
903 if (deb1==null || deb2==null) {
904 return -1; //what if the two taxa don't have this feature in common ?
905 }
906 if ((deb1.isInstanceOf(CategoricalData.class))&&(deb2.isInstanceOf(CategoricalData.class))) {
907 return defaultCategoricalPower((CategoricalData)deb1, (CategoricalData)deb2);
908 } else {
909 return 0;
910 }
911 }
912
913 /**
914 * Returns the score of a categorical feature.
915 *
916 * @param deb1
917 * @param deb2
918 * @return
919 */
920 private float defaultCategoricalPower(CategoricalData deb1, CategoricalData deb2){
921 List<StateData> states1 = deb1.getStateData();
922 List<StateData> states2 = deb2.getStateData();
923 boolean bool = false;
924 Iterator<StateData> stateData1Iterator = states1.iterator() ;
925 // while (!bool && stateData1Iterator.hasNext()) {
926 // Iterator<StateData> stateData2Iterator = states2.iterator() ;
927 // StateData stateData1 = stateData1Iterator.next();
928 // while (!bool && stateData2Iterator.hasNext()) {
929 // bool = stateData1.getState().equals(stateData2Iterator.next().getState()); // checks if the states are the same
930 // }
931 // }
932 // one point each time two taxa can be discriminated for a given feature
933
934 boolean checkFeature = false;
935
936 if (!featureDependencies.containsKey(deb1.getFeature())){
937 featureDependencies.put(deb1.getFeature(), new HashSet<Feature>());
938 checkFeature = true;
939 }
940
941 while (stateData1Iterator.hasNext()) {
942 Iterator<StateData> stateData2Iterator = states2.iterator() ;
943 StateData stateData1 = stateData1Iterator.next();
944 State state1 = stateData1.getState();
945 if (checkFeature){
946 if (iIdependencies.get(state1)!=null) {
947 featureDependencies.get(deb1.getFeature()).addAll(iIdependencies.get(state1));
948 }
949 if (oAIdependencies.get(state1)!=null) {
950 featureDependencies.get(deb1.getFeature()).addAll(oAIdependencies.get(state1));
951 }
952 }
953 while (stateData2Iterator.hasNext()) {
954 State state2 = stateData1.getState();
955 bool = bool || state1.equals(state2); // checks if the states are the same
956 }
957 }
958
959
960 if (bool) {
961 return 0;
962 } else {
963 return 1;
964 }
965 }
966
967 // old code used when PolytomousKey extended FeatureTree
968 // private void printTree(List<PolytomousKeyNode> fnodes, String spaces){
969 // if (!fnodes.isEmpty()){
970 // level++;
971 // int levelcopy = level;
972 // int j=1;
973 // String delimiter;
974 // String equals = " = ";
975 // String quantitative = "";
976 // String newspaces = spaces.concat("\t");
977 // for (PolytomousKeyNode fnode : fnodes){
978 // if (fnode.getFeature()!=null) {
979 // String state = null;
980 // if (fnode.getQuestion(Language.DEFAULT())!=null) state = fnode.getQuestion(Language.DEFAULT()).getLabel();
981 // if (fnode.getFeature().isSupportsQuantitativeData()) delimiter = quantitative;
982 // else delimiter = equals;
983 // System.out.println(newspaces + levelcopy + " : " + j + " " + fnode.getFeature().getLabel() + delimiter + state);
984 // j++;
985 // }
986 // else { // TODO never read ?
987 // if (fnode.getQuestion(Language.DEFAULT())!=null) System.out.println(newspaces + "-> " + fnode.getQuestion(Language.DEFAULT()).getLabel());
988 // }
989 // printTree(fnode.getChildren(),newspaces);
990 // }
991 // }
992 // }
993
994 }
995