Project

General

Profile

Download (37.7 KB) Statistics
| Branch: | Tag: | Revision:
1
package eu.etaxonomy.cdm.strategy.generate;
2

    
3
import java.util.ArrayList;
4
import java.util.HashMap;
5
import java.util.HashSet;
6
import java.util.Iterator;
7
import java.util.List;
8
import java.util.Map;
9
import java.util.Set;
10

    
11
import eu.etaxonomy.cdm.model.common.Language;
12
import eu.etaxonomy.cdm.model.description.CategoricalData;
13
import eu.etaxonomy.cdm.model.description.DescriptionElementBase;
14
import eu.etaxonomy.cdm.model.description.Feature;
15
import eu.etaxonomy.cdm.model.description.KeyStatement;
16
import eu.etaxonomy.cdm.model.description.PolytomousKey;
17
import eu.etaxonomy.cdm.model.description.PolytomousKeyNode;
18
import eu.etaxonomy.cdm.model.description.QuantitativeData;
19
import eu.etaxonomy.cdm.model.description.State;
20
import eu.etaxonomy.cdm.model.description.StateData;
21
import eu.etaxonomy.cdm.model.description.StatisticalMeasure;
22
import eu.etaxonomy.cdm.model.description.StatisticalMeasurementValue;
23
import eu.etaxonomy.cdm.model.description.TaxonDescription;
24
import eu.etaxonomy.cdm.model.term.FeatureNode;
25
import eu.etaxonomy.cdm.model.term.FeatureTree;
26

    
27
/**
28
 * @author m.venin
29
 *
30
 */
31
public class PolytomousKeyGenerator {
32

    
33
	static int level=-1; // global variable needed by the printTree function in order to store the level which is being printed
34
	private PolytomousKey polytomousKey; // the Identification Key
35
	private List<Feature> features; // the features used to generate the key
36
	private Set<TaxonDescription> taxa; // the base of taxa
37

    
38
	private boolean merge=true; // if this boolean is set to true, branches of the tree will be merged if the corresponding states can be used together without decreasing their score
39

    
40
	private FeatureTree dependenciesTree; // the tree containing the dependencies between states and features (InapplicableIf and OnlyApplicableIf)
41
	private Map<State,Set<Feature>> iIdependencies = new HashMap<>(); // map of a set of Features (value) inapplicables if a State (key) is present
42
	private Map<State,Set<Feature>> oAIdependencies = new HashMap<>(); // map of a set of Features (value) only applicables if a State (key) is present
43
	private Map<Feature,Set<Feature>> featureDependencies = new HashMap<>(); // map of all the sets of features (values) which have dependencies with states of other features (keys)
44
	private boolean dependenciesON = true; // if this boolean is true, the dependencies are taken into account
45

    
46

    
47
	/**
48
	 * These strings are used for generating the statements of the key.
49
	 */
50
	private String before="<";
51
	private String after=">";
52
	private String separator = " or ";
53

    
54
	/**
55
	 * Sets the features used to generate the key.
56
	 *
57
	 * @param featuresList
58
	 */
59
	public void setFeatures(List<Feature> featuresList){
60
		this.features = featuresList;
61
	}
62

    
63
	/**
64
	 * Sets the base of taxa.
65
	 *
66
	 * @param featuresList
67
	 */
68
	public void setTaxa(Set<TaxonDescription> taxaSet){
69
		this.taxa = taxaSet;
70
	}
71

    
72
	/**
73
	 * Sets the tree containing the dependencies between states and features.
74
	 *
75
	 * @param tree
76
	 */
77
	public void setDependencies(FeatureTree tree){
78
		this.dependenciesTree = tree;
79
	}
80

    
81
	/**
82
	 * Allows the generator to use the dependencies given by the function "setDependencies".
83
	 */
84
	public void turnDependenciesON(){
85
		this.dependenciesON = true;
86
	}
87

    
88
	/**
89
	 * Prevent the generator from using dependencies.
90
	 */
91
	public void turnDependenciesOFF(){
92
		this.dependenciesON = false;
93
	}
94

    
95
	/**
96
	 * Allows the generator to merge branches if the corresponding states can be used together without diminishing their score.
97
	 */
98
	public void mergeModeON(){
99
		this.merge = true;
100
	}
101

    
102
	/**
103
	 * Prevent the generator from merging branches (apart from those leading to the same set of taxa).
104
	 */
105
	public void mergeModeOFF(){
106
		this.merge = false;
107
	}
108

    
109

    
110
	/**
111
	 * Initializes the function buildBranches() with the starting parameters in order to build the key
112
	 */
113
	private void loop(){
114
		polytomousKey = PolytomousKey.NewInstance();
115
		PolytomousKeyNode root = polytomousKey.getRoot();
116
		buildBranches(root,features,taxa,true);
117
	}
118

    
119

    
120
	/**
121
	 * Creates the key and prints it
122
	 */
123
	public PolytomousKey invoke(){
124
		if (dependenciesON && dependenciesTree!=null){
125
			checkDependencies(dependenciesTree.getRoot());
126
		}
127
		loop();
128
		List<PolytomousKeyNode> rootlist = new ArrayList<PolytomousKeyNode>();
129
		rootlist.add(polytomousKey.getRoot());
130
		//		String spaces = new String();
131
		//		printTree(rootlist,spaces);
132
		//		System.out.println(paths.toString());
133
		return polytomousKey;
134
	}
135

    
136

    
137
	/**
138
	 * Recursive function that builds the branches of the identification key (FeatureTree)
139
	 *
140
	 * @param father the node considered
141
	 * @param featuresLeft List of features that can be used at this point
142
	 * @param taxaCovered the taxa left at this point (i.e. that verify the description corresponding to the path leading to this node)
143
	 * @param taxaAreTheSame if in the previous level the taxa discriminated are the same, this boolean is set to true, thus if the taxa, again, are not discriminated at this level the function stops
144
	 */
145
	private void buildBranches(PolytomousKeyNode father, List<Feature> featuresLeft, Set<TaxonDescription> taxaCovered, boolean taxaDiscriminated){
146
		boolean childrenExist = false; // boolean indicating if this node has children, if not, it means we reached a leaf and instead of adding a
147
		// question or statement to it, we must add the list of taxa discriminated at this point
148

    
149
		// These sets respectively store the features inapplicables and applicables at this point in the key
150
		Set<Feature> innapplicables = new HashSet<Feature>();
151
		Set<Feature> applicables = new HashSet<Feature>();
152

    
153
		if (taxaCovered.size()>1) {
154
			// this map stores the thresholds giving the best dichotomy of taxa for the corresponding feature supporting quantitative data
155
			Map<Feature,Float> quantitativeFeaturesThresholds = new HashMap<Feature,Float>();
156
			// the scores of the different features are calculated, the thresholds in the same time
157
			Map<Feature,Float> scoreMap = featureScores(featuresLeft, taxaCovered, quantitativeFeaturesThresholds);
158
			dependenciesScores(scoreMap,featuresLeft, taxaCovered, quantitativeFeaturesThresholds);
159
			// the feature with the best score becomes the one corresponding to the current node
160
			Feature winnerFeature = lessStatesWinner(scoreMap, taxaCovered);
161
			// the feature is removed from the list of features available to build the next level of the tree
162
			featuresLeft.remove(winnerFeature);
163

    
164
			int i;
165
			/************** either the feature supports quantitative data... **************/
166
			// NB: in this version, "quantitative features" are dealt with in a dichotomous way
167
			if (winnerFeature!=null && winnerFeature.isSupportsQuantitativeData()) {
168
				// first, get the threshold
169
				float threshold = quantitativeFeaturesThresholds.get(winnerFeature);
170
				String sign;
171
				StringBuilder unit= new StringBuilder("");
172
				// then determine which taxa are before and which are after this threshold (dichotomy) in order to create the children of the father node
173
				List<Set<TaxonDescription>> quantitativeStates = determineQuantitativeStates(threshold,winnerFeature,taxaCovered,unit);
174
				// thus the list contains two sets of TaxonDescription, the first corresponding to those before, the second to those after the threshold
175
				for (i=0;i<2;i++) {
176
					Set<TaxonDescription> newTaxaCovered = quantitativeStates.get(i);
177
					if (i==0){
178
						sign = before; // the first element of the list corresponds to taxa before the threshold
179
					} else {
180
						sign = after; // the second to those after
181
					}
182
					if (newTaxaCovered.size()>0 && !((newTaxaCovered.size()==taxaCovered.size()) && !taxaDiscriminated)){ // if the taxa are discriminated compared to those of the father node, a child is created
183
						childrenExist = true;
184
						PolytomousKeyNode son = PolytomousKeyNode.NewInstance();
185
						son.setFeature(winnerFeature);
186
						// old code used when PolytomousKey extended FeatureTree
187
						//						Representation question = new Representation(null, " " + sign + " " + threshold +unit,null, Language.DEFAULT()); // the question attribute is used to store the state of the feature
188
						//						son.addQuestion(question);
189
						KeyStatement statement = KeyStatement.NewInstance( " " + sign + " " + threshold +unit); // the question attribute is used to store the state of the feature
190
						son.setStatement(statement);
191
						father.addChild(son);
192
						boolean areTheTaxaDiscriminated;
193
						if (newTaxaCovered.size()==taxaCovered.size()) {
194
                            areTheTaxaDiscriminated = false;
195
                        } else {
196
                            areTheTaxaDiscriminated = true;
197
                        }
198
						buildBranches(son,featuresLeft, newTaxaCovered,areTheTaxaDiscriminated);
199
					}
200
				}
201
			}
202

    
203
			/************** ...or it supports categorical data. **************/
204
			// "categorical features" may present several different states, each one of these might correspond to one child
205
			List<State> statesDone = new ArrayList<State>(); // the list of states already used
206
			int numberOfStates;
207
			if (winnerFeature!=null && winnerFeature.isSupportsCategoricalData()) {
208
				for (TaxonDescription td : taxaCovered){
209
					DescriptionElementBase debConcerned = null;
210

    
211
					// first, get the right DescriptionElementBase
212
					for (DescriptionElementBase deb : td.getElements()) {
213
						if (deb.getFeature().equals(winnerFeature)) {
214
                            debConcerned = deb;
215
                        }
216
					}
217

    
218
					if (debConcerned!=null) {
219
						// a map is created, the key being the set of taxa that present the state(s) stored in the corresponding value
220
						Map<Set<TaxonDescription>,List<State>> taxonStatesMap = determineCategoricalStates(statesDone,(CategoricalData)debConcerned,winnerFeature,taxaCovered);
221
						// if the merge option is ON, branches with the same discriminative power will be merged (see Vignes & Lebbes, 1989)
222
						if (merge){
223
							// creates a map between the different states of the winnerFeature and the sets of states "incompatible" with them
224
							Map<State,Set<State>> exclusions = new HashMap<State,Set<State>>();
225
							featureScoreAndMerge(winnerFeature,taxaCovered,exclusions);
226

    
227
							while (!exclusions.isEmpty()){
228
								// looks for the largest clique, i.e. the state with less exclusions
229
								List<State> clique = returnBestClique(exclusions);
230
								// then merges the corresponding branches
231
								mergeBranches(clique,taxonStatesMap);
232
							}
233
						}
234
						if (taxonStatesMap!=null && !taxonStatesMap.isEmpty()) {
235
							for (Map.Entry<Set<TaxonDescription>,List<State>> entry : taxonStatesMap.entrySet()){
236
								Set<TaxonDescription> newTaxaCovered = entry.getKey();
237
								List<State> listOfStates = entry.getValue();
238
								if ((newTaxaCovered.size()>0) && !((newTaxaCovered.size()==taxaCovered.size()) && !taxaDiscriminated)){ // if the taxa are discriminated compared to those of the father node, a child is created
239
									childrenExist = true;
240
									PolytomousKeyNode son = PolytomousKeyNode.NewInstance();
241
									StringBuilder questionLabel = new StringBuilder();
242
									numberOfStates = listOfStates.size()-1;
243
									for (State state : listOfStates) {
244
										if (dependenciesON){
245
											// if the dependencies are considered, removes and adds the right features from/to the list of features left
246
											// these features are stored in order to be put back again when the current branch is finished
247
											if (iIdependencies.get(state)!= null) {
248
                                                innapplicables.addAll(iIdependencies.get(state));
249
                                            }
250
											if (oAIdependencies.get(state)!= null) {
251
                                                applicables.addAll(oAIdependencies.get(state));
252
                                            }
253
											for (Feature feature : innapplicables) {
254
                                                featuresLeft.remove(feature);
255
                                            }
256
											for (Feature feature : applicables) {
257
                                                featuresLeft.add(feature);
258
                                            }
259
										}
260
										questionLabel.append(state.getLabel());
261
										if (listOfStates.lastIndexOf(state)!=numberOfStates)
262
                                         {
263
                                            questionLabel.append(separator); // append a separator after each state except for the last one
264
                                        }
265
									}
266
									// old code used when PolytomousKey extended FeatureTree
267
									//									Representation question = new Representation(null, questionLabel.toString(),null, Language.DEFAULT());
268
									//									son.addQuestion(question);
269
									KeyStatement statement = KeyStatement.NewInstance(questionLabel.toString());
270
									son.setStatement(statement);
271
									son.setFeature(winnerFeature);
272
									father.addChild(son);
273
									featuresLeft.remove(winnerFeature); // unlike for quantitative features, once a categorical one has been used, it cannot be reused in the same branch
274
									boolean areTheTaxaDiscriminated;
275
									if (newTaxaCovered.size()==taxaCovered.size()) {
276
                                        areTheTaxaDiscriminated = true;
277
                                    } else {
278
                                        areTheTaxaDiscriminated = false;
279
                                    }
280
									buildBranches(son,featuresLeft, newTaxaCovered,areTheTaxaDiscriminated);
281
								}
282
							}
283
						}
284
					}
285
				}
286
			}
287
			// the features depending on other features are added/removed to/from the features left once the branch is done
288
			if (dependenciesON){
289
				for (Feature feature : innapplicables) {
290
                    featuresLeft.add(feature);
291
                }
292
				for (Feature feature : applicables) {
293
                    featuresLeft.remove(feature);
294
                }
295
			}
296
			// the winner features are put back to the features left once the branch is done
297
			featuresLeft.add(winnerFeature);
298
		}
299
		// if the node is a leaf, the statement contains the list of taxa to which the current leaf leads.
300
		if (!childrenExist){
301
			KeyStatement fatherStatement = father.getStatement();
302
			if (fatherStatement!=null){
303
				String statementString = fatherStatement.getLabelText(Language.DEFAULT());
304
				if (statementString !=null && taxaCovered != null){
305
					String label = statementString + " --> " + taxaCovered.toString();
306
					fatherStatement.putLabel(Language.DEFAULT(), label);
307
				}
308
			}
309
		}
310
	}
311

    
312

    
313

    
314
	/**
315
	 * Dependencies can be seen as a hierarchy.
316
	 * If the dependencies are on, this function calculates the score of all children features and give the highest score
317
	 * of these to their father (of course, if its score is lower). This way, if a feature has a good score but is
318
	 * "onlyApplicableIf" or "InapplicableIf", the feature it depends can be chosen in order to build a better key.
319
	 *
320
	 * @param scoreMap
321
	 * @param featuresLeft
322
	 * @param coveredTaxa
323
	 * @param quantitativeFeaturesThresholds
324
	 */
325
	private void dependenciesScores(Map<Feature,Float> scoreMap, List<Feature> featuresLeft,Set<TaxonDescription> coveredTaxa, Map<Feature,Float> quantitativeFeaturesThresholds){
326
		// first copies the list of features left
327
		List<Feature> pseudoFeaturesLeft = featuresLeft.subList(0, featuresLeft.size()-1);
328
		// then adds all the features which depend on features contained in the list
329
		for (Feature feature : pseudoFeaturesLeft){
330
			if (featureDependencies.containsKey(feature)){
331
				for (Feature feature2 : featureDependencies.get(feature)){
332
					if (!pseudoFeaturesLeft.contains(feature2)){
333
						pseudoFeaturesLeft.add(feature2);
334
					}
335
				}
336
			}
337
		}
338
		// then calculates the scores of all features that have been added
339
		Map<Feature,Float> newScoreMap = featureScores(pseudoFeaturesLeft, coveredTaxa, quantitativeFeaturesThresholds);
340
		for (Feature feature : featureDependencies.keySet()){
341
			if (newScoreMap.containsKey(feature)){
342
				for (Feature feature2 : featureDependencies.get(feature)){
343
					if (newScoreMap.containsKey(feature2)) {
344
						// if a feature has a better than the "father" feature it depends on
345
						if (newScoreMap.get(feature2)>newScoreMap.get(feature)){
346
							// the "father" feature gets its score
347
							scoreMap.put(feature, newScoreMap.get(feature2));
348
						}
349
					}
350
				}
351
			}
352
		}
353
	}
354

    
355

    
356
	/**
357
	 * This function merges branches of the key belonging to the same clique
358
	 * (http://en.wikipedia.org/wiki/Clique_%28graph_theory%29)
359
	 *
360
	 * @param clique the list of States linked together (i.e. if merged have the same score)
361
	 * @param taxonStatesMap the map between the taxa (keys) and the states (keys) leading to them
362
	 */
363
	private void mergeBranches(List<State> clique, Map<Set<TaxonDescription>, List<State>> taxonStatesMap){
364
		boolean stateFound;
365
		Map.Entry<Set<TaxonDescription>,List<State>> firstBranch=null;
366
		List<Set<TaxonDescription>> tdToDelete = new ArrayList<Set<TaxonDescription>>();
367

    
368
		if (clique.size()>1){
369
			Iterator<Map.Entry<Set<TaxonDescription>, List<State>>> it1 = taxonStatesMap.entrySet().iterator();
370
			while (it1.hasNext()){
371
				Map.Entry<Set<TaxonDescription>,List<State>> branch = it1.next();
372
				Iterator<State> stateIterator = clique.iterator();
373
				stateFound=false;
374
				// looks for one state of the clique in this branch
375
				while(stateIterator.hasNext() && stateFound!=true) {
376
					State state = stateIterator.next();
377
					if (branch.getValue().contains(state)) {
378
						stateFound=true;
379
					}
380
				}
381
				// if one state is found...
382
				if (stateFound==true){
383
					// ...for the first time, the branch becomes the one to which the others will be merged
384
					if (firstBranch==null){
385
						firstBranch=branch;
386
					}
387
					// ... else the branch is merged to the first one.
388
					else {
389
						firstBranch.getKey().addAll(branch.getKey());
390
						firstBranch.getValue().addAll(branch.getValue());
391
						tdToDelete.add(branch.getKey());
392
					}
393
				}
394
			}
395
			// once this is done, the branches merged to the first one are deleted
396
			for (Set<TaxonDescription> td : tdToDelete){
397
				taxonStatesMap.remove(td);
398
			}
399
		}
400
	}
401

    
402

    
403

    
404
	/**
405
	 * This function looks for the largest clique of States
406
	 * (http://en.wikipedia.org/wiki/Clique_%28graph_theory%29)
407
	 * from the map of exclusions.
408
	 *
409
	 * @param exclusions map a state (key) to the set of states (value) which can not be merged with it without decreasing its score.
410
	 * @return
411
	 */
412
	private List<State> returnBestClique (Map<State,Set<State>> exclusions){
413
		int bestNumberOfExclusions=-1;;
414
		int numberOfExclusions;
415
		List<State> clique = new ArrayList<State>();
416

    
417
		// looks for the largest clique, i.e. the state with less exclusions
418
		State bestState=null;
419
		for (Iterator<Map.Entry<State,Set<State>>> it1 = exclusions.entrySet().iterator() ; it1.hasNext();){
420
			Map.Entry<State,Set<State>> pair = it1.next();
421
			numberOfExclusions = pair.getValue().size();
422
			if ((bestNumberOfExclusions==-1) || numberOfExclusions<bestNumberOfExclusions) {
423
				bestNumberOfExclusions=numberOfExclusions;
424
				bestState = pair.getKey();
425
			}
426
		}
427

    
428
		clique.add(bestState);
429
		exclusions.remove(bestState);
430

    
431
		boolean isNotExcluded;
432
		// then starts building the clique by adding the states which do not exclude each other
433
		for (Iterator<Map.Entry<State,Set<State>>> iterator = exclusions.entrySet().iterator() ; iterator.hasNext();){
434
			Map.Entry<State,Set<State>> pair = iterator.next();
435
			isNotExcluded = true;
436
			for (State state : clique) {
437
				if (pair.getValue().contains(state)) {
438
                    isNotExcluded = false;
439
                }
440
			}
441
			if (isNotExcluded){
442
				clique.add(pair.getKey());
443
			}
444
		}
445
		for (State state : clique) {
446
			exclusions.remove(state);
447
		}
448
		return clique;
449
	}
450

    
451

    
452
	/**
453
	 * fills a map of the sets of taxa (key) presenting the different states (value) for the given feature.
454
	 *
455
	 * @param statesDone the list of states already done for this feature
456
	 * @param categoricalData the element from which the states are extracted
457
	 * @param feature the feature corresponding to the CategoricalData
458
	 * @param taxaCovered the base of taxa considered
459
	 * @return
460
	 */
461
	private Map<Set<TaxonDescription>,List<State>> determineCategoricalStates(List<State> statesDone, CategoricalData categoricalData, Feature feature, Set<TaxonDescription> taxaCovered){
462
		Map<Set<TaxonDescription>,List<State>> childrenStatesMap = new HashMap<Set<TaxonDescription>,List<State>>();
463

    
464
		List<StateData> stateDatas = categoricalData.getStateData();
465

    
466
		List<State> states = new ArrayList<State>();
467
		for (StateData sd : stateDatas){
468
			states.add(sd.getState());
469
		}
470

    
471
		for (State featureState : states){ // for each state
472
			if(!statesDone.contains(featureState)){ // if the state hasn't already be considered
473
				statesDone.add(featureState);
474
				Set<TaxonDescription> newTaxaCovered = whichTaxa(feature,featureState,taxaCovered); //gets which taxa present this state
475
				List<State> newStates = childrenStatesMap.get(newTaxaCovered);
476
				if (newStates==null) { // if no states are associated to these taxa, create a new list
477
					newStates = new ArrayList<State>();
478
					childrenStatesMap.put(newTaxaCovered,newStates);
479
				}
480
				newStates.add(featureState); // then add the state to the list
481
			}
482
		}
483
		return childrenStatesMap;
484
	}
485

    
486

    
487
	/**
488
	 * returns the list of taxa from previously covered taxa, which have the state featureState for the given feature
489
	 *
490
	 * @param feature
491
	 * @param featureState
492
	 * @param taxaCovered
493
	 * @return
494
	 */
495
	private Set<TaxonDescription> whichTaxa(Feature feature, State featureState, Set<TaxonDescription> taxaCovered){
496
		Set<TaxonDescription> newCoveredTaxa = new HashSet<TaxonDescription>();
497
		for (TaxonDescription td : taxaCovered){
498
			Set<DescriptionElementBase> elements = td.getElements();
499
			for (DescriptionElementBase deb : elements){
500
				if (deb.isInstanceOf(CategoricalData.class)) {
501
					if (deb.getFeature().equals(feature)) {
502
						List<StateData> stateDatas = ((CategoricalData)deb).getStateData();
503
						for (StateData sd : stateDatas) {
504
							if (sd.getState().equals(featureState)){
505
								newCoveredTaxa.add(td);
506
							}
507
						}
508
					}
509
				}
510
			}
511
		}
512
		return newCoveredTaxa;
513
	}
514

    
515

    
516
	/**
517
	 * This function returns the feature with the highest score. However, if several features have the same score
518
	 * the one wich leads to less options is chosen (this way, the key is easier to read).
519
	 *
520
	 * @param nTaxa
521
	 * @param scores
522
	 * @param taxaCovered
523
	 * @return
524
	 */
525
	private Feature lessStatesWinner(Map<Feature,Float> scores, Set<TaxonDescription> taxaCovered){
526
		int nTaxa = taxaCovered.size();
527
		if (nTaxa==1) {
528
            return null;
529
        }
530
		float meanScore = defaultMeanScore(nTaxa);
531
		float bestScore = nTaxa*nTaxa;
532
		List<Feature> bestFeatures = new ArrayList<Feature>(); // if ever different features have the best score, they are put in this list
533
		Feature bestFeature = null;
534
		Iterator<Map.Entry<Feature,Float>> it = scores.entrySet().iterator();
535
		float newScore;
536
		while (it.hasNext()){
537
			Map.Entry<Feature,Float> pair = it.next();
538
			if (pair.getValue()!=null){
539
				newScore = Math.abs(pair.getValue()-meanScore);// the best score is the closest to the score (meanScore here)
540
				// a feature would have if it divided the taxa in two equal parts
541
				if (newScore < bestScore){
542
					bestFeatures.clear();
543
					bestFeatures.add(pair.getKey());
544
					bestScore = newScore;
545
				}else if (newScore==bestScore){
546
					bestFeatures.add(pair.getKey());
547
				}
548
			}
549
		}
550
		if (bestFeatures.size()==1) { // if there is only one feature with the best score, pick it
551
			return bestFeatures.get(0);
552
		}
553
		else { // else choose the one with less states
554
			int lessStates=-1;
555
			int numberOfDifferentStates=-1;
556
			for (Feature feature : bestFeatures){
557
				if (feature.isSupportsCategoricalData()){
558
					Set<State> differentStates = new HashSet<State>();
559
					for (TaxonDescription td : taxaCovered){
560
						Set<DescriptionElementBase> elements = td.getElements();
561
						for (DescriptionElementBase deb : elements){
562
							if (deb.isInstanceOf(CategoricalData.class)) {
563
								CategoricalData catdat = (CategoricalData)deb;
564
								if (catdat.getFeature().equals(feature)) {
565
									List<StateData> stateDatas = catdat.getStateData();
566
									for (StateData sd : stateDatas) {
567
										differentStates.add(sd.getState());
568
									}
569
								}
570
							}
571
						}
572
					}
573
					numberOfDifferentStates=differentStates.size();
574
				}else if (feature.isSupportsQuantitativeData()){
575
					numberOfDifferentStates=2;
576
				}
577
				if (lessStates==-1 || numberOfDifferentStates<lessStates){
578
					lessStates=numberOfDifferentStates;
579
					bestFeature = feature;
580
				}
581
			}
582
			return bestFeature;
583
		}
584
	}
585

    
586

    
587
	/**
588
	 * This function calculates the mean score, i.e. the score a feature dividing the taxa in two equal parts
589
	 * would have.
590
	 *
591
	 * @param nTaxa
592
	 * @return
593
	 */
594
	private float defaultMeanScore(int nTaxa){
595
		int i;
596
		float score=0;
597
		for (i=1;i<nTaxa;i++){
598
			score = score + Math.round(i+1/2);
599
		}
600
		return score;
601
	}
602

    
603

    
604
	/**
605
	 * This function fills the map of features (keys) with their respecting scores (values)
606
	 *
607
	 * @param featuresLeft
608
	 * @param coveredTaxa
609
	 * @param quantitativeFeaturesThresholds
610
	 * @return
611
	 */
612
	private Map<Feature,Float> featureScores(List<Feature> featuresLeft, Set<TaxonDescription> coveredTaxa, Map<Feature,Float> quantitativeFeaturesThresholds){
613
		Map<Feature,Float> scoreMap = new HashMap<Feature,Float>();
614
		for (Feature feature : featuresLeft){
615
			if (feature.isSupportsCategoricalData()) {
616
				scoreMap.put(feature, categoricalFeatureScore(feature,coveredTaxa));
617
			}
618
			if (feature.isSupportsQuantitativeData()){
619
				scoreMap.put(feature, quantitativeFeatureScore(feature,coveredTaxa, quantitativeFeaturesThresholds));
620
			}
621
		}
622
		return scoreMap;
623
	}
624

    
625
	/**
626
	 * Since Quantitative features do not have states, unlike Categorical ones, this function determines which taxa,
627
	 * for a given quantitative feature, present either a lower or higher value than a given threshold.
628
	 * It returns two Sets of TaxonDescription, one with the taxa under this threshold (taxaBefore) and another one
629
	 * with the taxa over (taxaAfter).
630
	 *
631
	 * @param threshold
632
	 * @param feature
633
	 * @param taxa
634
	 * @param unit
635
	 * @return
636
	 */
637
	private List<Set<TaxonDescription>> determineQuantitativeStates (Float threshold, Feature feature, Set<TaxonDescription> taxa, StringBuilder unit){
638
		List<Set<TaxonDescription>> list = new ArrayList<Set<TaxonDescription>>();
639
		Set<TaxonDescription> taxaBefore = new HashSet<TaxonDescription>();
640
		Set<TaxonDescription> taxaAfter = new HashSet<TaxonDescription>();
641
		list.add(taxaBefore);
642
		list.add(taxaAfter);
643
		for (TaxonDescription td : taxa){
644
			Set<DescriptionElementBase> elements = td.getElements();
645
			for (DescriptionElementBase deb : elements){
646
				if (deb.getFeature().equals(feature)) {
647
					if (deb.isInstanceOf(QuantitativeData.class)) {
648
						QuantitativeData qd = (QuantitativeData)deb;
649
						if (unit.toString().equals("") && qd.getUnit()!=null && qd.getUnit().getLabel()!=null){
650
							unit.append(" " + qd.getUnit().getLabel());
651
						}
652
						Set<StatisticalMeasurementValue> values = qd.getStatisticalValues();
653
						for (StatisticalMeasurementValue smv : values){
654
							StatisticalMeasure type = smv.getType();
655
							//TODO DONT FORGET sample size, MEAN etc
656
							if (type.equals(StatisticalMeasure.MAX()) || type.equals(StatisticalMeasure.TYPICAL_UPPER_BOUNDARY()) || type.equals(StatisticalMeasure.AVERAGE())) {
657
								if (smv.getValue()>threshold){
658
									taxaAfter.add(td);
659
								}
660
							}
661
							if (type.equals(StatisticalMeasure.MIN()) || type.equals(StatisticalMeasure.TYPICAL_LOWER_BOUNDARY()) || type.equals(StatisticalMeasure.AVERAGE())) {
662
								if (smv.getValue()<=threshold){
663
									taxaBefore.add(td);
664
								}
665
							}
666
						}
667
					}
668
				}
669
			}
670
		}
671
		return list;
672
	}
673

    
674

    
675

    
676
	/**
677
	 * This function returns the score of a quantitative feature.
678
	 *
679
	 * @param feature
680
	 * @param coveredTaxa
681
	 * @param quantitativeFeaturesThresholds
682
	 * @return
683
	 */
684
	private float quantitativeFeatureScore(Feature feature, Set<TaxonDescription> coveredTaxa, Map<Feature,Float> quantitativeFeaturesThresholds){
685
		List<Float> allValues = new ArrayList<Float>();
686
		boolean lowerboundarypresent;
687
		boolean upperboundarypresent;
688
		float lowerboundary=0;
689
		float upperboundary=0;
690
		for (TaxonDescription td : coveredTaxa){
691
			Set<DescriptionElementBase> elements = td.getElements();
692
			for (DescriptionElementBase deb : elements){
693
				if (deb.getFeature().equals(feature)) {
694
					if (deb.isInstanceOf(QuantitativeData.class)) {
695
						QuantitativeData qd = (QuantitativeData)deb;
696
						Set<StatisticalMeasurementValue> values = qd.getStatisticalValues();
697
						lowerboundarypresent = false;
698
						upperboundarypresent = false;
699
						for (StatisticalMeasurementValue smv : values){
700
							StatisticalMeasure type = smv.getType();
701
							// DONT FORGET sample size, MEAN etc
702
							if (type.equals(StatisticalMeasure.MAX())) {
703
								upperboundary = smv.getValue();
704
								upperboundarypresent=true;
705
							}
706
							if (type.equals(StatisticalMeasure.MIN())) {
707
								lowerboundary = smv.getValue();
708
								lowerboundarypresent=true;
709
							}
710
							if (type.equals(StatisticalMeasure.TYPICAL_UPPER_BOUNDARY()) && upperboundarypresent==false) {
711
								upperboundary = smv.getValue();
712
								upperboundarypresent=true;
713
							}
714
							if (type.equals(StatisticalMeasure.TYPICAL_LOWER_BOUNDARY()) && lowerboundarypresent==false) {
715
								lowerboundary = smv.getValue();
716
								lowerboundarypresent=true;
717
							}
718
							if (type.equals(StatisticalMeasure.AVERAGE()) && upperboundarypresent==false && lowerboundarypresent==false) {
719
								lowerboundary = smv.getValue();
720
								upperboundary = lowerboundary;
721
								lowerboundarypresent=true;
722
								upperboundarypresent=true;
723
							}
724
						}
725
						if (lowerboundarypresent && upperboundarypresent) {
726
							allValues.add(lowerboundary);
727
							allValues.add(upperboundary);
728
						}
729
					}
730
				}
731
			}
732
		}
733
		int i,j;
734
		float threshold=0;
735
		float bestThreshold=0;
736
		int difference=allValues.size();
737
		int differenceMin = difference;
738
		int taxaBefore=0;
739
		int taxaAfter=0;
740
		for (i=0;i<allValues.size()/2;i++) {
741
			threshold = allValues.get(i*2+1);
742
			taxaBefore=0;
743
			taxaAfter=0;
744
			for (j=0;j<allValues.size()/2;j++) {
745
				if (allValues.get(j*2+1)<=threshold){
746
					taxaBefore++;
747
				}
748
				if (allValues.get(j*2)>threshold){
749
					taxaAfter++;
750
				}
751
			}
752
			difference = Math.abs(taxaBefore-taxaAfter);
753
			if (difference<differenceMin){
754
				differenceMin=difference;
755
				bestThreshold = threshold;
756
			}
757
		}
758
		quantitativeFeaturesThresholds.put(feature, bestThreshold);
759
		int defaultQuantitativeScore=0;
760
		for (i=0;i<taxaBefore;i++) {
761
			defaultQuantitativeScore += taxaAfter - i;
762
		}
763
		return (defaultQuantitativeScore);
764
	}
765

    
766

    
767

    
768
	/**
769
	 * This function returns the score of a categorical feature.
770
	 *
771
	 * @param feature
772
	 * @param coveredTaxa
773
	 * @return
774
	 */
775
	private float categoricalFeatureScore(Feature feature, Set<TaxonDescription> coveredTaxa){
776
		int i,j;
777
		float score =0;
778
		float power=0;
779
		TaxonDescription[] coveredTaxaArray = coveredTaxa.toArray(new TaxonDescription[coveredTaxa.size()]); // I did not figure a better way to do this
780
		for (i=0 ; i<coveredTaxaArray.length; i++){
781
			Set<DescriptionElementBase> elements1 = coveredTaxaArray[i].getElements();
782
			DescriptionElementBase deb1 = null;
783
			for (DescriptionElementBase deb : elements1){
784
				if (deb.getFeature().equals(feature)){
785
					deb1 = deb; // finds the DescriptionElementBase corresponding to the concerned Feature
786
				}
787
			}
788
			for (j=i+1 ; j< coveredTaxaArray.length ; j++){
789
				Set<DescriptionElementBase> elements2 = coveredTaxaArray[j].getElements();
790
				DescriptionElementBase deb2 = null;
791
				for (DescriptionElementBase deb : elements2){
792
					if (deb.getFeature().equals(feature)){
793
						deb2 = deb; // finds the DescriptionElementBase corresponding to the concerned Feature
794
					}
795
				}
796
				power = defaultPower(deb1,deb2);
797
				score = score + power;
798
			}
799
		}
800
		return score;
801
	}
802

    
803

    
804
	/**
805
	 * This recursive function fills the maps of dependencies by reading the tree containing the dependencies.
806
	 *
807
	 * @param node
808
	 */
809
	private void checkDependencies(FeatureNode<Feature> node){
810
		if (node.getOnlyApplicableIf()!=null){
811
			Set<State> addToOAI = node.getOnlyApplicableIf();
812
			for (State state : addToOAI){
813
				if (oAIdependencies.containsKey(state)) {
814
                    oAIdependencies.put(state, new HashSet<Feature>());
815
                }
816
				oAIdependencies.get(state).add(node.getTerm());
817
			}
818
		}
819
		if (node.getInapplicableIf()!=null){
820
			Set<State> addToiI = node.getInapplicableIf();
821
			for (State state : addToiI){
822
				if (iIdependencies.containsKey(state)) {
823
                    iIdependencies.put(state, new HashSet<Feature>());
824
                }
825
				iIdependencies.get(state).add(node.getTerm());
826
			}
827
		}
828
		if (node.getChildNodes()!=null) {
829
			for (FeatureNode fn : node.getChildNodes()){
830
				checkDependencies(fn);
831
			}
832
		}
833
	}
834

    
835

    
836

    
837
	/**
838
	 * This function fills the exclusions map.
839
	 *
840
	 * @param feature
841
	 * @param coveredTaxa
842
	 * @param exclusions
843
	 * @return
844
	 */
845
	private float featureScoreAndMerge(Feature feature, Set<TaxonDescription> coveredTaxa, Map<State,Set<State>> exclusions){
846
		int i,j;
847
		float score =0;
848
		float power=0;
849
		TaxonDescription[] coveredTaxaArray = coveredTaxa.toArray(new TaxonDescription[coveredTaxa.size()]); // I did not figure a better way to do this
850
		for (i=0 ; i<coveredTaxaArray.length; i++){
851
			Set<DescriptionElementBase> elements1 = coveredTaxaArray[i].getElements();
852
			DescriptionElementBase deb1 = null;
853
			for (DescriptionElementBase deb : elements1){
854
				if (deb.getFeature().equals(feature)){
855
					deb1 = deb; // finds the DescriptionElementBase corresponding to the concerned Feature
856
				}
857
			}
858
			for (j=i+1 ; j< coveredTaxaArray.length ; j++){
859
				Set<DescriptionElementBase> elements2 = coveredTaxaArray[j].getElements();
860
				DescriptionElementBase deb2 = null;
861
				for (DescriptionElementBase deb : elements2){
862
					if (deb.getFeature().equals(feature)){
863
						deb2 = deb; // finds the DescriptionElementBase corresponding to the concerned Feature
864
					}
865
				}
866
				power = defaultPower(deb1,deb2);
867
				score = score + power;
868
				if (power>0) // if there is no state in common between deb1 and deb2
869
				{
870
					CategoricalData cat1 = (CategoricalData)deb1;
871
					CategoricalData cat2 = (CategoricalData)deb2;
872
					for (StateData statedata1 : cat1.getStateData()){
873
						State state1 = statedata1.getState();
874
						if (!exclusions.containsKey(state1)){
875
							exclusions.put(state1, new HashSet<State>());
876
						}
877
						for (StateData statedata2 : cat2.getStateData()){
878
							State state2 = statedata2.getState();
879
							if (!exclusions.containsKey(state2)){
880
								exclusions.put(state2, new HashSet<State>());
881
							}
882
							exclusions.get(state1).add(state2);
883
							exclusions.get(state2).add(state1);
884
						}
885
					}
886
				}
887
			}
888
		}
889
		return score;
890
	}
891

    
892

    
893

    
894
	/**
895
	 * Returns the score between two DescriptionElementBase. If one of them is null, returns -1.
896
	 * If they are not of the same type (Categorical) returns 0.
897
	 *
898
	 * @param deb1
899
	 * @param deb2
900
	 * @return
901
	 */
902
	private float defaultPower(DescriptionElementBase deb1, DescriptionElementBase deb2){
903
		if (deb1==null || deb2==null) {
904
			return -1; //what if the two taxa don't have this feature in common ?
905
		}
906
		if ((deb1.isInstanceOf(CategoricalData.class))&&(deb2.isInstanceOf(CategoricalData.class))) {
907
			return defaultCategoricalPower((CategoricalData)deb1, (CategoricalData)deb2);
908
		} else {
909
            return 0;
910
        }
911
	}
912

    
913
	/**
914
	 * Returns the score of a categorical feature.
915
	 *
916
	 * @param deb1
917
	 * @param deb2
918
	 * @return
919
	 */
920
	private float defaultCategoricalPower(CategoricalData deb1, CategoricalData deb2){
921
		List<StateData> states1 = deb1.getStateData();
922
		List<StateData> states2 = deb2.getStateData();
923
		boolean bool = false;
924
		Iterator<StateData> stateData1Iterator = states1.iterator() ;
925
		//		while (!bool && stateData1Iterator.hasNext()) {
926
		//			Iterator<StateData> stateData2Iterator = states2.iterator() ;
927
		//			StateData stateData1 = stateData1Iterator.next();
928
		//			while (!bool && stateData2Iterator.hasNext()) {
929
		//				bool = stateData1.getState().equals(stateData2Iterator.next().getState()); // checks if the states are the same
930
		//			}
931
		//		}
932
		// one point each time two taxa can be discriminated for a given feature
933

    
934
		boolean checkFeature = false;
935

    
936
		if (!featureDependencies.containsKey(deb1.getFeature())){
937
			featureDependencies.put(deb1.getFeature(), new HashSet<Feature>());
938
			checkFeature = true;
939
		}
940

    
941
		while (stateData1Iterator.hasNext()) {
942
			Iterator<StateData> stateData2Iterator = states2.iterator() ;
943
			StateData stateData1 = stateData1Iterator.next();
944
			State state1 = stateData1.getState();
945
			if (checkFeature){
946
				if (iIdependencies.get(state1)!=null) {
947
					featureDependencies.get(deb1.getFeature()).addAll(iIdependencies.get(state1));
948
				}
949
				if (oAIdependencies.get(state1)!=null) {
950
					featureDependencies.get(deb1.getFeature()).addAll(oAIdependencies.get(state1));
951
				}
952
			}
953
			while (stateData2Iterator.hasNext()) {
954
				State state2 = stateData1.getState();
955
				bool = bool || state1.equals(state2); // checks if the states are the same
956
			}
957
		}
958

    
959

    
960
		if (bool) {
961
            return 0;
962
        } else {
963
            return 1;
964
        }
965
	}
966

    
967
	// old code used when PolytomousKey extended FeatureTree
968
	//	private void printTree(List<PolytomousKeyNode> fnodes, String spaces){
969
	//		if (!fnodes.isEmpty()){
970
	//			level++;
971
	//			int levelcopy = level;
972
	//			int j=1;
973
	//			String delimiter;
974
	//			String equals = " = ";
975
	//			String quantitative = "";
976
	//			String newspaces = spaces.concat("\t");
977
	//			for (PolytomousKeyNode fnode : fnodes){
978
	//				if (fnode.getFeature()!=null) {
979
	//					String state = null;
980
	//					if (fnode.getQuestion(Language.DEFAULT())!=null) state = fnode.getQuestion(Language.DEFAULT()).getLabel();
981
	//					if (fnode.getFeature().isSupportsQuantitativeData()) delimiter = quantitative;
982
	//					else delimiter = equals;
983
	//					System.out.println(newspaces + levelcopy + " : " + j + " " + fnode.getFeature().getLabel() + delimiter + state);
984
	//					j++;
985
	//				}
986
	//				else { // TODO never read ?
987
	//					if (fnode.getQuestion(Language.DEFAULT())!=null) System.out.println(newspaces + "-> " + fnode.getQuestion(Language.DEFAULT()).getLabel());
988
	//				}
989
	//				printTree(fnode.getChildren(),newspaces);
990
	//			}
991
	//		}
992
	//	}
993

    
994
}
995

    
    (1-1/1)