Project

General

Profile

Download (36.8 KB) Statistics
| Branch: | Tag: | Revision:
1
package eu.etaxonomy.cdm.strategy.generate;
2

    
3
import java.util.ArrayList;
4
import java.util.HashMap;
5
import java.util.HashSet;
6
import java.util.Iterator;
7
import java.util.List;
8
import java.util.Map;
9
import java.util.Set;
10

    
11
import eu.etaxonomy.cdm.model.common.Language;
12
import eu.etaxonomy.cdm.model.description.CategoricalData;
13
import eu.etaxonomy.cdm.model.description.DescriptionElementBase;
14
import eu.etaxonomy.cdm.model.description.Feature;
15
import eu.etaxonomy.cdm.model.description.FeatureNode;
16
import eu.etaxonomy.cdm.model.description.FeatureTree;
17
import eu.etaxonomy.cdm.model.description.KeyStatement;
18
import eu.etaxonomy.cdm.model.description.PolytomousKey;
19
import eu.etaxonomy.cdm.model.description.PolytomousKeyNode;
20
import eu.etaxonomy.cdm.model.description.QuantitativeData;
21
import eu.etaxonomy.cdm.model.description.State;
22
import eu.etaxonomy.cdm.model.description.StateData;
23
import eu.etaxonomy.cdm.model.description.StatisticalMeasure;
24
import eu.etaxonomy.cdm.model.description.StatisticalMeasurementValue;
25
import eu.etaxonomy.cdm.model.description.TaxonDescription;
26

    
27
/**
28
 * @author m.venin
29
 *
30
 */
31
public class PolytomousKeyGenerator {
32

    
33
	static int level=-1; // global variable needed by the printTree function in order to store the level which is being printed
34
	private PolytomousKey polytomousKey; // the Identification Key
35
	private List<Feature> features; // the features used to generate the key
36
	private Set<TaxonDescription> taxa; // the base of taxa
37

    
38
	private boolean merge=true; // if this boolean is set to true, branches of the tree will be merged if the corresponding states can be used together without decreasing their score 
39

    
40
	private FeatureTree dependenciesTree; // the tree containing the dependencies between states and features (InapplicableIf and OnlyApplicableIf)
41
	private Map<State,Set<Feature>> iIdependencies = new HashMap<State,Set<Feature>>(); // map of a set of Features (value) inapplicables if a State (key) is present 
42
	private Map<State,Set<Feature>> oAIdependencies = new HashMap<State,Set<Feature>>(); // map of a set of Features (value) only applicables if a State (key) is present
43
	private Map<Feature,Set<Feature>> featureDependencies = new HashMap<Feature,Set<Feature>>(); // map of all the sets of features (values) which have dependencies with states of other features (keys)
44
	private boolean dependenciesON = true; // if this boolean is true, the dependencies are taken into account
45

    
46

    
47
	/**
48
	 * These strings are used for generating the statements of the key.
49
	 */
50
	private String before="<";
51
	private String after=">";
52
	private String separator = " or ";
53

    
54
	/**
55
	 * Sets the features used to generate the key.
56
	 * 
57
	 * @param featuresList
58
	 */
59
	public void setFeatures(List<Feature> featuresList){
60
		this.features = featuresList;
61
	}
62

    
63
	/**
64
	 * Sets the base of taxa.
65
	 * 
66
	 * @param featuresList
67
	 */
68
	public void setTaxa(Set<TaxonDescription> taxaSet){
69
		this.taxa = taxaSet;
70
	}
71

    
72
	/**
73
	 * Sets the tree containing the dependencies between states and features.
74
	 * 
75
	 * @param tree
76
	 */
77
	public void setDependencies(FeatureTree tree){
78
		this.dependenciesTree = tree;
79
	}
80

    
81
	/**
82
	 * Allows the generator to use the dependencies given by the function "setDependencies".
83
	 */
84
	public void turnDependenciesON(){
85
		this.dependenciesON = true;
86
	}
87

    
88
	/**
89
	 * Prevent the generator from using dependencies.
90
	 */
91
	public void turnDependenciesOFF(){
92
		this.dependenciesON = false;
93
	}
94

    
95
	/**
96
	 * Allows the generator to merge branches if the corresponding states can be used together without diminishing their score.
97
	 */
98
	public void mergeModeON(){
99
		this.merge = true;
100
	}
101

    
102
	/**
103
	 * Prevent the generator from merging branches (apart from those leading to the same set of taxa).
104
	 */
105
	public void mergeModeOFF(){
106
		this.merge = false;
107
	}
108

    
109

    
110
	/**
111
	 * Initializes the function buildBranches() with the starting parameters in order to build the key 
112
	 */
113
	private void loop(){
114
		polytomousKey = PolytomousKey.NewInstance();
115
		PolytomousKeyNode root = polytomousKey.getRoot();
116
		buildBranches(root,features,taxa,true);
117
	}
118

    
119

    
120
	/**
121
	 * Creates the key and prints it
122
	 */
123
	public PolytomousKey invoke(){
124
		if (dependenciesON && dependenciesTree!=null){
125
			checkDependencies(dependenciesTree.getRoot());
126
		}
127
		loop();
128
		List<PolytomousKeyNode> rootlist = new ArrayList<PolytomousKeyNode>();
129
		rootlist.add(polytomousKey.getRoot());
130
		//		String spaces = new String();
131
		//		printTree(rootlist,spaces);
132
		//		System.out.println(paths.toString());
133
		return polytomousKey;
134
	}
135

    
136

    
137
	/**
138
	 * Recursive function that builds the branches of the identification key (FeatureTree)
139
	 * 
140
	 * @param father the node considered
141
	 * @param featuresLeft List of features that can be used at this point
142
	 * @param taxaCovered the taxa left at this point (i.e. that verify the description corresponding to the path leading to this node)
143
	 * @param taxaAreTheSame if in the previous level the taxa discriminated are the same, this boolean is set to true, thus if the taxa, again, are not discriminated at this level the function stops
144
	 */
145
	private void buildBranches(PolytomousKeyNode father, List<Feature> featuresLeft, Set<TaxonDescription> taxaCovered, boolean taxaDiscriminated){
146
		boolean childrenExist = false; // boolean indicating if this node has children, if not, it means we reached a leaf and instead of adding a
147
		// question or statement to it, we must add the list of taxa discriminated at this point
148

    
149
		// These sets respectively store the features inapplicables and applicables at this point in the key
150
		Set<Feature> innapplicables = new HashSet<Feature>();
151
		Set<Feature> applicables = new HashSet<Feature>();
152

    
153
		if (taxaCovered.size()>1) {
154
			// this map stores the thresholds giving the best dichotomy of taxa for the corresponding feature supporting quantitative data
155
			Map<Feature,Float> quantitativeFeaturesThresholds = new HashMap<Feature,Float>();
156
			// the scores of the different features are calculated, the thresholds in the same time
157
			Map<Feature,Float> scoreMap = featureScores(featuresLeft, taxaCovered, quantitativeFeaturesThresholds);
158
			dependenciesScores(scoreMap,featuresLeft, taxaCovered, quantitativeFeaturesThresholds);
159
			// the feature with the best score becomes the one corresponding to the current node
160
			Feature winnerFeature = lessStatesWinner(scoreMap, taxaCovered);
161
			// the feature is removed from the list of features available to build the next level of the tree
162
			featuresLeft.remove(winnerFeature);
163

    
164
			int i;
165
			/************** either the feature supports quantitative data... **************/
166
			// NB: in this version, "quantitative features" are dealt with in a dichotomous way
167
			if (winnerFeature!=null && winnerFeature.isSupportsQuantitativeData()) {
168
				// first, get the threshold
169
				float threshold = quantitativeFeaturesThresholds.get(winnerFeature);
170
				String sign;
171
				StringBuilder unit= new StringBuilder("");
172
				// then determine which taxa are before and which are after this threshold (dichotomy) in order to create the children of the father node
173
				List<Set<TaxonDescription>> quantitativeStates = determineQuantitativeStates(threshold,winnerFeature,taxaCovered,unit);
174
				// thus the list contains two sets of TaxonDescription, the first corresponding to those before, the second to those after the threshold
175
				for (i=0;i<2;i++) {
176
					Set<TaxonDescription> newTaxaCovered = quantitativeStates.get(i);
177
					if (i==0){
178
						sign = before; // the first element of the list corresponds to taxa before the threshold
179
					} else {
180
						sign = after; // the second to those after
181
					}
182
					if (newTaxaCovered.size()>0 && !((newTaxaCovered.size()==taxaCovered.size()) && !taxaDiscriminated)){ // if the taxa are discriminated compared to those of the father node, a child is created
183
						childrenExist = true;
184
						PolytomousKeyNode son = PolytomousKeyNode.NewInstance();
185
						son.setFeature(winnerFeature);
186
						// old code used when PolytomousKey extended FeatureTree
187
						//						Representation question = new Representation(null, " " + sign + " " + threshold +unit,null, Language.DEFAULT()); // the question attribute is used to store the state of the feature
188
						//						son.addQuestion(question);
189
						KeyStatement statement = KeyStatement.NewInstance( " " + sign + " " + threshold +unit); // the question attribute is used to store the state of the feature
190
						son.setStatement(statement);
191
						father.addChild(son);
192
						boolean areTheTaxaDiscriminated;
193
						if (newTaxaCovered.size()==taxaCovered.size()) areTheTaxaDiscriminated = false;
194
						else areTheTaxaDiscriminated = true;
195
						buildBranches(son,featuresLeft, newTaxaCovered,areTheTaxaDiscriminated);
196
					}
197
				}
198
			}
199

    
200
			/************** ...or it supports categorical data. **************/
201
			// "categorical features" may present several different states, each one of these might correspond to one child
202
			List<State> statesDone = new ArrayList<State>(); // the list of states already used
203
			int numberOfStates;
204
			if (winnerFeature!=null && winnerFeature.isSupportsCategoricalData()) {
205
				for (TaxonDescription td : taxaCovered){
206
					DescriptionElementBase debConcerned = null;
207

    
208
					// first, get the right DescriptionElementBase
209
					for (DescriptionElementBase deb : td.getElements()) {
210
						if (deb.getFeature().equals(winnerFeature)) debConcerned = deb;
211
					}
212

    
213
					if (debConcerned!=null) {
214
						// a map is created, the key being the set of taxa that present the state(s) stored in the corresponding value
215
						Map<Set<TaxonDescription>,List<State>> taxonStatesMap = determineCategoricalStates(statesDone,(CategoricalData)debConcerned,winnerFeature,taxaCovered);
216
						// if the merge option is ON, branches with the same discriminative power will be merged (see Vignes & Lebbes, 1989)
217
						if (merge){
218
							// creates a map between the different states of the winnerFeature and the sets of states "incompatible" with them
219
							Map<State,Set<State>> exclusions = new HashMap<State,Set<State>>();
220
							featureScoreAndMerge(winnerFeature,taxaCovered,exclusions);
221

    
222
							while (!exclusions.isEmpty()){
223
								// looks for the largest clique, i.e. the state with less exclusions
224
								List<State> clique = returnBestClique(exclusions);
225
								// then merges the corresponding branches
226
								mergeBranches(clique,taxonStatesMap);
227
							}
228
						}
229
						if (taxonStatesMap!=null && !taxonStatesMap.isEmpty()) { 
230
							for (Map.Entry<Set<TaxonDescription>,List<State>> entry : taxonStatesMap.entrySet()){
231
								Set<TaxonDescription> newTaxaCovered = entry.getKey();
232
								List<State> listOfStates = entry.getValue();
233
								if ((newTaxaCovered.size()>0) && !((newTaxaCovered.size()==taxaCovered.size()) && !taxaDiscriminated)){ // if the taxa are discriminated compared to those of the father node, a child is created
234
									childrenExist = true;
235
									PolytomousKeyNode son = PolytomousKeyNode.NewInstance();
236
									StringBuilder questionLabel = new StringBuilder();
237
									numberOfStates = listOfStates.size()-1;
238
									for (State state : listOfStates) {
239
										if (dependenciesON){
240
											// if the dependencies are considered, removes and adds the right features from/to the list of features left
241
											// these features are stored in order to be put back again when the current branch is finished
242
											if (iIdependencies.get(state)!= null) innapplicables.addAll(iIdependencies.get(state));
243
											if (oAIdependencies.get(state)!= null) applicables.addAll(oAIdependencies.get(state));
244
											for (Feature feature : innapplicables) featuresLeft.remove(feature);
245
											for (Feature feature : applicables) featuresLeft.add(feature);
246
										}
247
										questionLabel.append(state.getLabel());
248
										if (listOfStates.lastIndexOf(state)!=numberOfStates) questionLabel.append(separator); // append a separator after each state except for the last one
249
									}
250
									// old code used when PolytomousKey extended FeatureTree
251
									//									Representation question = new Representation(null, questionLabel.toString(),null, Language.DEFAULT());
252
									//									son.addQuestion(question);
253
									KeyStatement statement = KeyStatement.NewInstance(questionLabel.toString());
254
									son.setStatement(statement);
255
									son.setFeature(winnerFeature);
256
									father.addChild(son);
257
									featuresLeft.remove(winnerFeature); // unlike for quantitative features, once a categorical one has been used, it cannot be reused in the same branch
258
									boolean areTheTaxaDiscriminated;
259
									if (newTaxaCovered.size()==taxaCovered.size()) areTheTaxaDiscriminated = true;
260
									else areTheTaxaDiscriminated = false;
261
									buildBranches(son,featuresLeft, newTaxaCovered,areTheTaxaDiscriminated);
262
								}
263
							}
264
						}
265
					}
266
				}
267
			}
268
			// the features depending on other features are added/removed to/from the features left once the branch is done
269
			if (dependenciesON){
270
				for (Feature feature : innapplicables) featuresLeft.add(feature);
271
				for (Feature feature : applicables) featuresLeft.remove(feature);
272
			}
273
			// the winner features are put back to the features left once the branch is done
274
			featuresLeft.add(winnerFeature);
275
		}
276
		// if the node is a leaf, the statement contains the list of taxa to which the current leaf leads.
277
		if (!childrenExist){
278
			KeyStatement fatherStatement = father.getStatement();
279
			if (fatherStatement!=null){
280
				String statementString = fatherStatement.getLabelText(Language.DEFAULT());
281
				if (statementString !=null && taxaCovered != null){
282
					String label = statementString + " --> " + taxaCovered.toString();
283
					fatherStatement.putLabel(Language.DEFAULT(), label);
284
				}
285
			}
286
		}
287
	}
288

    
289

    
290

    
291
	/**
292
	 * Dependencies can be seen as a hierarchy.
293
	 * If the dependencies are on, this function calculates the score of all children features and give the highest score
294
	 * of these to their father (of course, if its score is lower). This way, if a feature has a good score but is
295
	 * "onlyApplicableIf" or "InapplicableIf", the feature it depends can be chosen in order to build a better key.
296
	 * 
297
	 * @param scoreMap
298
	 * @param featuresLeft
299
	 * @param coveredTaxa
300
	 * @param quantitativeFeaturesThresholds
301
	 */
302
	private void dependenciesScores(Map<Feature,Float> scoreMap, List<Feature> featuresLeft,Set<TaxonDescription> coveredTaxa, Map<Feature,Float> quantitativeFeaturesThresholds){
303
		// first copies the list of features left
304
		List<Feature> pseudoFeaturesLeft = featuresLeft.subList(0, featuresLeft.size()-1);
305
		// then adds all the features which depend on features contained in the list
306
		for (Feature feature : pseudoFeaturesLeft){
307
			if (featureDependencies.containsKey(feature)){
308
				for (Feature feature2 : featureDependencies.get(feature)){
309
					if (!pseudoFeaturesLeft.contains(feature2)){
310
						pseudoFeaturesLeft.add(feature2);
311
					}
312
				}
313
			}
314
		}
315
		// then calculates the scores of all features that have been added
316
		Map<Feature,Float> newScoreMap = featureScores(pseudoFeaturesLeft, coveredTaxa, quantitativeFeaturesThresholds);
317
		for (Feature feature : featureDependencies.keySet()){
318
			if (newScoreMap.containsKey(feature)){
319
				for (Feature feature2 : featureDependencies.get(feature)){
320
					if (newScoreMap.containsKey(feature2)) {
321
						// if a feature has a better than the "father" feature it depends on
322
						if (newScoreMap.get(feature2)>newScoreMap.get(feature)){
323
							// the "father" feature gets its score
324
							scoreMap.put(feature, newScoreMap.get(feature2));
325
						}
326
					}
327
				}
328
			}
329
		}
330
	}
331

    
332

    
333
	/**
334
	 * This function merges branches of the key belonging to the same clique
335
	 * (http://en.wikipedia.org/wiki/Clique_%28graph_theory%29)
336
	 * 
337
	 * @param clique the list of States linked together (i.e. if merged have the same score)
338
	 * @param taxonStatesMap the map between the taxa (keys) and the states (keys) leading to them
339
	 */
340
	private void mergeBranches(List<State> clique, Map<Set<TaxonDescription>, List<State>> taxonStatesMap){
341
		boolean stateFound;
342
		Map.Entry<Set<TaxonDescription>,List<State>> firstBranch=null;
343
		List<Set<TaxonDescription>> tdToDelete = new ArrayList<Set<TaxonDescription>>();
344

    
345
		if (clique.size()>1){
346
			Iterator<Map.Entry<Set<TaxonDescription>, List<State>>> it1 = taxonStatesMap.entrySet().iterator();
347
			while (it1.hasNext()){
348
				Map.Entry<Set<TaxonDescription>,List<State>> branch = (Map.Entry<Set<TaxonDescription>, List<State>>)it1.next();
349
				Iterator<State> stateIterator = clique.iterator();
350
				stateFound=false;
351
				// looks for one state of the clique in this branch
352
				while(stateIterator.hasNext() && stateFound!=true) {
353
					State state = stateIterator.next();
354
					if (branch.getValue().contains(state)) {
355
						stateFound=true;
356
					}
357
				}
358
				// if one state is found...
359
				if (stateFound==true){
360
					// ...for the first time, the branch becomes the one to which the others will be merged
361
					if (firstBranch==null){
362
						firstBranch=branch;
363
					}
364
					// ... else the branch is merged to the first one.
365
					else {
366
						firstBranch.getKey().addAll(branch.getKey());
367
						firstBranch.getValue().addAll(branch.getValue());
368
						tdToDelete.add(branch.getKey());
369
					}
370
				}
371
			}
372
			// once this is done, the branches merged to the first one are deleted
373
			for (Set<TaxonDescription> td : tdToDelete){
374
				taxonStatesMap.remove(td);
375
			}
376
		}
377
	}
378

    
379

    
380

    
381
	/**
382
	 * This function looks for the largest clique of States
383
	 * (http://en.wikipedia.org/wiki/Clique_%28graph_theory%29)
384
	 * from the map of exclusions.
385
	 * 
386
	 * @param exclusions map a state (key) to the set of states (value) which can not be merged with it without decreasing its score.
387
	 * @return
388
	 */
389
	private List<State> returnBestClique (Map<State,Set<State>> exclusions){
390
		int bestNumberOfExclusions=-1;;
391
		int numberOfExclusions;
392
		List<State> clique = new ArrayList<State>();
393

    
394
		// looks for the largest clique, i.e. the state with less exclusions
395
		State bestState=null;
396
		for (Iterator<Map.Entry<State,Set<State>>> it1 = exclusions.entrySet().iterator() ; it1.hasNext();){
397
			Map.Entry<State,Set<State>> pair = (Map.Entry<State,Set<State>>)it1.next();
398
			numberOfExclusions = pair.getValue().size();
399
			if ((bestNumberOfExclusions==-1) || numberOfExclusions<bestNumberOfExclusions) {
400
				bestNumberOfExclusions=numberOfExclusions;
401
				bestState = pair.getKey();
402
			}
403
		}
404

    
405
		clique.add(bestState);
406
		exclusions.remove(bestState);
407

    
408
		boolean isNotExcluded;
409
		// then starts building the clique by adding the states which do not exclude each other
410
		for (Iterator<Map.Entry<State,Set<State>>> iterator = exclusions.entrySet().iterator() ; iterator.hasNext();){
411
			Map.Entry<State,Set<State>> pair = (Map.Entry<State,Set<State>>)iterator.next();
412
			isNotExcluded = true;
413
			for (State state : clique) {
414
				if (pair.getValue().contains(state)) isNotExcluded = false;
415
			}
416
			if (isNotExcluded){
417
				clique.add(pair.getKey());
418
			}
419
		}
420
		for (State state : clique) {
421
			exclusions.remove(state);
422
		}
423
		return clique;
424
	}
425

    
426

    
427
	/**
428
	 * fills a map of the sets of taxa (key) presenting the different states (value) for the given feature.
429
	 * 
430
	 * @param statesDone the list of states already done for this feature
431
	 * @param categoricalData the element from which the states are extracted
432
	 * @param feature the feature corresponding to the CategoricalData
433
	 * @param taxaCovered the base of taxa considered
434
	 * @return
435
	 */
436
	private Map<Set<TaxonDescription>,List<State>> determineCategoricalStates(List<State> statesDone, CategoricalData categoricalData, Feature feature, Set<TaxonDescription> taxaCovered){
437
		Map<Set<TaxonDescription>,List<State>> childrenStatesMap = new HashMap<Set<TaxonDescription>,List<State>>();
438

    
439
		List<StateData> stateDatas = categoricalData.getStates();
440

    
441
		List<State> states = new ArrayList<State>();
442
		for (StateData sd : stateDatas){
443
			states.add(sd.getState());
444
		}
445

    
446
		for (State featureState : states){ // for each state
447
			if(!statesDone.contains(featureState)){ // if the state hasn't already be considered
448
				statesDone.add(featureState);
449
				Set<TaxonDescription> newTaxaCovered = whichTaxa(feature,featureState,taxaCovered); //gets which taxa present this state
450
				List<State> newStates = childrenStatesMap.get(newTaxaCovered);
451
				if (newStates==null) { // if no states are associated to these taxa, create a new list
452
					newStates = new ArrayList<State>();
453
					childrenStatesMap.put(newTaxaCovered,newStates);
454
				}
455
				newStates.add(featureState); // then add the state to the list
456
			}
457
		}
458
		return childrenStatesMap;
459
	}
460

    
461

    
462
	/**
463
	 * returns the list of taxa from previously covered taxa, which have the state featureState for the given feature
464
	 * 
465
	 * @param feature
466
	 * @param featureState
467
	 * @param taxaCovered
468
	 * @return
469
	 */
470
	private Set<TaxonDescription> whichTaxa(Feature feature, State featureState, Set<TaxonDescription> taxaCovered){
471
		Set<TaxonDescription> newCoveredTaxa = new HashSet<TaxonDescription>();
472
		for (TaxonDescription td : taxaCovered){
473
			Set<DescriptionElementBase> elements = td.getElements();
474
			for (DescriptionElementBase deb : elements){
475
				if (deb.isInstanceOf(CategoricalData.class)) {
476
					if (deb.getFeature().equals(feature)) {
477
						List<StateData> stateDatas = ((CategoricalData)deb).getStates();
478
						for (StateData sd : stateDatas) {
479
							if (sd.getState().equals(featureState)){
480
								newCoveredTaxa.add(td);
481
							}
482
						}
483
					}
484
				} 
485
			}
486
		}
487
		return newCoveredTaxa;
488
	}
489

    
490

    
491
	/**
492
	 * This function returns the feature with the highest score. However, if several features have the same score
493
	 * the one wich leads to less options is chosen (this way, the key is easier to read).
494
	 * 
495
	 * @param nTaxa
496
	 * @param scores
497
	 * @param taxaCovered
498
	 * @return
499
	 */
500
	private Feature lessStatesWinner(Map<Feature,Float> scores, Set<TaxonDescription> taxaCovered){
501
		int nTaxa = taxaCovered.size();
502
		if (nTaxa==1) return null;
503
		float meanScore = defaultMeanScore(nTaxa); 
504
		float bestScore = nTaxa*nTaxa;
505
		List<Feature> bestFeatures = new ArrayList<Feature>(); // if ever different features have the best score, they are put in this list
506
		Feature bestFeature = null;
507
		Iterator<Map.Entry<Feature,Float>> it = scores.entrySet().iterator();
508
		float newScore;
509
		while (it.hasNext()){
510
			Map.Entry<Feature,Float> pair = (Map.Entry<Feature,Float>)it.next();
511
			if (pair.getValue()!=null){
512
				newScore = Math.abs((Float)pair.getValue()-meanScore);// the best score is the closest to the score (meanScore here)
513
				// a feature would have if it divided the taxa in two equal parts
514
				if (newScore < bestScore){
515
					bestFeatures.clear();
516
					bestFeatures.add((Feature)pair.getKey());
517
					bestScore = newScore;
518
				}else if (newScore==bestScore){
519
					bestFeatures.add((Feature)pair.getKey());
520
				}
521
			}
522
		}
523
		if (bestFeatures.size()==1) { // if there is only one feature with the best score, pick it
524
			return bestFeatures.get(0);
525
		}
526
		else { // else choose the one with less states
527
			int lessStates=-1;
528
			int numberOfDifferentStates=-1;
529
			for (Feature feature : bestFeatures){
530
				if (feature.isSupportsCategoricalData()){
531
					Set<State> differentStates = new HashSet<State>();
532
					for (TaxonDescription td : taxaCovered){
533
						Set<DescriptionElementBase> elements = td.getElements();
534
						for (DescriptionElementBase deb : elements){
535
							if (deb.isInstanceOf(CategoricalData.class)) {
536
								CategoricalData catdat = (CategoricalData)deb;
537
								if (catdat.getFeature().equals(feature)) {
538
									List<StateData> stateDatas = catdat.getStates();
539
									for (StateData sd : stateDatas) {
540
										differentStates.add(sd.getState());
541
									}
542
								}
543
							}
544
						} 
545
					}
546
					numberOfDifferentStates=differentStates.size();
547
				}else if (feature.isSupportsQuantitativeData()){
548
					numberOfDifferentStates=2;
549
				}
550
				if (lessStates==-1 || numberOfDifferentStates<lessStates){
551
					lessStates=numberOfDifferentStates;
552
					bestFeature = feature;
553
				}
554
			}
555
			return bestFeature;
556
		}
557
	}
558

    
559

    
560
	/**
561
	 * This function calculates the mean score, i.e. the score a feature dividing the taxa in two equal parts
562
	 * would have.
563
	 * 
564
	 * @param nTaxa
565
	 * @return
566
	 */
567
	private float defaultMeanScore(int nTaxa){
568
		int i;
569
		float score=0;
570
		for (i=1;i<nTaxa;i++){
571
			score = score + Math.round((float)(i+1/2));
572
		}
573
		return score;
574
	}
575

    
576

    
577
	/**
578
	 * This function fills the map of features (keys) with their respecting scores (values)
579
	 * 
580
	 * @param featuresLeft
581
	 * @param coveredTaxa
582
	 * @param quantitativeFeaturesThresholds
583
	 * @return
584
	 */
585
	private Map<Feature,Float> featureScores(List<Feature> featuresLeft, Set<TaxonDescription> coveredTaxa, Map<Feature,Float> quantitativeFeaturesThresholds){
586
		Map<Feature,Float> scoreMap = new HashMap<Feature,Float>();
587
		for (Feature feature : featuresLeft){
588
			if (feature.isSupportsCategoricalData()) {
589
				scoreMap.put(feature, categoricalFeatureScore(feature,coveredTaxa));
590
			}
591
			if (feature.isSupportsQuantitativeData()){
592
				scoreMap.put(feature, quantitativeFeatureScore(feature,coveredTaxa, quantitativeFeaturesThresholds));
593
			}
594
		}
595
		return scoreMap;
596
	}
597

    
598
	/**
599
	 * Since Quantitative features do not have states, unlike Categorical ones, this function determines which taxa,
600
	 * for a given quantitative feature, present either a lower or higher value than a given threshold.
601
	 * It returns two Sets of TaxonDescription, one with the taxa under this threshold (taxaBefore) and another one
602
	 * with the taxa over (taxaAfter).
603
	 * 
604
	 * @param threshold
605
	 * @param feature
606
	 * @param taxa
607
	 * @param unit
608
	 * @return
609
	 */
610
	private List<Set<TaxonDescription>> determineQuantitativeStates (Float threshold, Feature feature, Set<TaxonDescription> taxa, StringBuilder unit){
611
		List<Set<TaxonDescription>> list = new ArrayList<Set<TaxonDescription>>();
612
		Set<TaxonDescription> taxaBefore = new HashSet<TaxonDescription>();
613
		Set<TaxonDescription> taxaAfter = new HashSet<TaxonDescription>();
614
		list.add(taxaBefore);
615
		list.add(taxaAfter);
616
		for (TaxonDescription td : taxa){
617
			Set<DescriptionElementBase> elements = td.getElements();
618
			for (DescriptionElementBase deb : elements){
619
				if (deb.getFeature().equals(feature)) {
620
					if (deb.isInstanceOf(QuantitativeData.class)) {
621
						QuantitativeData qd = (QuantitativeData)deb;
622
						if (unit.toString().equals("") && qd.getUnit()!=null && qd.getUnit().getLabel()!=null){
623
							unit.append(" " + qd.getUnit().getLabel());
624
						}
625
						Set<StatisticalMeasurementValue> values = qd.getStatisticalValues();
626
						for (StatisticalMeasurementValue smv : values){
627
							StatisticalMeasure type = smv.getType();
628
							//TODO DONT FORGET sample size, MEAN etc
629
							if (type.equals(StatisticalMeasure.MAX()) || type.equals(StatisticalMeasure.TYPICAL_UPPER_BOUNDARY()) || type.equals(StatisticalMeasure.AVERAGE())) {
630
								if (smv.getValue()>threshold){
631
									taxaAfter.add(td);
632
								}
633
							}
634
							if (type.equals(StatisticalMeasure.MIN()) || type.equals(StatisticalMeasure.TYPICAL_LOWER_BOUNDARY()) || type.equals(StatisticalMeasure.AVERAGE())) {
635
								if (smv.getValue()<=threshold){
636
									taxaBefore.add(td);
637
								}
638
							}
639
						}
640
					}
641
				}
642
			}
643
		}
644
		return list;
645
	}
646

    
647

    
648

    
649
	/**
650
	 * This function returns the score of a quantitative feature.
651
	 * 
652
	 * @param feature
653
	 * @param coveredTaxa
654
	 * @param quantitativeFeaturesThresholds
655
	 * @return
656
	 */
657
	private float quantitativeFeatureScore(Feature feature, Set<TaxonDescription> coveredTaxa, Map<Feature,Float> quantitativeFeaturesThresholds){
658
		List<Float> allValues = new ArrayList<Float>();
659
		boolean lowerboundarypresent;
660
		boolean upperboundarypresent;
661
		float lowerboundary=0;
662
		float upperboundary=0;
663
		for (TaxonDescription td : coveredTaxa){
664
			Set<DescriptionElementBase> elements = td.getElements();
665
			for (DescriptionElementBase deb : elements){
666
				if (deb.getFeature().equals(feature)) {
667
					if (deb.isInstanceOf(QuantitativeData.class)) {
668
						QuantitativeData qd = (QuantitativeData)deb;
669
						Set<StatisticalMeasurementValue> values = qd.getStatisticalValues();
670
						lowerboundarypresent = false;
671
						upperboundarypresent = false;
672
						for (StatisticalMeasurementValue smv : values){
673
							StatisticalMeasure type = smv.getType();
674
							// DONT FORGET sample size, MEAN etc
675
							if (type.equals(StatisticalMeasure.MAX())) {
676
								upperboundary = smv.getValue();
677
								upperboundarypresent=true;
678
							}
679
							if (type.equals(StatisticalMeasure.MIN())) {
680
								lowerboundary = smv.getValue();
681
								lowerboundarypresent=true;
682
							}
683
							if (type.equals(StatisticalMeasure.TYPICAL_UPPER_BOUNDARY()) && upperboundarypresent==false) {
684
								upperboundary = smv.getValue();
685
								upperboundarypresent=true;
686
							}
687
							if (type.equals(StatisticalMeasure.TYPICAL_LOWER_BOUNDARY()) && lowerboundarypresent==false) {
688
								lowerboundary = smv.getValue();
689
								lowerboundarypresent=true;
690
							}
691
							if (type.equals(StatisticalMeasure.AVERAGE()) && upperboundarypresent==false && lowerboundarypresent==false) {
692
								lowerboundary = smv.getValue();
693
								upperboundary = lowerboundary;
694
								lowerboundarypresent=true;
695
								upperboundarypresent=true;
696
							}
697
						}
698
						if (lowerboundarypresent && upperboundarypresent) {
699
							allValues.add(lowerboundary);
700
							allValues.add(upperboundary);
701
						}
702
					}
703
				}
704
			}
705
		}
706
		int i,j;
707
		float threshold=0;
708
		float bestThreshold=0;
709
		int difference=allValues.size();
710
		int differenceMin = difference;
711
		int taxaBefore=0;
712
		int taxaAfter=0;
713
		for (i=0;i<allValues.size()/2;i++) {
714
			threshold = allValues.get(i*2+1);
715
			taxaBefore=0;
716
			taxaAfter=0;
717
			for (j=0;j<allValues.size()/2;j++) {
718
				if (allValues.get(j*2+1)<=threshold){
719
					taxaBefore++;
720
				}
721
				if (allValues.get(j*2)>threshold){
722
					taxaAfter++;
723
				}
724
			}
725
			difference = Math.abs(taxaBefore-taxaAfter);
726
			if (difference<differenceMin){
727
				differenceMin=difference;
728
				bestThreshold = threshold;
729
			}
730
		}
731
		quantitativeFeaturesThresholds.put(feature, bestThreshold);
732
		int defaultQuantitativeScore=0;
733
		for (i=0;i<taxaBefore;i++) {
734
			defaultQuantitativeScore += taxaAfter - i;
735
		}
736
		return (float)(defaultQuantitativeScore);
737
	}
738

    
739

    
740

    
741
	/**
742
	 * This function returns the score of a categorical feature.
743
	 * 
744
	 * @param feature
745
	 * @param coveredTaxa
746
	 * @return
747
	 */
748
	private float categoricalFeatureScore(Feature feature, Set<TaxonDescription> coveredTaxa){
749
		int i,j;
750
		float score =0;
751
		float power=0;
752
		TaxonDescription[] coveredTaxaArray = coveredTaxa.toArray(new TaxonDescription[coveredTaxa.size()]); // I did not figure a better way to do this
753
		for (i=0 ; i<coveredTaxaArray.length; i++){
754
			Set<DescriptionElementBase> elements1 = coveredTaxaArray[i].getElements();
755
			DescriptionElementBase deb1 = null;
756
			for (DescriptionElementBase deb : elements1){
757
				if (deb.getFeature().equals(feature)){
758
					deb1 = deb; // finds the DescriptionElementBase corresponding to the concerned Feature
759
				}
760
			}
761
			for (j=i+1 ; j< coveredTaxaArray.length ; j++){
762
				Set<DescriptionElementBase> elements2 = coveredTaxaArray[j].getElements();
763
				DescriptionElementBase deb2 = null;
764
				for (DescriptionElementBase deb : elements2){
765
					if (deb.getFeature().equals(feature)){
766
						deb2 = deb; // finds the DescriptionElementBase corresponding to the concerned Feature
767
					}
768
				}
769
				power = defaultPower(deb1,deb2);
770
				score = score + power;
771
			}
772
		}
773
		return score;
774
	}
775

    
776

    
777
	/**
778
	 * This recursive function fills the maps of dependencies by reading the tree containing the dependencies.
779
	 * 
780
	 * @param node
781
	 */
782
	private void checkDependencies(FeatureNode node){
783
		if (node.getOnlyApplicableIf()!=null){
784
			Set<State> addToOAI = node.getOnlyApplicableIf();
785
			for (State state : addToOAI){
786
				if (oAIdependencies.containsKey(state)) oAIdependencies.put(state, new HashSet<Feature>());
787
				oAIdependencies.get(state).add(node.getFeature());
788
			}
789
		}
790
		if (node.getInapplicableIf()!=null){
791
			Set<State> addToiI = node.getInapplicableIf();
792
			for (State state : addToiI){
793
				if (iIdependencies.containsKey(state)) iIdependencies.put(state, new HashSet<Feature>());
794
				iIdependencies.get(state).add(node.getFeature());
795
			}
796
		}
797
		if (node.getChildren()!=null) {
798
			for (FeatureNode fn : node.getChildren()){
799
				checkDependencies(fn);
800
			}
801
		}
802
	}
803

    
804

    
805

    
806
	/**
807
	 * This function fills the exclusions map.
808
	 * 
809
	 * @param feature
810
	 * @param coveredTaxa
811
	 * @param exclusions
812
	 * @return
813
	 */
814
	private float featureScoreAndMerge(Feature feature, Set<TaxonDescription> coveredTaxa, Map<State,Set<State>> exclusions){
815
		int i,j;
816
		float score =0;
817
		float power=0;
818
		TaxonDescription[] coveredTaxaArray = coveredTaxa.toArray(new TaxonDescription[coveredTaxa.size()]); // I did not figure a better way to do this
819
		for (i=0 ; i<coveredTaxaArray.length; i++){
820
			Set<DescriptionElementBase> elements1 = coveredTaxaArray[i].getElements();
821
			DescriptionElementBase deb1 = null;
822
			for (DescriptionElementBase deb : elements1){
823
				if (deb.getFeature().equals(feature)){
824
					deb1 = deb; // finds the DescriptionElementBase corresponding to the concerned Feature
825
				}
826
			}
827
			for (j=i+1 ; j< coveredTaxaArray.length ; j++){
828
				Set<DescriptionElementBase> elements2 = coveredTaxaArray[j].getElements();
829
				DescriptionElementBase deb2 = null;
830
				for (DescriptionElementBase deb : elements2){
831
					if (deb.getFeature().equals(feature)){
832
						deb2 = deb; // finds the DescriptionElementBase corresponding to the concerned Feature
833
					}
834
				}
835
				power = defaultPower(deb1,deb2);
836
				score = score + power;
837
				if (power>0) // if there is no state in common between deb1 and deb2
838
				{
839
					CategoricalData cat1 = (CategoricalData)deb1;
840
					CategoricalData cat2 = (CategoricalData)deb2;
841
					for (StateData statedata1 : cat1.getStates()){
842
						State state1 = statedata1.getState();
843
						if (!exclusions.containsKey(state1)){
844
							exclusions.put(state1, new HashSet<State>());
845
						}
846
						for (StateData statedata2 : cat2.getStates()){
847
							State state2 = statedata2.getState();
848
							if (!exclusions.containsKey(state2)){
849
								exclusions.put(state2, new HashSet<State>());
850
							}
851
							exclusions.get(state1).add(state2);
852
							exclusions.get(state2).add(state1);
853
						}
854
					}
855
				}
856
			}
857
		}
858
		return score;
859
	}
860

    
861

    
862

    
863
	/**
864
	 * Returns the score between two DescriptionElementBase. If one of them is null, returns -1.
865
	 * If they are not of the same type (Categorical) returns 0.
866
	 * 
867
	 * @param deb1
868
	 * @param deb2
869
	 * @return
870
	 */
871
	private float defaultPower(DescriptionElementBase deb1, DescriptionElementBase deb2){
872
		if (deb1==null || deb2==null) {
873
			return -1; //what if the two taxa don't have this feature in common ?
874
		}
875
		if ((deb1.isInstanceOf(CategoricalData.class))&&(deb2.isInstanceOf(CategoricalData.class))) {
876
			return defaultCategoricalPower((CategoricalData)deb1, (CategoricalData)deb2);
877
		}
878
		else return 0;
879
	}
880

    
881
	/**
882
	 * Returns the score of a categorical feature.
883
	 * 
884
	 * @param deb1
885
	 * @param deb2
886
	 * @return
887
	 */
888
	private float defaultCategoricalPower(CategoricalData deb1, CategoricalData deb2){
889
		List<StateData> states1 = deb1.getStates();
890
		List<StateData> states2 = deb2.getStates();
891
		boolean bool = false;
892
		Iterator<StateData> stateData1Iterator = states1.iterator() ;
893
		//		while (!bool && stateData1Iterator.hasNext()) {
894
		//			Iterator<StateData> stateData2Iterator = states2.iterator() ;
895
		//			StateData stateData1 = stateData1Iterator.next();
896
		//			while (!bool && stateData2Iterator.hasNext()) {
897
		//				bool = stateData1.getState().equals(stateData2Iterator.next().getState()); // checks if the states are the same
898
		//			}
899
		//		}
900
		// one point each time two taxa can be discriminated for a given feature
901

    
902
		boolean checkFeature = false;
903

    
904
		if (!featureDependencies.containsKey(deb1.getFeature())){
905
			featureDependencies.put(deb1.getFeature(), new HashSet<Feature>());
906
			checkFeature = true;
907
		}
908

    
909
		while (stateData1Iterator.hasNext()) {
910
			Iterator<StateData> stateData2Iterator = states2.iterator() ;
911
			StateData stateData1 = stateData1Iterator.next();
912
			State state1 = stateData1.getState();
913
			if (checkFeature){
914
				if (iIdependencies.get(state1)!=null) {
915
					featureDependencies.get(deb1.getFeature()).addAll(iIdependencies.get(state1));
916
				}
917
				if (oAIdependencies.get(state1)!=null) {
918
					featureDependencies.get(deb1.getFeature()).addAll(oAIdependencies.get(state1));
919
				}
920
			}
921
			while (stateData2Iterator.hasNext()) {
922
				State state2 = stateData1.getState();
923
				bool = bool || state1.equals(state2); // checks if the states are the same
924
			}
925
		}
926

    
927

    
928
		if (bool) return 0;
929
		else return 1;
930
	}
931

    
932
	// old code used when PolytomousKey extended FeatureTree
933
	//	private void printTree(List<PolytomousKeyNode> fnodes, String spaces){
934
	//		if (!fnodes.isEmpty()){
935
	//			level++;
936
	//			int levelcopy = level;
937
	//			int j=1;
938
	//			String delimiter;
939
	//			String equals = " = ";
940
	//			String quantitative = "";
941
	//			String newspaces = spaces.concat("\t");
942
	//			for (PolytomousKeyNode fnode : fnodes){
943
	//				if (fnode.getFeature()!=null) {
944
	//					String state = null;
945
	//					if (fnode.getQuestion(Language.DEFAULT())!=null) state = fnode.getQuestion(Language.DEFAULT()).getLabel();
946
	//					if (fnode.getFeature().isSupportsQuantitativeData()) delimiter = quantitative;
947
	//					else delimiter = equals;
948
	//					System.out.println(newspaces + levelcopy + " : " + j + " " + fnode.getFeature().getLabel() + delimiter + state);
949
	//					j++;
950
	//				}
951
	//				else { // TODO never read ?
952
	//					if (fnode.getQuestion(Language.DEFAULT())!=null) System.out.println(newspaces + "-> " + fnode.getQuestion(Language.DEFAULT()).getLabel());
953
	//				}
954
	//				printTree(fnode.getChildren(),newspaces);
955
	//			}
956
	//		}
957
	//	}
958

    
959
}
960

    
    (1-1/1)