package eu.etaxonomy.cdm.strategy.generate;

import java.math.BigDecimal;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.UUID;

import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;

import eu.etaxonomy.cdm.common.BigDecimalUtil;
import eu.etaxonomy.cdm.model.common.CdmBase;
import eu.etaxonomy.cdm.model.common.Language;
import eu.etaxonomy.cdm.model.description.CategoricalData;
import eu.etaxonomy.cdm.model.description.DescriptionBase;
import eu.etaxonomy.cdm.model.description.DescriptionElementBase;
import eu.etaxonomy.cdm.model.description.Feature;
import eu.etaxonomy.cdm.model.description.FeatureState;
import eu.etaxonomy.cdm.model.description.KeyStatement;
import eu.etaxonomy.cdm.model.description.PolytomousKey;
import eu.etaxonomy.cdm.model.description.PolytomousKeyNode;
import eu.etaxonomy.cdm.model.description.QuantitativeData;
import eu.etaxonomy.cdm.model.description.SpecimenDescription;
import eu.etaxonomy.cdm.model.description.State;
import eu.etaxonomy.cdm.model.description.StateData;
import eu.etaxonomy.cdm.model.description.StatisticalMeasure;
import eu.etaxonomy.cdm.model.description.StatisticalMeasurementValue;
import eu.etaxonomy.cdm.model.description.TaxonDescription;
import eu.etaxonomy.cdm.model.occurrence.SpecimenOrObservationBase;
import eu.etaxonomy.cdm.model.taxon.Taxon;
import eu.etaxonomy.cdm.model.taxon.TaxonNode;
import eu.etaxonomy.cdm.model.term.TermNode;

/**
 * Generator that creates a {@link PolytomousKey} from the data of the given taxon or specimen descriptions.
 *
 * @author m.venin
 * @author a.mueller
 */
public class PolytomousKeyGenerator {

    private static final Logger logger = LogManager.getLogger(PolytomousKeyGenerator.class);

    /**
51
     * Strings used for generating the statements of the key.
52
     */
53
    private static final String before="<";
54
    private static final String after=">";
55
    private static final String separator = " or ";
56

    
57

    
58
    private PolytomousKeyGeneratorConfigurator config;
59

    
	private Map<FeatureState,Set<Feature>> iAifDependencies = new HashMap<>(); // maps a feature state (key) to the set of features (value) that are inapplicable if this state is present
	private Map<FeatureState,Set<Feature>> oAifDependencies = new HashMap<>(); // maps a feature state (key) to the set of features (value) that are only applicable if this state is present
	private Map<Feature,Set<Feature>> featureDependencies = new HashMap<>(); // maps a feature (key) to the set of features (value) whose applicability depends on one of its states
63

    
64
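	/**
	 * Helper container that bundles a described taxon (or specimen) with its categorical and
	 * quantitative character data, grouped by feature, and with its child taxa.
	 */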
	private class KeyTaxon{
65
	    private UUID uuid;
66
	    private Taxon taxon;
67
	    private SpecimenOrObservationBase<?> specimen;
68
	    private Map<Feature,Set<CategoricalData>> categoricalData = new HashMap<>();
69
        private Map<Feature,Set<QuantitativeData>> quantitativeData = new HashMap<>();
70
        private Set<KeyTaxon> children = new HashSet<>();
71

    
72
        private Set<CategoricalData> getCategoricalData(Feature feature){
73
            return categoricalData.get(feature) == null? new HashSet<>():categoricalData.get(feature);
74
        }
75
        private Set<QuantitativeData> getQuantitativeData(Feature feature){
76
            return quantitativeData.get(feature) == null? new HashSet<>():quantitativeData.get(feature);
77
        }
78

    
79
        private void addDescription(DescriptionBase<?> db) {
80
            for (DescriptionElementBase deb : db.getElements()){
81
                Feature feature = deb.getFeature();
82
                if (deb.isCharacterData()){
83
                    if (deb.isInstanceOf(CategoricalData.class)){
84
                        CategoricalData cd = CdmBase.deproxy(deb, CategoricalData.class);
85
                        if (categoricalData.get(feature)== null){
86
                            categoricalData.put(feature, new HashSet<>());
87
                        }
88
                        categoricalData.get(feature).add(cd);
89
                    }else if (deb.isInstanceOf(QuantitativeData.class)){
90
                        QuantitativeData qd = CdmBase.deproxy(deb, QuantitativeData.class);
91
                        if (quantitativeData.get(feature)== null){
92
                            quantitativeData.put(feature, new HashSet<>());
93
                        }
94
                        quantitativeData.get(feature).add(qd);
95
                    }
96
                }
97
            }
98
        }
99

    
100
        @Override
101
        public String toString() {
102
            return taxon.getTitleCache(); // + "KeyTaxon [uuid=" + uuid + (taxon != null? ", " + taxon.getTitleCache():"") + "]";  // uuid=" + uuid +
103
        }
104
	}
105

    
106
// *************************** METHODS ***************************************/
107

    
108
    /**
     * Creates the polytomous key as defined by the given configuration.
     */
    public PolytomousKey invoke(PolytomousKeyGeneratorConfigurator config){
        if (config == null){
            throw new NullPointerException("PolytomousKeyGeneratorConfigurator must not be null");
        }
        this.config = config;
        if (this.config.isUseDependencies()){
            createDependencies(config.getDependenciesTree().getRoot());
        }
        PolytomousKey polytomousKey = PolytomousKey.NewInstance();
        PolytomousKeyNode root = polytomousKey.getRoot();
        @SuppressWarnings("unchecked")
        Set<KeyTaxon> taxaCovered = makeKeyTaxa((Set)config.getTaxonDescriptions());
        taxaCovered = replaceSingleRoot(taxaCovered);

        //filter if a feature is available only for certain states in this branch
        Map<Feature, Set<State>> featureStatesFilter = new HashMap<>();
        //TODO what if size(taxaCovered) <= 1, is this covered by algo? Write test to check
        buildBranches(root, config.getFeatures(), taxaCovered, featureStatesFilter);
        return polytomousKey;
    }
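
    /*
     * Minimal usage sketch (illustrative only; how the configurator is populated, e.g. via setters,
     * is an assumption - only the getters called in this class are confirmed here):
     *
     *   PolytomousKeyGeneratorConfigurator config = ...; // must provide taxon descriptions and features
     *   PolytomousKey key = new PolytomousKeyGenerator().invoke(config);
     *   PolytomousKeyNode root = key.getRoot();          // traverse the generated key from here
     */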
131

    
132
    /**
133
     * If the root taxon is a single taxon but has children
134
     * it will be replaced by its children.
135
     */
136
    private Set<KeyTaxon> replaceSingleRoot(Set<KeyTaxon> taxaCovered) {
137
        if (this.config.isUseTaxonHierarchy() && taxaCovered.size()==1
138
                && !taxaCovered.iterator().next().children.isEmpty()){
139
            return replaceSingleRoot(taxaCovered.iterator().next().children);
140
        }else{
141
            return taxaCovered;
142
        }
143
    }
144

    
145
    private Set<KeyTaxon> makeKeyTaxa(Set<DescriptionBase<?>> descriptions) {
146

    
147
        Map<UUID,KeyTaxon> taxonMap = new HashMap<>();
148
        for (DescriptionBase<?> db : descriptions){
149
            KeyTaxon taxon = new KeyTaxon();
150
            if (db.isInstanceOf(TaxonDescription.class)){
151
                TaxonDescription td = CdmBase.deproxy(db, TaxonDescription.class);
152
                taxon.uuid = td.getTaxon().getUuid();
153
                taxon.taxon = td.getTaxon();
154
            }else if (db.isInstanceOf(SpecimenDescription.class)){
155
                SpecimenDescription sd = CdmBase.deproxy(db, SpecimenDescription.class);
156
                taxon.uuid = sd.getDescribedSpecimenOrObservation().getUuid();
157
                taxon.specimen = sd.getDescribedSpecimenOrObservation();
158
            }else{
159
                throw new RuntimeException("Unhandled entity type " + db.getClass().getName());
160
            }
161
            if (taxonMap.get(taxon.uuid)!= null){
162
                taxon = taxonMap.get(taxon.uuid);
163
            }else{
164
                taxonMap.put(taxon.uuid, taxon);
165
            }
166
            taxon.addDescription(db);
167
        }
168

    
169
        createTaxonHierarchy(taxonMap);
170

    
171
        return new HashSet<>(taxonMap.values());
172
    }
173

    
174
    private void createTaxonHierarchy(Map<UUID, KeyTaxon> taxonMap) {
175
        if (!config.isUseTaxonHierarchy()){
176
            return;
177
        }
178
        Set<KeyTaxon> taxaToTest = new HashSet<>(taxonMap.values());
179
        for(KeyTaxon taxon:taxaToTest){
180
            KeyTaxon parent = getBestTaxonParent(taxon, taxaToTest);
181
            if (parent != null){
182
                parent.children.add(taxon);
183
                taxonMap.remove(taxon.uuid);
184
            }
185
        }
186
    }
187

    
188
    private KeyTaxon getBestTaxonParent(KeyTaxon taxon, Collection<KeyTaxon> values) {
189
        KeyTaxon parent = null;
190
        String parentIndex = "";
191
        String myTreeIndex = getTaxonTreeIndex(taxon);
192
        if (myTreeIndex != null) {
193
            for (KeyTaxon candidate:values){
194
                String candidateIndex = getTaxonTreeIndex(candidate);
195
                if (candidateIndex == null || myTreeIndex.equals(candidateIndex)){
196
                    continue;
197
                }
198
                if (myTreeIndex.startsWith(candidateIndex)){
199
                    if (candidateIndex.length()> parentIndex.length()){
200
                        parent = candidate;
201
                        parentIndex = candidateIndex;
202
                    }
203
                }
204
            }
205
        }
206
        return parent;
207
    }
208

    
209
    private String getTaxonTreeIndex(KeyTaxon taxon) {
210
        if (taxon.taxon.getTaxonNodes().isEmpty()){
211
            return null;
212
        }
213
        //TODO size>1 or classification check
214
        TaxonNode node = taxon.taxon.getTaxonNodes().iterator().next();
215
        String treeIndex = node.treeIndex();
216
        if (treeIndex == null){
217
            //unpersisted, this should only happen during test, create provisional treeindex
218
            treeIndex = getParentTreeIndex(node) + node.getUuid().toString() + "#" ;
219
        }
220
        return treeIndex;
221
    }
222

    
223
    private String getParentTreeIndex(TaxonNode node) {
224
        TaxonNode parent = node.getParent();
225
        if (parent == null ){
226
            return "#";
227
        }else{
228
            return getParentTreeIndex(parent) + parent.getUuid().toString() + "#" ;
229
        }
230
    }
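
    // Example of the provisional tree index built above (illustrative): for a node with a single
    // parent the result is "#" + parentUuid + "#" + nodeUuid + "#".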
231

    
232
    /**
233
	 * Recursive function that builds the branches of the identification key
234
	 *
235
	 * @param parent the node considered
236
	 * @param featuresLeft List of features that can be used at this point
237
	 * @param taxaCovered the taxa left at this point (i.e. that verify the description corresponding to the path leading to this node)
238
     * @param featureStatesFilter
239
	 */
240
	private void buildBranches(PolytomousKeyNode parent, List<Feature> featuresLeft, Set<KeyTaxon> taxaCovered,
241
	        Map<Feature, Set<State>> featureStatesFilter){
242

    
243
	    //handle all branches taxa =>
244
        Set<KeyTaxon> allBranchesTaxa = getAllBranchesTaxa(featuresLeft, taxaCovered, featureStatesFilter);
245
        if (allBranchesTaxa.size()>0){
246
            if (allBranchesTaxa.size()>1){
247
            //TODO test if this case is handled and displayed correctly
248
                logger.warn(">1 final taxa in inner node");
249
            }
250
            taxaCovered.removeAll(allBranchesTaxa);
251
            if(taxaCovered.size() != 1){
252
                handleLeaf(parent, allBranchesTaxa);
253
            }else{
254
                //if only 1 is left it is better to handle all remaining in sub-branch to make difference clearer
255
                taxaCovered.addAll(allBranchesTaxa);
256
            }
257
        }
258

    
259
        //start real branching
260
	    if (taxaCovered.size()<=1){
261
		    //do nothing
262
	        logger.warn("Only 1 or no taxon covered. This should currently only be possible on top level and is not yet handled. ");
263
		}else {
264
			// this map stores the thresholds giving the best dichotomy of taxa for the corresponding feature supporting quantitative data
265
			Map<Feature,BigDecimal> quantitativeFeaturesThresholds = new HashMap<>();
266
			// the scores of the different features are calculated, the thresholds in the same time
267
			if (config.isDebug()){
268
			    System.out.println("Feature left: " + featuresLeft + ", taxa: " + taxaCovered);
269
			}
270
			Feature winnerFeature = computeScores(featuresLeft, taxaCovered, quantitativeFeaturesThresholds, featureStatesFilter);
271

    
272
			if(winnerFeature != null){
273
			    /************** either the feature supports quantitative data... **************/
274
			    // NB: in this version, "quantitative features" are dealt with in a dichotomous way
275
			    if (winnerFeature.isSupportsQuantitativeData()) {
276
			        featuresLeft.add(winnerFeature);  //as quantitative data are currently only split in 2 parts there might be further splits possible and needed
277
			        handleQuantitativeData(parent, featuresLeft, taxaCovered,
278
			                quantitativeFeaturesThresholds, winnerFeature, featureStatesFilter);
279
			    }
280
			    /************** ...or it supports categorical data. **************/
281
			    else  if (winnerFeature.isSupportsCategoricalData()) {
282
			        handleCategorialFeature(parent, featuresLeft, taxaCovered,
283
			                winnerFeature, featureStatesFilter);
284
			    }else{
285
	                throw new RuntimeException("Winner feature does not support character data.");
286
			    }
287
			    // the winner features are put back to the features left once the branch is done
288
			    if (!featuresLeft.contains(winnerFeature)){  //TODO why is this a list and not a set?
289
			        featuresLeft.add(winnerFeature);
290
			    }
291
			}else if (featuresLeft.isEmpty()){
292
			    handleLeaf(parent, taxaCovered);
293
			}else{
294
			    throw new RuntimeException("No winner feature but features left to handle should not happen.");
295
			}
296
		}
297
        return;
298
	}
299

    
300
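    /**
     * Returns those covered taxa that show all states (for categorical features) and whose value range
     * spans the overall value range (for quantitative features) of every feature left, i.e. taxa that
     * can not be discriminated by the remaining features and therefore occur in all sub-branches.
     */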
    private Set<KeyTaxon> getAllBranchesTaxa(List<Feature> featuresLeft, Set<KeyTaxon> taxaCovered,
301
            Map<Feature, Set<State>> featureStatesFilter) {
302

    
303
        Set<KeyTaxon> candidates = new HashSet<>(taxaCovered);
304
        List<Feature> dependendFeatures = new ArrayList<>();
305
        for (Feature feature : featuresLeft){
306
            if(feature.isSupportsCategoricalData()){
307
                Set<State> allStates = getAllStates(feature, taxaCovered, featureStatesFilter.get(feature));
308
                Iterator<KeyTaxon> it = candidates.iterator();
309
                while (it.hasNext()){
310
                    Set<KeyTaxon> taxonSet = new HashSet<>(Arrays.asList(it.next()));
311
                    Set<State> taxonStates = getAllStates(feature, taxonSet, featureStatesFilter.get(feature));
312
                    if(allStates.size() > taxonStates.size()){
313
                        it.remove();
314
                    }
315
                }
316
                if(candidates.isEmpty()){
317
                    break;
318
                }else{
319
                    addDependentFeatures(dependendFeatures, feature, new HashSet<>(), new ArrayList<>(allStates));
320
                }
321
            }else if (feature.isSupportsQuantitativeData()){
322
                Iterator<KeyTaxon> it = candidates.iterator();
323
                while (it.hasNext()){
324
                    BigDecimal min = BigDecimalUtil.MAX_BIGDECIMAL;
325
                    BigDecimal max = BigDecimalUtil.MIN_BIGDECIMAL;
326
                    Set<QuantitativeData> qds = it.next().quantitativeData.get(feature);
327
                    qds = qds == null? new HashSet<>(): qds;
328
                    for (QuantitativeData qd : qds){
329
                        BigDecimal qdMin = qd.getOverallMin();
330
                        if(qdMin != null){
331
                            min = min.min(qdMin);
332
                        }
333
                        BigDecimal qdMax = qd.getOverallMax();
334
                        if(qdMax != null){
335
                            max = max.max(qdMax);
336
                        }
337
                    }
338
                    boolean staysCandidate = true;
339
                    for(KeyTaxon taxon : taxaCovered){
340
                        Set<QuantitativeData> tqds = taxon.quantitativeData.get(feature);
341
                        tqds = tqds == null? new HashSet<>(): tqds;
342
                        for (QuantitativeData qd : tqds){
343
                            staysCandidate &= qd.getOverallMin() == null || qd.getOverallMin().compareTo(min) >= 0;
344
                            staysCandidate &= qd.getOverallMax() == null || qd.getOverallMax().compareTo(max) <= 0;
345
                        }
346
                        if (!staysCandidate){
347
                            break;
348
                        }
349
                    }
350
                    if (!staysCandidate){
351
                        it.remove();
352
                    }
353
                }
354
            }
355
        }
356
        if(config.isUseDependencies() && !dependendFeatures.isEmpty() && !candidates.isEmpty()){
            Set<KeyTaxon> dependentCandidates = getAllBranchesTaxa(dependendFeatures, taxaCovered, featureStatesFilter);
            candidates.retainAll(dependentCandidates);
        }
360
        return candidates;
361
    }
362

    
363
    /**
364
     * Creates a leaf. It adds the taxa to the parent node as linked taxa. Handles a
     * list of multiple taxa and handles "specimen taxa" (not yet fully implemented).
366
     * @return taxa which exist in ALL sub-branches and therefore can be linked on higher level
367
     */
368
	private Set<KeyTaxon> handleLeaf(PolytomousKeyNode parent, Set<KeyTaxon> taxaCovered) {
369
        KeyStatement parentStatement = parent.getStatement();
370
        for(KeyTaxon taxon: taxaCovered){
371
            if  (taxon.taxon != null){
372
                parent.setOrAddTaxon(taxon.taxon);
373
            }else{
374
                //FIXME handle other descriptions like specimen descriptions better
375
                if (parentStatement!=null){
376
                    String statementString = parentStatement.getLabelText(Language.DEFAULT());
377
                    if (statementString !=null && taxon.specimen != null){
378
                        String label = statementString + " --> " + taxon.specimen.getTitleCache();
379
                        parentStatement.putLabel(Language.DEFAULT(), label);
380
                    }
381
                }
382
            }
383
        }
384
        return taxaCovered;
385
    }
386

    
387
    /**
388
     * "categorical features" may present several different states/branches,
389
     * each one of these might correspond to one child.
390
     */
391
    private void handleCategorialFeature(PolytomousKeyNode parent, List<Feature> featuresLeft,
392
            Set<KeyTaxon> taxaCovered,
393
            Feature winnerFeature,
394
            Map<Feature, Set<State>> featureStatesFilter) {
395

    
396
        Map<Set<KeyTaxon>,Boolean> reuseWinner = new HashMap<>();
397

    
398
        Set<State> allStates = getAllStates(winnerFeature, taxaCovered, featureStatesFilter.get(winnerFeature));
399
		// a map is created, the key being the set of taxa that present the state(s) stored in the corresponding value
400
        // this key represents a single branch in the decision tree
401
		Map<Set<KeyTaxon>, List<State>> taxonStatesMap
402
		        = determineCategoricalStates(allStates, winnerFeature, taxaCovered, featureStatesFilter.get(winnerFeature));
403

    
404
		if (taxonStatesMap.size()<=1){
405
		    if (notEmpty(featureDependencies.get(winnerFeature))){
406
		        //TODO is empty list correctly handled here? Seems to happen if incorrect data (e.g. only Max values) for quantdata exist
407
		        List<State> stateList = taxonStatesMap.isEmpty()? new ArrayList<>(): taxonStatesMap.values().iterator().next();
408
		        Set<Feature> featuresAdded = new HashSet<>();
409
		        addDependentFeatures(featuresLeft, winnerFeature, featuresAdded, stateList);
410
		        featuresLeft.remove(winnerFeature);
411
		        buildBranches(parent, featuresLeft, taxaCovered, featureStatesFilter);
412
		        removeAddedDependendFeatures(featuresLeft, featuresAdded);
413
		    }else{
414
		        //if only 1 branch is left we can handle this as a leaf, no matter how many taxa are left
415
		        handleLeaf(parent, taxaCovered);
416
		    }
417
		}else {
418
		    // if the merge option is ON, branches with the same discriminative power will be merged (see Vignes & Lebbes, 1989)
419
		    if (config.isMerge()){
420
		        taxonStatesMap = handleMerge(taxaCovered, winnerFeature, reuseWinner,
421
		                taxonStatesMap, allStates, featureStatesFilter.get(winnerFeature));
422
		    }
423
		    List<Set<KeyTaxon>> sortedKeys = sortKeys(taxonStatesMap);
424
            for (Set<KeyTaxon> newTaxaCovered : sortedKeys){
425
		        //handle each branch
426
                handleCategoricalBranch(parent, featuresLeft,
427
                        taxaCovered.size(), winnerFeature, reuseWinner, taxonStatesMap, newTaxaCovered, featureStatesFilter);
428
            }
429
		}
430
		return;
431
    }
432

    
433
    private Map<Set<KeyTaxon>, List<State>> handleMerge(Set<KeyTaxon> taxaCovered,
434
            Feature winnerFeature, Map<Set<KeyTaxon>, Boolean> reuseWinner,
435
            Map<Set<KeyTaxon>, List<State>> taxonStatesMap, Set<State> allStates, Set<State> filter) {
436

    
437
        // creates a map between the different states of the winnerFeature and the sets of states "incompatible" with them
438
        Map<State,Set<State>> exclusions = new HashMap<>();
439
        computeExclusions(winnerFeature, taxaCovered, exclusions, filter);
440

    
441
        while (!exclusions.isEmpty()){
442
        	// looks for the largest clique, i.e. the state with the fewest exclusions
443
        	List<State> clique = returnBestClique(exclusions);
444
        	if(clique.containsAll(allStates)){
445
        	    continue;
446
        	}
447
        	// then merges the corresponding branches
448
        	mergeBranches(clique, taxonStatesMap, reuseWinner, filter);
449
        }
450
        //during merge the keySet (set of taxa) may change, therefore they change their hashcode
451
        //and can not be used as keys in the map anymore.
452
        //Therefore we refill the map.
453
        taxonStatesMap = renewTaxonStatesMap(taxonStatesMap);
454
        return taxonStatesMap;
455
    }
456

    
457
    private void handleCategoricalBranch(PolytomousKeyNode parent, List<Feature> featuresLeft,
458
            int taxaCoveredSize,
459
            Feature winnerFeature, Map<Set<KeyTaxon>, Boolean> reuseWinner,
460
            Map<Set<KeyTaxon>, List<State>> taxonStatesMap,
461
            Set<KeyTaxon> newTaxaCovered,
462
            Map<Feature,Set<State>> featureStatesFilter) {
463

    
464
        //to restore old state
465
        Set<State> oldFilterSet = featureStatesFilter.get(winnerFeature);
466
        Set<Feature> featuresAdded = new HashSet<>();
467

    
468
        boolean areTheTaxaDiscriminated = false;
469
        PolytomousKeyNode childNode = PolytomousKeyNode.NewInstance();
470
        parent.addChild(childNode);
471

    
472
        List<State> listOfStates = taxonStatesMap.get(newTaxaCovered);
473
        if ((newTaxaCovered.size() > 0)){ //old: if the taxa are discriminated compared to those of the parent node, a child is created
474
        	areTheTaxaDiscriminated = (newTaxaCovered.size() != taxaCoveredSize);
475

    
476
        	int numberOfStates = listOfStates.size()-1;
477
        	listOfStates.sort(stateComparator);
478

    
479
        	if (config.isUseDependencies()){
480
        	    // old: if the dependencies are considered, removes and adds the right features from/to the list of features left
481
                // these features are stored in order to be put back again when the current branch is finished
482

    
483
        	    addDependentFeatures(featuresLeft, winnerFeature, featuresAdded, listOfStates);
484
        	}
485

    
486
        	String statementLabel = createStatement(listOfStates, numberOfStates);
487
            KeyStatement statement = KeyStatement.NewInstance(statementLabel);
488
        	childNode.setStatement(statement);
489
        	parent.setFeature(winnerFeature);
490

    
491
        	if (reuseWinner.get(newTaxaCovered)== Boolean.TRUE){
492
        	    featuresLeft.add(winnerFeature);
493
        	    setStatesFilter(featureStatesFilter, winnerFeature, listOfStates);
494
        	}else{
495
        	    featuresLeft.remove(winnerFeature);
496
        	}
497
        }
498

    
499
        boolean hasChildren = areTheTaxaDiscriminated && (newTaxaCovered.size() > 1);
500
        if (hasChildren){
501
            buildBranches(childNode, featuresLeft, newTaxaCovered, featureStatesFilter);
502
        }else{
503
            handleLeaf(childNode, newTaxaCovered);
504
            Set<KeyTaxon> taxonChildren = getTaxonChildren(newTaxaCovered);
505
            if(!taxonChildren.isEmpty()){
506
                //TODO FIXME featuresLeft probably needs to include all features, similar for featureStatesFilter
507
                buildBranches(childNode, featuresLeft, taxonChildren, featureStatesFilter);
508
            }
509
        }
510

    
511
        //restore old state before returning to parent node
512
        removeAddedDependendFeatures(featuresLeft, featuresAdded);
513
        featureStatesFilter.put(winnerFeature, oldFilterSet);
514

    
515
        return;
516
    }
517

    
518
    private Set<KeyTaxon> getTaxonChildren(Set<KeyTaxon> newTaxaCovered) {
519
        Set<KeyTaxon> result = new HashSet<>();
520
        for (KeyTaxon taxon:newTaxaCovered){
521
            result.addAll(taxon.children);
522
        }
523
        return result;
524
    }
525

    
526
    private void setStatesFilter(Map<Feature, Set<State>> filter, Feature feature,
527
            List<State> listOfStates) {
528
        if (filter.get(feature)==null){
529
            filter.put(feature, new HashSet<>(listOfStates));
530
        }else{
531
            Set<State> set = filter.get(feature);
532
            set.retainAll(listOfStates);
533
        }
534
    }
535

    
536
    private void removeAddedDependendFeatures(List<Feature> featuresLeft, Set<Feature> featuresAdded) {
537
        for (Feature addedFeature : featuresAdded) {
538
            featuresLeft.remove(addedFeature);
539
        }
540
    }
541

    
542
    private void addDependentFeatures(List<Feature> featuresLeft, Feature baseFeature,
543
            Set<Feature> featuresAdded, List<State> listOfStates) {
544

    
545
        if(notEmpty(featureDependencies.get(baseFeature))){
546
            Set<Feature> newFeatureCandidates = new HashSet<>(featureDependencies.get(baseFeature));
547
            newFeatureCandidates.remove(null);
548
            for (State state : listOfStates) {
549
                //in-applicable
550
                List<Feature> inapplicableFeatures = getApplicableFeatures(baseFeature, state, iAifDependencies);
551
                newFeatureCandidates.removeAll(inapplicableFeatures);
552
                //only-applicable
553
                List<Feature> onlyApplicableFeatures = getApplicableFeatures(baseFeature, state, oAifDependencies);
554
                if (!onlyApplicableFeatures.isEmpty()){
555
                    Iterator<Feature> it = newFeatureCandidates.iterator();
556
                    while (it.hasNext()){
557
                        Feature featureCandidate = it.next();
558
                        if (!onlyApplicableFeatures.contains(featureCandidate)){
559
                            it.remove();
560
                        }
561
                    }
562
                }
563
            }
564
            featuresLeft.addAll(newFeatureCandidates);
565
            featuresAdded.addAll(newFeatureCandidates);
566
        }
567
    }
568

    
569
    private List<Feature> getApplicableFeatures(Feature feature, State state,
570
            Map<FeatureState, Set<Feature>> applicabilityDependencies) {
571
        List<Feature> result = new ArrayList<>();
572
        for (FeatureState featureState : applicabilityDependencies.keySet()){
573
            if(featureState.getFeature().equals(feature) && featureState.getState().equals(state)){
574
                result.addAll(applicabilityDependencies.get(featureState));
575
            }
576
        }
577
        return result;
578
    }
579

    
580
    private boolean notEmpty(Set<?> set) {
581
        return (set != null) && !set.isEmpty();
582
    }
583

    
584
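    // Builds the statement label by joining the given states with the separator,
    // e.g. the states [blue, red] yield the label "blue or red".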
    private String createStatement(List<State> listOfStates, int numberOfStates) {
585
        StringBuilder statementLabel = new StringBuilder();
586
        for (State state : listOfStates) {
587
            statementLabel.append(state.getLabel());
588
            if (listOfStates.lastIndexOf(state)!=numberOfStates){
589
                statementLabel.append(separator); // append a separator after each state except for the last one
590
            }
591
        }
592
        return statementLabel.toString();
593
    }
594

    
595
    private Map<Set<KeyTaxon>, List<State>> renewTaxonStatesMap(Map<Set<KeyTaxon>, List<State>> taxonStatesMap) {
596
        Map<Set<KeyTaxon>, List<State>> result = new HashMap<>();
597
        for (Map.Entry<Set<KeyTaxon>, List<State>> entry : taxonStatesMap.entrySet()){
598
            result.put(entry.getKey(), entry.getValue());
599
        }
600
        return result;
601
    }
602

    
603
    private List<Set<KeyTaxon>> sortKeys(Map<Set<KeyTaxon>, List<State>> taxonStatesMap) {
604
        //for now this is a dummy sorting
605
        List<Map.Entry<Set<KeyTaxon>, List<State>>> sortedEntries = new ArrayList<>();
606
        sortedEntries.addAll(taxonStatesMap.entrySet());
607

    
608
        sortedEntries.sort(entryComparator);
609
        List<Set<KeyTaxon>> result = new ArrayList<>();
610
        for (Map.Entry<Set<KeyTaxon>, List<State>> entry : sortedEntries){
611
            result.add(entry.getKey());
612
        }
613
        return result;
614
    }
615

    
616
    //TODO use a general term comparator here
617
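    // Orders states by title cache first and by uuid second to get a stable, reproducible ordering.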
    private static final Comparator<State> stateComparator = (a,b)-> {
618

    
619
        //TODO use real Term Comparator
620
        if (!a.getTitleCache().equals(b.getTitleCache())){
621
            return a.getTitleCache().compareTo(b.getTitleCache());
622
        }
623
        if (!a.getUuid().equals(b.getUuid())){
624
            return a.getUuid().compareTo(b.getUuid());
625
        }
626
        return 0;
627
    };
628

    
629
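    // Orders branches by number of taxa covered (descending), then by number of states (descending),
    // then by the states themselves using the state comparator above.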
    private static final Comparator<? super Entry<Set<KeyTaxon>, List<State>>> entryComparator =  (a,b)-> {
630
        if (a.getKey().size()!=b.getKey().size()){
631
            //order by number of taxa covered
632
            return b.getKey().size() - a.getKey().size();
633
        }else if (a.getValue().size()!= b.getValue().size()){
634
            //order by number of states covered
635
            return b.getValue().size() - a.getValue().size();
636
        }else{
637
            //order states alphabetically or by uuid
638
            for (int i = 0; i < a.getValue().size(); i++){
639
                State stateA = a.getValue().get(i);
640
                State stateB = b.getValue().get(i);
641
                int result = stateComparator.compare(stateA, stateB);
642
                if (result != 0){
643
                    return result;
644
                }
645
            }
646
            //TODO compare keys (sets of KeyTaxon)
647
//            for (int i = 0; i < a.getKey().size(); i++){
648
//                Object stateA = a.getKey().getUuid;
649
//                State stateB = a.getKey().get(i);
650
//                //TODO use real Term Comparator
651
//                if (stateA.getUuid()!= stateB.getUuid()){
652
//                    return stateA.getUuid().compareTo(stateB.getUuid());
653
//                }
654
//            }
655
            throw new RuntimeException("Compare for same state lists with different unit descriptions not yet implemented");
656
        }
657
    };
658

    
659
    private Set<State> getAllStates(Feature feature, Set<KeyTaxon> taxaCovered, Set<State> filter) {
660
        //TODO handle modifier
661
        Set<State> states = new HashSet<>();
662
        for (KeyTaxon taxon : taxaCovered){
663
            Set<CategoricalData> cdSet = taxon.getCategoricalData(feature);
664
            for (CategoricalData cd : cdSet){
665
                List<StateData> stateDatas = cd.getStateData();
666
                for (StateData sd : stateDatas){
667
                    State state = sd.getState();
668
                    if (filter != null && !filter.contains(state)) {
669
                        continue;
670
                    }
671
                    states.add(state);
672
                }
673
            }
674
        }
675
        return states;
676
    }
677

    
678
    private void handleQuantitativeData(PolytomousKeyNode parent, List<Feature> featuresLeft,
679
            Set<KeyTaxon> taxaCovered, Map<Feature, BigDecimal> quantitativeFeaturesThresholds,
680
            Feature winnerFeature, Map<Feature, Set<State>> featureStatesFilter) {
681

    
682
        // first, get the threshold
683
        BigDecimal threshold = quantitativeFeaturesThresholds.get(winnerFeature);
684
        //TODO unit does not seem to be used yet
685
        StringBuilder unit = new StringBuilder();
686
        // then determine which taxa are before and which are after this threshold (dichotomy)
687
        //in order to create the children of the parent node
688
        List<Set<KeyTaxon>> quantitativeStates = determineQuantitativeStates(threshold, winnerFeature, taxaCovered, unit);
689
        // thus the list contains two sets of KeyTaxon, the first corresponding to
690
        // those before, the second to those after the threshold
691
        for (int i=0; i<2; i++) {
692
            handleQuantitativeBranch(parent, featuresLeft, taxaCovered.size(), winnerFeature, threshold, unit,
693
                    quantitativeStates, featureStatesFilter, i);
694
        }
695

    
696
        return;
697
    }
698

    
699
    /**
700
     * Creates the branch for a quantitative feature.
701
     * TODO if the quantitative feature has dependent features they are not yet handled
702
     */
703
    private void handleQuantitativeBranch(PolytomousKeyNode parent, List<Feature> featuresLeft,
704
            int parentTaxaCoveredSize, Feature winnerFeature, BigDecimal threshold, StringBuilder unit,
705
            List<Set<KeyTaxon>> quantitativeStates, Map<Feature, Set<State>> featureStatesFilter,
706
            int branchNum) {

        String sign;
        Set<KeyTaxon> newTaxaCovered = quantitativeStates.get(branchNum);
        if (branchNum==0){
711
        	sign = before; // the first element of the list corresponds to taxa before the threshold
712
        } else {
713
        	sign = after; // the second to those after
714
        }
715
        if (newTaxaCovered.size()>0){
716
        	PolytomousKeyNode childNode = PolytomousKeyNode.NewInstance();
717
        	parent.setFeature(winnerFeature);
718
        	KeyStatement statement = KeyStatement.NewInstance( " " + sign + " " + threshold + unit);
719
        	childNode.setStatement(statement);
720
        	parent.addChild(childNode);
721
        	//TODO don't we need to check dependent features, they are not very likely for quantitative features, but still might exist as exception ...
722
        	boolean taxaAreDiscriminatedInThisStep = newTaxaCovered.size() < parentTaxaCoveredSize;
723
        	boolean childrenExist = taxaAreDiscriminatedInThisStep && (newTaxaCovered.size() > 1);
724
        	if (childrenExist){
725
        	    buildBranches(childNode, featuresLeft, newTaxaCovered, featureStatesFilter);
726
        	}else{
727
        	    handleLeaf(childNode, newTaxaCovered);
728
        	}
729
        }else{
730
            //TODO do we need to check the 0 case, can this happen at all, shouldn't we throw a warning instead?
731
            throw new RuntimeException("No taxa left on branch. This should probably not happen.");
732
        }
733
        return;
734
    }
735

    
736
    private Feature computeScores(List<Feature> featuresLeft, Set<KeyTaxon> taxaCovered,
737
            Map<Feature,BigDecimal> quantitativeFeaturesThresholds, Map<Feature, Set<State>> featureStatesFilter) {
738

    
739
        Map<Feature,Float> scoreMap = featureScores(featuresLeft, taxaCovered, quantitativeFeaturesThresholds, featureStatesFilter);
740
        dependenciesScores(scoreMap, featuresLeft, taxaCovered, quantitativeFeaturesThresholds, featureStatesFilter);
741
        // the feature with the best score becomes the one corresponding to the current node
742
        Feature winnerFeature = lessStatesWinner(scoreMap, taxaCovered);
743
        // the feature is removed from the list of features available to build the next level of the tree
744
        featuresLeft.remove(winnerFeature);
745
        if (config.isDebug()){System.out.println("   ScoreMap: " + scoreMap);}
746
        if (config.isDebug()){System.out.println("   quantitativeThreshold: " + quantitativeFeaturesThresholds);}
747
        if (config.isDebug()){System.out.println("   winner: " + winnerFeature);}
748
        return winnerFeature;
749
    }
750

    
751
	/**
752
	 * Dependencies can be seen as a hierarchy.
	 * If dependencies are used, this function calculates the scores of all child features and gives the highest
	 * of these scores to the parent feature (only, of course, if it is higher than the parent's own score). This way,
	 * if a feature has a good score but is "onlyApplicableIf" or "InapplicableIf", the feature it depends on can be chosen in order to build a better key.
756
	 */
757
	private void dependenciesScores(Map<Feature,Float> scoreMap, List<Feature> featuresLeft,
758
	        Set<KeyTaxon> coveredTaxa, Map<Feature,BigDecimal> quantitativeFeaturesThresholds, Map<Feature, Set<State>> featureStatesFilter){
759

    
760
	    //TODO maybe we need to do this recursively?
761

    
762
	    //old: but I don't understand why we should include the existing featureList, this is only necessary
763
	    //if the single cores depend on the set of features
764
	    //List<Feature> pseudoFeaturesLeft = new ArrayList<>(featuresLeft);  //copies the list of features
765

    
766
	    // then adds all the features which depend on features contained in the list
767
		List<Feature> pseudoFeatures = new ArrayList<>();
768
		for (Feature feature : featuresLeft){
769
			if (featureDependencies.containsKey(feature)){
770
				for (Feature dependendFeature : featureDependencies.get(feature)){
771
				    if (!pseudoFeatures.contains(dependendFeature)){
772
				        pseudoFeatures.add(dependendFeature);
773
				    }
774
				}
775
			}
776
		}
777

    
778
		if (!pseudoFeatures.isEmpty()){
779
    		// then calculates the scores of all features that have been added
780
    		Map<Feature,Float> newScoreMap = featureScores(pseudoFeatures, coveredTaxa, quantitativeFeaturesThresholds, featureStatesFilter);
781
    		for (Feature parentFeature : featureDependencies.keySet()){
782
    			if (scoreMap.containsKey(parentFeature)){
783
    				for (Feature dependendFeature : featureDependencies.get(parentFeature)){
784
    					if (newScoreMap.containsKey(dependendFeature)) {
785
    						// if a feature has a better score than the "parent" feature it depends on
786
    						if (newScoreMap.get(dependendFeature) > scoreMap.get(parentFeature)){
787
    							// the "parent" feature gets its score in the original score map
788
    							scoreMap.put(parentFeature, newScoreMap.get(dependendFeature));
789
    						}
790
    					}
791
    				}
792
    			}
793
    		}
794
		}
795
	}
796

    
797

    
798
	/**
799
	 * This function merges branches of the key belonging to the same clique
800
	 * (http://en.wikipedia.org/wiki/Clique_%28graph_theory%29)
801
	 *
802
	 * @param clique the list of states linked together (i.e. if merged they have the same score)
	 * @param taxonStatesMap the map between the taxa (keys) and the states (values) leading to them
	 * @param reuseWinner gets an entry (<code>true</code>) for the merged branch if not all taxa
	 *        covered by the new branch include all states of the clique, i.e. the winner feature needs to be reused
	 * @param filter
808
	 */
809
	private void mergeBranches(List<State> clique, Map<Set<KeyTaxon>, List<State>> taxonStatesMap,
810
	        Map<Set<KeyTaxon>, Boolean> reuseWinner, Set<State> filter){
811

    
812
	    boolean isExact = true;
813
	    if (clique.size()<=1){
814
	        return;
815
	    }
816
	    Map.Entry<Set<KeyTaxon>,List<State>> firstBranch = null;
817
	    List<Set<KeyTaxon>> tdToDelete = new ArrayList<>();
818
	    //TODO is the stateFilter needed here somehow?
819
	    for (Map.Entry<Set<KeyTaxon>, List<State>> branch : taxonStatesMap.entrySet()){
820
		    boolean stateFound = false;
821
			// looks for one state of the clique in this branch
822
			for(State state : clique){
823
			    if (branch.getValue().contains(state)) {
824
                    stateFound = true;
825
                    break;
826
                }
827
			}
828
			// if one state is found...
829
			if (stateFound){
830
				// ...for the first time, the branch becomes the one to which the others will be merged
831
				if (firstBranch == null){
832
					firstBranch = branch;
833
				}
834
				// ... else the branch is merged to the first one.
835
				else {
836
					int oldSize = firstBranch.getKey().size();
837
				    firstBranch.getKey().addAll(branch.getKey());
838
				    int newSize = firstBranch.getKey().size();
839
                    if (oldSize != newSize || newSize != branch.getKey().size()){
840
                        isExact = false;
841
                    }
842
				    firstBranch.getValue().addAll(branch.getValue());
843
					tdToDelete.add(branch.getKey());
844
				}
845
			}
846
		}
847
		// once this is done, the branches merged to the first one are deleted
848
		for (Set<KeyTaxon> td : tdToDelete){
849
			taxonStatesMap.remove(td);
850
		}
851
		if (!isExact && firstBranch != null){
852
		    reuseWinner.put(firstBranch.getKey(), Boolean.TRUE);
853
		}
854
	}
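
	// Illustrative example: if the states "round" and "oval" form a clique (merging their branches
	// does not decrease the feature's discriminative power), their two branches are combined into a
	// single branch whose statement will later read "oval or round".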
855

    
856
	/**
857
	 * This function looks for the largest clique of States
858
	 * (http://en.wikipedia.org/wiki/Clique_%28graph_theory%29)
859
	 * from the map of exclusions.
860
	 *
861
	 * @param exclusions maps a state (key) to the set of states (value) which can not be merged with it without decreasing its score
	 * @return the clique found, i.e. a list of states that do not exclude each other
863
	 */
864
	private List<State> returnBestClique (Map<State,Set<State>> exclusions){
865
		int bestNumberOfExclusions=-1;
866
		int numberOfExclusions;
867
		List<State> clique = new ArrayList<>();
868

    
869
		// looks for the largest clique, i.e. the state with the fewest exclusions
870
		State bestState=null;
871
		for (Map.Entry<State,Set<State>> pair :  exclusions.entrySet()){
872
			numberOfExclusions = pair.getValue().size();
873
			if ((bestNumberOfExclusions == -1) || numberOfExclusions < bestNumberOfExclusions) {
874
				bestNumberOfExclusions = numberOfExclusions;
875
				bestState = pair.getKey();
876
			}
877
		}
878

    
879
		clique.add(bestState);
880
		exclusions.remove(bestState);
881

    
882
		boolean isNotExcluded;
883
		// then starts building the clique by adding the states which do not exclude each other
884
		for (Map.Entry<State,Set<State>> pair : exclusions.entrySet()){
885
			isNotExcluded = true;
886
			for (State state : clique) {
887
				if (pair.getValue().contains(state)) {
888
                    isNotExcluded = false;
889
                }
890
			}
891
			if (isNotExcluded){
892
				clique.add(pair.getKey());
893
			}
894
		}
895
		for (State state : clique) {
896
			exclusions.remove(state);
897
		}
898
		return clique;
899
	}
900

    
901

    
902
	/**
903
	 * Builds a map of the sets of taxa (key) presenting the different states (value) for the given feature.
	 *
	 * @param states the states to consider for the given feature
	 * @param feature the feature corresponding to the CategoricalData
	 * @param taxaCovered the set of taxa considered
	 * @param filter the state filter for the given feature, may be <code>null</code>
	 * @return a map from each distinct set of taxa to the list of states leading to it
911
	 */
912
	private Map<Set<KeyTaxon>,List<State>> determineCategoricalStates(
913
	        Set<State> states, Feature feature, Set<KeyTaxon> taxaCovered, Set<State> filter){
914

    
915
	    Map<Set<KeyTaxon>, List<State>> childrenStatesMap = new HashMap<>();
916
	    //TODO needed
917
	    List<State> statesDone = new ArrayList<>(); // the list of states already used
918

    
919
		for (State state : states){ // for each state
920
			if (filter != null && !filter.contains(state)){
921
			    continue;
922
			}
923
		    statesDone.add(state);
924
			Set<KeyTaxon> newTaxaCovered = taxaByFeatureState(feature, state, taxaCovered); //gets which taxa present this state
925
			List<State> statesOfTaxa = childrenStatesMap.get(newTaxaCovered);
926
			if (statesOfTaxa == null) { // if no states are associated to these taxa, create a new list
927
				statesOfTaxa = new ArrayList<>();
928
				childrenStatesMap.put(newTaxaCovered, statesOfTaxa);
929
			}
930
			statesOfTaxa.add(state); // then add the state to the list
931
		}
932
		return childrenStatesMap;
933
	}
934

    
935

    
936
	/**
937
	 * Returns the set of taxa, from the previously covered taxa, which have the state featureState for the given feature.
938
	 */
939
	private Set<KeyTaxon> taxaByFeatureState(Feature feature, State featureState, Set<KeyTaxon> taxaCovered){
940
		Set<KeyTaxon> newCoveredTaxa = new HashSet<>();
941
		for (KeyTaxon td : taxaCovered){
942
			for (CategoricalData cd : td.getCategoricalData(feature)){
943
			    List<StateData> stateDatas = cd.getStateData();
944
                for (StateData sd : stateDatas) {
945
                    if (sd.getState().equals(featureState)){
946
                        newCoveredTaxa.add(td);
947
                    }
948
                }
949
			}
950
		}
951
		return newCoveredTaxa;
952
	}
953

    
954
	/**
955
	 * This function returns the feature with the highest score. However, if several features have the same score
956
	 * the one which leads to fewer options is chosen (this way, the key is easier to read).
957
	 */
958
	private Feature lessStatesWinner(Map<Feature,Float> scores, Set<KeyTaxon> taxaCovered){
959
		int nTaxa = taxaCovered.size();
960
		if (nTaxa==1) {
961
            return null;
962
        }
963
		float meanScore = defaultMeanScore(nTaxa);
964
		float bestScore = nTaxa*nTaxa;
965
		List<Feature> bestFeatures = new ArrayList<>(); // if ever different features have the best score, they are put in this list
966
		Feature bestFeature = null;
967
		float newScore;
968
		for (Map.Entry<Feature,Float> pair : scores.entrySet()){
969
			if (pair.getValue()!=null){
970
				newScore = Math.abs(pair.getValue()-meanScore);// the best score is the closest to the score (meanScore here)
971
				// a feature would have if it divided the taxa in two equal parts
972
				if (newScore < bestScore){
973
					bestFeatures.clear();
974
					bestFeatures.add(pair.getKey());
975
					bestScore = newScore;
976
				}else if (newScore==bestScore){
977
					bestFeatures.add(pair.getKey());
978
				}
979
			}
980
		}
981
		if (bestFeatures.size()==1) { // if there is only one feature with the best score, pick it
982
			return bestFeatures.get(0);
983
		}
984
		else { // else choose the one with less states
985
			int lessStates = -1;
986
			int numberOfDifferentStates=-1;
987
			for (Feature feature : bestFeatures){
988
				if (feature.isSupportsCategoricalData()){
989
					Set<State> differentStates = new HashSet<>();
990
					for (KeyTaxon taxon : taxaCovered){
991
						Set<CategoricalData> cds = taxon.getCategoricalData(feature);
992
						Set<StateData> allStateData = getStateData(cds);
993
						for (StateData sd : allStateData) {
994
							differentStates.add(sd.getState());
995
						}
996
					}
997
					numberOfDifferentStates=differentStates.size();
998
				}else if (feature.isSupportsQuantitativeData()){
999
					numberOfDifferentStates=2;
1000
				}
1001
				if (lessStates == -1 || numberOfDifferentStates < lessStates){
1002
					lessStates = numberOfDifferentStates;
1003
					bestFeature = feature;
1004
				}
1005
			}
1006
			return bestFeature;
1007
		}
1008
	}
1009

    
1010
	/**
1011
	 * This function calculates the mean score, i.e. the score a feature dividing the taxa in two equal parts
1012
	 * would have.
1013
	 */
1014
	private float defaultMeanScore(int nTaxa){
1015
		int i;
1016
		float score=0;
1017
		for (i=1;i<nTaxa;i++){
1018
			score = score + (i+1)/2;  //integer division; note: the former Math.round(i+1/2) evaluated to i because 1/2 is 0
1019
		}
1020
		return score;
1021
	}
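
	// Worked example (assuming the integer-division form above): for nTaxa = 4 the loop adds
	// 1 + 1 + 2 = 4, i.e. 4*4/4 = the number of taxon pairs separated by an even 2/2 split.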
1022

    
1023
	/**
1024
	 * This function fills a map of features (keys) with their respective scores (values).
1025
	 */
1026
	private Map<Feature,Float> featureScores(List<Feature> featuresLeft, Set<KeyTaxon> coveredTaxa,
1027
	        Map<Feature,BigDecimal> quantitativeFeaturesThresholds, Map<Feature, Set<State>> featureStatesFilter){
1028

    
1029
		Map<Feature,Float> scoreMap = new HashMap<>();
1030
		for (Feature feature : featuresLeft){
1031
			if (feature.isSupportsCategoricalData()) {
1032
				scoreMap.put(feature, categoricalFeatureScore(feature, coveredTaxa, featureStatesFilter.get(feature)));
1033
			}
1034
			if (feature.isSupportsQuantitativeData()){
1035
				scoreMap.put(feature, quantitativeFeatureScore(feature, coveredTaxa, quantitativeFeaturesThresholds));
1036
			}
1037
		}
1038
		return scoreMap;
1039
	}
1040

    
1041
	/**
1042
	 * Since Quantitative features do not have states, unlike Categorical ones, this function determines which taxa,
1043
	 * for a given quantitative feature, present either a lower or higher value than a given threshold.
1044
	 * It returns two Sets of {@link KeyTaxon}, one with the taxa under this threshold (taxaBefore) and another one
1045
	 * with the taxa over (taxaAfter).
1046
	 */
1047
	private List<Set<KeyTaxon>> determineQuantitativeStates (BigDecimal threshold, Feature feature,
1048
	        Set<KeyTaxon> taxa, StringBuilder unit){
1049

    
1050
	    List<Set<KeyTaxon>> list = new ArrayList<>();
1051
		Set<KeyTaxon> taxaBefore = new HashSet<>();
1052
		Set<KeyTaxon> taxaAfter = new HashSet<>();
1053
		list.add(taxaBefore);
1054
		list.add(taxaAfter);
1055
		for (KeyTaxon td : taxa){
1056
		    for (QuantitativeData qd : td.getQuantitativeData(feature)){
1057
		        if (unit.toString().equals("") && qd.getUnit()!=null && qd.getUnit().getLabel()!=null){
1058
                    unit.append(" " + qd.getUnit().getLabel());
1059
                }
1060
                Set<StatisticalMeasurementValue> values = qd.getStatisticalValues();
1061
                for (StatisticalMeasurementValue smv : values){
1062
                    StatisticalMeasure type = smv.getType();
1063
                    //TODO DONT FORGET sample size, MEAN etc
1064
                    if (type.isMax() || type.isTypicalUpperBoundary() || type.isAverage() || type.isExactValue()) {
1065
                        if (smv.getValue().compareTo(threshold) > 0){
1066
                            taxaAfter.add(td);
1067
                        }
1068
                    }
1069
                    if (type.isMin() || type.isTypicalLowerBoundary() || type.isAverage() || type.isExactValue()) {
1070
                        if (smv.getValue().compareTo(threshold) <= 0){
1071
                            taxaBefore.add(td);
1072
                        }
1073
                    }
1074
                }
1075
		    }
1076
		}
1077
		return list;
1078
	}
1079

    
1080
	/**
1081
	 * This function returns the score of a quantitative feature.
1082
	 */
1083
	private float quantitativeFeatureScore(Feature feature, Set<KeyTaxon> coveredTaxa,
1084
	        Map<Feature,BigDecimal> quantitativeFeaturesThresholds){
1085

    
1086
	    List<BigDecimal> allValues = new ArrayList<>();
1087
		for (KeyTaxon td : coveredTaxa){
1088
		    for (QuantitativeData qd : td.getQuantitativeData(feature)){
1089
		        computeAllValues(allValues, qd);
1090
		    }
1091
		}
1092
		int i,j;
1093
		BigDecimal threshold = BigDecimal.ZERO;
1094
		BigDecimal bestThreshold = BigDecimal.ZERO;
1095
		int difference = allValues.size();
1096
		int differenceMin = difference;
1097
		int bestTaxaBefore = 0;
1098
		int bestTaxaAfter = 0;
1099
		for (i=0; i<allValues.size()/2; i++) {
1100
			threshold = allValues.get(i*2+1);
1101
			int taxaBefore=0;
1102
			int taxaAfter=0;
1103
			for (j=0;j<allValues.size()/2;j++) {
1104
				if (allValues.get(j*2+1).compareTo(threshold) <= 0){
1105
					taxaBefore++;
1106
				}
1107
				if (allValues.get(j*2).compareTo(threshold) > 0){
1108
					taxaAfter++;
1109
				}
1110
			}
1111
			difference = Math.abs(taxaBefore-taxaAfter);
1112
			if (difference < differenceMin){
1113
				differenceMin = difference;
1114
				bestThreshold = threshold;
1115
				bestTaxaBefore = taxaBefore;
1116
				bestTaxaAfter = taxaAfter;
1117
			}
1118
		}
1119
		quantitativeFeaturesThresholds.put(feature, bestThreshold);
1120
		int defaultQuantitativeScore=0;
1121
		for (i=0; i<bestTaxaBefore; i++) {
1122
			defaultQuantitativeScore += bestTaxaAfter;
1123
		}
1124
		return defaultQuantitativeScore;
1125
	}
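
	// Worked example (illustrative): for three taxa with value ranges [1,3], [2,5] and [6,9] the
	// candidate thresholds are the upper bounds 3, 5 and 9. Threshold 3 gives 1 taxon entirely at or
	// below it and 1 taxon entirely above it (difference 0), so it is chosen and the score is 1*1 = 1.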
1126

    
1127
    private void computeAllValues(List<BigDecimal> allValues, QuantitativeData qd) {
        Set<StatisticalMeasurementValue> values = qd.getStatisticalValues();
        BigDecimal lowerboundary = null;
        BigDecimal upperboundary = null;
        for (StatisticalMeasurementValue smv : values){
        	StatisticalMeasure type = smv.getType();
        	BigDecimal value = smv.getValue();
        	// TODO don't forget sample size, mean, etc.
        	if(type.isMin() || type.isTypicalLowerBoundary() || type.isAverage() || type.isExactValue()){
        	    if (lowerboundary == null){
        	        lowerboundary = value;
        	    }else{
        	        lowerboundary = lowerboundary.min(value);
        	    }
        	}
        	if(type.isMax() || type.isTypicalUpperBoundary() || type.isAverage() || type.isExactValue()){
                if (upperboundary == null){
                    upperboundary = value;
                }else{
                    upperboundary = upperboundary.max(value);
                }
            }

//        	if (type.isMax()) {
//        		upperboundary = smv.getValue();
//        		upperboundarypresent=true;
//        	}else if (type.equals(StatisticalMeasure.MIN())) {
//        		lowerboundary = smv.getValue();
//        		lowerboundarypresent=true;
//        	}else if (type.equals(StatisticalMeasure.TYPICAL_UPPER_BOUNDARY()) && upperboundarypresent==false) {
//        		upperboundary = smv.getValue();
//        		upperboundarypresent=true;
//        	}else if (type.equals(StatisticalMeasure.TYPICAL_LOWER_BOUNDARY()) && lowerboundarypresent==false) {
//        		lowerboundary = smv.getValue();
//        		lowerboundarypresent=true;
//        	}else if (type.equals(StatisticalMeasure.AVERAGE()) && upperboundarypresent==false && lowerboundarypresent==false) {
//        		lowerboundary = smv.getValue();
//        		upperboundary = lowerboundary;
//        		lowerboundarypresent=true;
//        		upperboundarypresent=true;
//        	}
        }
        if (lowerboundary != null && upperboundary != null) {
        	allValues.add(lowerboundary);
        	allValues.add(upperboundary);
        }else if(lowerboundary != null || upperboundary != null){
            logger.warn("Only one of upper or lower boundary is defined. Statistical measurement value not used.");
        }
    }

	/**
	 * This function returns the score of a categorical feature
	 * by comparing each pair of taxa. For each pair the score is
	 * increased by the feature's discriminative power for that pair.
	 */
	private float categoricalFeatureScore(Feature feature, Set<KeyTaxon> coveredTaxa, Set<State> filter){
		int i,j;
		float score =0;
		float power=0;
		KeyTaxon[] coveredTaxaArray = coveredTaxa.toArray(new KeyTaxon[coveredTaxa.size()]); // did not find a better way to do this
		for (i=0 ; i<coveredTaxaArray.length; i++){
			Set<CategoricalData> cd1 = coveredTaxaArray[i].getCategoricalData(feature);
			for (j=i+1 ; j< coveredTaxaArray.length ; j++){
			    Set<CategoricalData> cd2 = coveredTaxaArray[j].getCategoricalData(feature);
				power = defaultCategoricalPower(cd1, cd2, filter);
				score = score + power;
			}
		}
		return score;
	}

	/**
	 * This recursive function fills the maps of dependencies by reading the tree containing the dependencies.
	 */
	private void createDependencies(TermNode<Feature> node){

	    //the featureDependencies handling was originally in defaultCategoricalPower(cat, cat)
	    //needs to be checked if it is OK to handle them here
        if (node.isDependent()){
            Feature parentFeature = node.getParent().getTerm();
            if (!featureDependencies.containsKey(parentFeature)){
                featureDependencies.put(parentFeature, new HashSet<>());
            }
            for (FeatureState featureState : node.getOnlyApplicableIf()){
                if (!oAifDependencies.containsKey(featureState)) {
                    oAifDependencies.put(featureState, new HashSet<>());
                }
                oAifDependencies.get(featureState).add(node.getTerm());
                //TODO: we only guess that it is the state of the parent feature here
                //needs to be improved
                featureDependencies.get(node.getParent().getTerm()).add(node.getTerm());
            }
            for (FeatureState featureState : node.getInapplicableIf()){
                if (!iAifDependencies.containsKey(featureState)) {
                    iAifDependencies.put(featureState, new HashSet<>());
                }
                iAifDependencies.get(featureState).add(node.getTerm());
                //TODO: we only guess that it is the state of the parent feature here
                //needs to be improved
                featureDependencies.get(node.getParent().getTerm()).add(node.getTerm());
            }
        }

		for (TermNode<Feature> fn : node.getChildNodes()){
			createDependencies(fn);
		}
		logger.debug(featureDependencies);
	}

	/**
	 * This function fills the exclusions map.
	 */
	private float computeExclusions(Feature feature, Set<KeyTaxon> coveredTaxa, Map<State,Set<State>> exclusions, Set<State> filter){
		//unclear what the score is for here
		float score =0;
		float power=0;
		KeyTaxon[] fixedOrderTaxa = coveredTaxa.toArray(new KeyTaxon[coveredTaxa.size()]); // did not find a better way to do this
		for (int i=0 ; i<fixedOrderTaxa.length; i++){
		    Set<CategoricalData> cd1 = fixedOrderTaxa[i].getCategoricalData(feature);

			for (int j=i+1 ; j< fixedOrderTaxa.length ; j++){
				Set<CategoricalData> cd2 = fixedOrderTaxa[j].getCategoricalData(feature);

				power = defaultCategoricalPower(cd1, cd2, filter);
				score = score + power;
				if (power >= 1.0){ // if there is no state in common between cd1 and cd2

					for (StateData statedata1 : getStateData(cd1)){
						State state1 = statedata1.getState();
						if (!exclusions.containsKey(state1)){
							exclusions.put(state1, new HashSet<>());
						}
						for (StateData statedata2 : getStateData(cd2)){
							State state2 = statedata2.getState();
							if (!exclusions.containsKey(state2)){
								exclusions.put(state2, new HashSet<>());
							}
							exclusions.get(state1).add(state2);
							exclusions.get(state2).add(state1);
						}
					}
				}
			}
		}
		return score;
	}

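    /**
     * Returns all state data of the given categorical data items.
     */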
    private Set<StateData> getStateData(Set<CategoricalData> cds) {
        Set<StateData> result = new HashSet<>();
        for (CategoricalData cd : cds){
            result.addAll(cd.getStateData());
        }
        return result;
    }

	/**
	 * Returns the score of a categorical feature for a pair of taxa,
	 * i.e. the fraction of the relevant states that occur in only one
	 * of the two sets of categorical data.
	 */
	private float defaultCategoricalPower(Set<CategoricalData> cd1, Set<CategoricalData> cd2, Set<State> filter){
	    if (cd1 == null || cd2 == null ||cd1.isEmpty() || cd2.isEmpty()){
	        return 0;
	    }

	    //FIXME see defaultCategoricalPower_old for additional code on dependencies
	    //which has been removed here for now but might be important.
	    //It has now been moved to #createDependencies, so the code below may no longer
	    //be needed and therefore be superfluous.
	    //But as the implementation in createDependencies is not fully correct yet,
	    //it is kept here for now.

	    for (CategoricalData cd : cd1){
	        if (!featureDependencies.containsKey(cd.getFeature())){
	            featureDependencies.put(cd.getFeature(), new HashSet<>());
	        }
	        for (State state : getStates(cd, filter)){
	            if (iAifDependencies.get(state)!=null) {
	                featureDependencies.get(cd.getFeature()).addAll(iAifDependencies.get(state));
	            }
	            if (oAifDependencies.get(state)!=null) {
	                featureDependencies.get(cd.getFeature()).addAll(oAifDependencies.get(state));
	            }
	        }
	    }

	    //get all states of both categorical data
        Set<State> states = getStates(cd1, cd2, filter);
        if (states.isEmpty()){
            return 0;
        }

	    int nDiscriminative = 0;
	    for (State state : states){
	        boolean hasState1 = hasState(state, cd1);
	        boolean hasState2 = hasState(state, cd2);
	        //if only one of the two taxa has the state, the state is discriminative
	        if (! (hasState1&&hasState2)) {
	            nDiscriminative++;
            }
	    }
	    float result = (float)nDiscriminative/states.size();
	    return result;
	}

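    /**
     * Returns <code>true</code> if the given state occurs in any of the given categorical data items.
     */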
    private boolean hasState(State state, Set<CategoricalData> cds) {
        boolean result = false;
        for (CategoricalData cd : cds){
            for (StateData stateData:cd.getStateData()){
                result |= state.equals(stateData.getState());
            }
        }
        return result;
    }

    private Set<State> getStates(Set<CategoricalData> cdset1, Set<CategoricalData> cdset2, Set<State> filter) {
        Set<State> result = new HashSet<>();
        result.addAll(getStates(cdset1, filter));
        result.addAll(getStates(cdset2, filter));
        return result;
    }

    private Set<State> getStates(Set<CategoricalData> cdset, Set<State> filter) {
        Set<State> result = new HashSet<>();
        for (CategoricalData cd : cdset){
            result.addAll(getStates(cd, filter));
        }
        return result;
    }

    private Set<State> getStates(CategoricalData cd, Set<State> filter) {
        //TODO handle modifier
        Set<State> result = new HashSet<>();
        List<StateData> states = cd.getStateData();
        for (StateData stateData:states){
            State state = stateData.getState();
            if (filter != null && !filter.contains(state)){
                continue;
            }
            result.add(state);
        }
        return result;
    }

    //TODO keep as long as the handling of featureDependencies is not yet checked or handled in
    //the new method defaultCategoricalPower()
    private float defaultCategoricalPower_old(CategoricalData deb1, CategoricalData deb2){
        List<StateData> states1 = deb1.getStateData();
        List<StateData> states2 = deb2.getStateData();
        boolean bool = false;
        Iterator<StateData> stateData1Iterator = states1.iterator();
//      while (!bool && stateData1Iterator.hasNext()) {
//          Iterator<StateData> stateData2Iterator = states2.iterator() ;
//          StateData stateData1 = stateData1Iterator.next();
//          while (!bool && stateData2Iterator.hasNext()) {
//              bool = stateData1.getState().equals(stateData2Iterator.next().getState()); // checks if the states are the same
//          }
//      }
        // one point each time two taxa can be discriminated for a given feature

        boolean checkFeature = false;

        if (!featureDependencies.containsKey(deb1.getFeature())){
            featureDependencies.put(deb1.getFeature(), new HashSet<>());
            checkFeature = true;
        }

        while (stateData1Iterator.hasNext()) {
            Iterator<StateData> stateData2Iterator = states2.iterator();
            StateData stateData1 = stateData1Iterator.next();
            State state1 = stateData1.getState();
            if (checkFeature){
                if (iAifDependencies.get(state1)!=null) {
                    featureDependencies.get(deb1.getFeature()).addAll(iAifDependencies.get(state1));
                }
                if (oAifDependencies.get(state1)!=null) {
                    featureDependencies.get(deb1.getFeature()).addAll(oAifDependencies.get(state1));
                }
            }
            while (stateData2Iterator.hasNext()) {
                StateData stateData2 = stateData2Iterator.next();
                State state2 = stateData2.getState();
                bool = bool || state1.equals(state2); // checks if the states are the same
            }
        }

        if (bool) {
            return 0;
        } else {
            return 1;
        }
    }
}