Project

General

Profile

« Previous | Next » 

Revision 5f2f7f86

Added by Andreas Müller 11 months ago

ref #10178 some cleanup for name matching (not finished yet)

View differences:

cdmlib-services/src/main/java/eu/etaxonomy/cdm/api/service/NameServiceImpl.java
1311 1311
     */
1312 1312
    //TODO work in progress
1313 1313
    @Override
1314
	public List<DoubleResult<TaxonNameParts, Integer>> findMatchingNames(String taxonName, Integer maxDistanceGenus, Integer maxDisEpith) {
1314
	public List<DoubleResult<TaxonNameParts, Integer>> findMatchingNames(String taxonName,
1315
	        Integer maxDistanceGenus, Integer maxDisEpith) {
1316

  
1317
        if (maxDistanceGenus == null) {
1318
            maxDistanceGenus = 4;
1319
        }
1315 1320

  
1316 1321
    	//0. Normalizing and parsing
1317 1322

  
......
1327 1332
		// phonetic normalization of query (genus)
1328 1333

  
1329 1334
		String tempGenusPhon= NameServiceImplementBelen.replaceInitialCharacter(genusQuery);
1330
		String tempGenus = CdmUtilsBelen.normalized(tempGenusPhon);
1335
		String normalizedGenusQuery = CdmUtilsBelen.normalized(tempGenusPhon);
1331 1336

  
1332 1337

  
1333 1338
		//1. Genus pre-filter
1334 1339

  
1335
		// set a list with all names in DB starting with the first character of query
1336

  
1337
		String initial= tempGenus.substring(0,1).toUpperCase() + "*";
1338
		List<String> tempGenusListNormal = dao.distinctGenusOrUninomial(initial, null, null);
1339

  
1340
		//set a list with all genera in the database starting with the initial letter of the PHONETIC TRANSFORMATION query
1341
	    String initialPho = tempGenusPhon.substring(0,1).toUpperCase()+"*";
1342
	    List <String> tempGenusListPhon=new ArrayList<>();
1343
	    if (!initial.equals(initialPho)) {
1344
	        tempGenusListPhon = dao.distinctGenusOrUninomial(initialPho, null, null);
1345
	    }
1346

  
1347
	    //add genera that have a phonetic match
1348
	    List<String> genusList= new ArrayList <>();
1349
		genusList.addAll(tempGenusListPhon);
1340
		List<String> preFilteredGenusList = nameMatchingGenusPrefilter(genusQuery, tempGenusPhon, normalizedGenusQuery);
1350 1341

  
1351
		// see word file Step 1. Rule 3.
1352

  
1353
		for (String x:tempGenusListNormal) {
1354

  
1355
		    if (Math.abs(x.length()-genusQuery.length())<=2) {
1356

  
1357
		        if(genusQuery.length()<5) {
1358

  
1359
		            if (genusQuery.substring(0,1).equals(x.substring(0,1)) ||
1360
		                    genusQuery.substring((genusQuery.length()-1),genusQuery.length()).equals(x.substring((x.length()-1),x.length()))) {
1361
		                genusList.add(x);
1362
		            }
1363
		        } else if (genusQuery.length()==5) {
1364
		            if (genusQuery.substring(0,2).equals(x.substring(0,2)) ||
1365
		                    genusQuery.substring((genusQuery.length()-3),genusQuery.length()).equals(x.substring((x.length()-3),x.length()))){
1366
		                genusList.add(x);
1367
		            }
1368
		        } else if (genusQuery.length()>5){
1369
		            if (genusQuery.substring(0,3).equals(x.substring(0,3)) ||
1370
		                    genusQuery.substring((genusQuery.length()-3),genusQuery.length()).equals(x.substring((x.length()-3),x.length()))){
1371
		                genusList.add(x);
1372
		            }
1373
		        }
1374
		    }
1375
		}
1376 1342

  
1377
		//2. comparison of genus
1343
		//create result list
1344
		List<DoubleResult<TaxonNameParts,Integer>> fullTaxonNamePartsList = new ArrayList<>();
1378 1345

  
1379
		if (maxDistanceGenus==null) {
1380
		    maxDistanceGenus=4;
1381
		}
1346
		for (String preFilteredGenus : preFilteredGenusList) {
1382 1347

  
1383
		String queryDocu;
1384
		List<DoubleResult<TaxonNameParts,Integer>> fullTaxonNamePartsList = new ArrayList<>();
1348
		    //2. comparison of genus
1385 1349

  
1386
		for (String genusNameInDB:genusList) {
1387
		    String genusNameInDBNor=NameServiceImplementBelen.replaceInitialCharacter(genusNameInDB);
1388
		    genusNameInDBNor=CdmUtilsBelen.normalized(genusNameInDBNor);
1389
		    if (NameServiceImplementBelen.trimCommonChar(tempGenus, genusNameInDBNor).trim().isEmpty()) {
1390
		        queryDocu="";
1391
		    } else {
1392
		        queryDocu=NameServiceImplementBelen.trimCommonChar(tempGenus, genusNameInDBNor);
1393
		    }
1350
		    String genusNameInitCharReplaced = NameServiceImplementBelen.replaceInitialCharacter(preFilteredGenus);
1351
		    String genusNameInDBNormalized = CdmUtilsBelen.normalized(genusNameInitCharReplaced);
1394 1352

  
1395
		    if (queryDocu=="") {
1396
		        distance = 0;
1397
		    } else {
1398
		        String inputShort= queryDocu.split(" ")[0];
1399
		        String DbShort=queryDocu.split(" ")[1];
1400
		        distance = CdmUtils.modifiedDamerauLevenshteinDistance(inputShort,DbShort);
1401
		    }
1353
		    distance = nameMatchingComputeDistance(normalizedGenusQuery, genusNameInDBNormalized);
1402 1354

  
1403
	    //3. genus post-filter
1404
		    int lengthTemp = genusQuery.length();
1405
		    int lengthDB=genusNameInDB.length();
1406
		    int half=Math.max(lengthTemp,lengthDB)/2;
1355
	        //3. genus post-filter
1407 1356

  
1408
	    //Genera that match in at least 50% are kept. i.e., if genus length = 6(or7) then at least 3 characters must match AND the initial character must match in all cases where ED >1
1409
		    if (distance <=maxDistanceGenus) {
1410
		        List<TaxonNameParts> tempParts1 = dao.findTaxonNameParts(Optional.of(genusNameInDB),null, null, null, null, null, null, null, null);
1411
                for (TaxonNameParts namePart1: tempParts1) {
1412
                    fullTaxonNamePartsList.add(new DoubleResult<TaxonNameParts, Integer>(namePart1, distance));
1413
                }
1414
		    } else if(half<maxDistanceGenus && tempGenus.substring(0,1).equals(genusNameInDBNor.substring(0,1))) {
1415
		        List<TaxonNameParts> tempParts2 = dao.findTaxonNameParts(Optional.of(genusNameInDB),null, null, null, null, null, null, null, null);
1416
                for (TaxonNameParts namePart2: tempParts2) {
1417
                    fullTaxonNamePartsList.add(new DoubleResult<TaxonNameParts, Integer>(namePart2, distance));
1418
                }
1419
		    }
1357
		    nameMatchingPostGenusFilter(maxDistanceGenus, genusQuery, distance, normalizedGenusQuery,
1358
                    fullTaxonNamePartsList, preFilteredGenus, genusNameInDBNormalized);
1420 1359
		}
1421 1360

  
1422 1361
		//if only genus is given
......
1435 1374

  
1436 1375
		} else {
1437 1376

  
1438
		    String tempEpithPhon= NameServiceImplementBelen.replaceInitialCharacter(epithetQuery);
1439
		    String tempEpith= CdmUtilsBelen.normalized(tempEpithPhon);
1440

  
1441
		// 4. epithet pre-filter
1442
		List<DoubleResult<TaxonNameParts,Integer>> fullTaxonNamePartsList2 = new ArrayList<>();
1443

  
1444
		for (DoubleResult<TaxonNameParts, Integer> nameX: fullTaxonNamePartsList) {
1445
		    if (nameX.getFirstResult().getSpecificEpithet().length()-tempEpith.length()<=4) {
1446
		        fullTaxonNamePartsList2.add(nameX);
1447
		        fullTaxonNamePartsList=fullTaxonNamePartsList2;
1448
		    }
1449
		}
1450
		// 5. comparison of epithet
1451
		if (maxDisEpith==null) {
1452
		    maxDisEpith=4;
1377
		    String tempEpithPhon = NameServiceImplementBelen.replaceInitialCharacter(epithetQuery);
1378
		    String tempEpith = CdmUtilsBelen.normalized(tempEpithPhon);
1379

  
1380
    		// 4. epithet pre-filter
1381
    		List<DoubleResult<TaxonNameParts,Integer>> fullTaxonNamePartsList2 = new ArrayList<>();
1382

  
1383
    		for (DoubleResult<TaxonNameParts, Integer> nameX: fullTaxonNamePartsList) {
1384
    		    if (nameX.getFirstResult().getSpecificEpithet().length()-tempEpith.length()<=4) {
1385
    		        fullTaxonNamePartsList2.add(nameX);
1386
    		        fullTaxonNamePartsList=fullTaxonNamePartsList2;
1387
    		    }
1388
    		}
1389

  
1390
    		// 5. comparison of epithet
1391
    		if (maxDisEpith==null) {
1392
    		    maxDisEpith=4;
1393
    		}
1394

  
1395
    		List <DoubleResult<TaxonNameParts, Integer>> epithetList = new ArrayList<>();
1396
    		String queryDocu2;
1397
    		for (DoubleResult<TaxonNameParts, Integer> part: fullTaxonNamePartsList) {
1398

  
1399
    		    String epithetInDB = part.getFirstResult().getSpecificEpithet();
1400
    		    int lengthEpithetInDB=epithetInDB.length();
1401
    		    int lengthEpithetQuery=epithetQuery.length();
1402
    		    int half=Math.max(lengthEpithetInDB,lengthEpithetQuery)/2;
1403

  
1404
    		    String epithetinDBNorm=NameServiceImplementBelen.replaceInitialCharacter(epithetInDB);
1405

  
1406
    		    ///an error occurs here when the database only has a genus without an epithet
1407

  
1408
    		    epithetinDBNorm=CdmUtilsBelen.normalized(epithetinDBNorm);
1409
    		    if (NameServiceImplementBelen.trimCommonChar(tempEpith, epithetinDBNorm).trim().isEmpty()) {
1410
    		        queryDocu2="";
1411
    		    } else {
1412
    		        queryDocu2=NameServiceImplementBelen.trimCommonChar(tempEpith, epithetinDBNorm);
1413
    		    }
1414

  
1415
                if (queryDocu2=="") {
1416
                    epithetDistance=0;
1417
    		    } else {
1418
    		        String inputShort= queryDocu2.split(" ")[0];
1419
                    String DbShort=queryDocu2.split(" ")[1];
1420
                    epithetDistance= CdmUtils.modifiedDamerauLevenshteinDistance(inputShort,DbShort);
1421
    		    }
1422

  
1423
                int totalDist = part.getSecondResult() + epithetDistance;
1424
                part.setSecondResult(totalDist)  ;
1425

  
1426
    		// 6. species post-filter
1427

  
1428
    		    if (totalDist <= maxDisEpith) {
1429
                    epithetList.add(part);
1430
    		    }else if (half<maxDisEpith) {
1431
    		        if ((tempEpith.substring(0,1).equals(epithetInDB.substring(0,1))
1432
    		                && epithetDistance==2||epithetDistance==3)||
1433
    		                (tempEpith.substring(0,3).equals(epithetInDB.substring(0,3))
1434
    		                        && epithetDistance==4)) {
1435
    		            epithetList.add(part);
1436
    		        }
1437
    		    }
1438
    		}
1439

  
1440
    		// 6b Infraspecific comparison (pre-filter, comparison, post-filter)
1441
    		//TODO
1442

  
1443
    		// 7. Result shaping
1444

  
1445

  
1446
    		Collections.sort(epithetList, (o1,o2)->o1.getSecondResult().compareTo(o2.getSecondResult()) );
1447

  
1448
    		List <DoubleResult<TaxonNameParts, Integer>> exactResults = NameServiceImplementBelen.exactResults(epithetList);
1449
    		List <DoubleResult<TaxonNameParts, Integer>> bestResults = NameServiceImplementBelen.bestResults(epithetList);
1450

  
1451
    		if(!exactResults.isEmpty()) {
1452
    		    return exactResults;
1453
    		} else {
1454
    		    return bestResults;
1455
    		}
1453 1456
		}
1457
	}
1454 1458

  
1455
		List <DoubleResult<TaxonNameParts, Integer>> epithetList = new ArrayList<>();
1456
		String queryDocu2;
1457
		for (DoubleResult<TaxonNameParts, Integer> part: fullTaxonNamePartsList) {
1459
    private void nameMatchingPostGenusFilter(Integer maxDistanceGenus, String genusQuery, int distance,
1460
            String normalizedGenusQuery, List<DoubleResult<TaxonNameParts, Integer>> fullTaxonNamePartsList,
1461
            String preFilteredGenus, String genusNameInDBNormalized) {
1458 1462

  
1459
		    String epithetInDB = part.getFirstResult().getSpecificEpithet();
1460
		    int lengthEpithetInDB=epithetInDB.length();
1461
		    int lengthEpithetQuery=epithetQuery.length();
1462
		    int half=Math.max(lengthEpithetInDB,lengthEpithetQuery)/2;
1463
        int genusQueryLength = genusQuery.length();
1464
        int genusDBLength = preFilteredGenus.length();
1465
        int halfLength = Math.max(genusQueryLength, genusDBLength)/2;
1463 1466

  
1464
		    String epithetinDBNorm=NameServiceImplementBelen.replaceInitialCharacter(epithetInDB);
1467
        //Genera that match in at least 50% are kept, i.e., if genus length = 6 (or 7) then at least 3 characters must match AND the initial character must match in all cases where ED > 1
1468
        if (distance <= maxDistanceGenus) {
1469
            List<TaxonNameParts> tempParts1 = dao.findTaxonNameParts(Optional.of(preFilteredGenus), null, null, null, null, null, null, null, null);
1470
            for (TaxonNameParts namePart1: tempParts1) {
1471
                fullTaxonNamePartsList.add(new DoubleResult<TaxonNameParts, Integer>(namePart1, distance));
1472
            }
1473
        } else if(halfLength < maxDistanceGenus && normalizedGenusQuery.substring(0,1).equals(genusNameInDBNormalized.substring(0,1))) {
1474
            List<TaxonNameParts> tempParts2 = dao.findTaxonNameParts(Optional.of(preFilteredGenus),null, null, null, null, null, null, null, null);
1475
            for (TaxonNameParts namePart2: tempParts2) {
1476
                fullTaxonNamePartsList.add(new DoubleResult<TaxonNameParts, Integer>(namePart2, distance));
1477
            }
1478
        }
1479
    }
1465 1480

  
1466
		    ///an error occurs here when the database only has a genus without an epithet
1481
    private int nameMatchingComputeDistance(String tempGenus, String genusNameInDBNormalized) {
1482
        int distance;
1483
        String queryDocu = NameServiceImplementBelen.trimCommonChar(tempGenus, genusNameInDBNormalized);
1467 1484

  
1468
		    epithetinDBNorm=CdmUtilsBelen.normalized(epithetinDBNorm);
1469
		    if (NameServiceImplementBelen.trimCommonChar(tempEpith, epithetinDBNorm).trim().isEmpty()) {
1470
		        queryDocu2="";
1471
		    } else {
1472
		        queryDocu2=NameServiceImplementBelen.trimCommonChar(tempEpith, epithetinDBNorm);
1473
		    }
1485
        if ("".equals(queryDocu)) {
1486
            distance = 0;
1487
        } else {
1488
            String inputShort= queryDocu.split(" ")[0];
1489
            String DbShort=queryDocu.split(" ")[1];
1490
            distance = CdmUtils.modifiedDamerauLevenshteinDistance(inputShort,DbShort);
1491
        }
1492
        return distance;
1493
    }
1474 1494

  
1475
            if (queryDocu2=="") {
1476
                epithetDistance=0;
1477
		    } else {
1478
		        String inputShort= queryDocu2.split(" ")[0];
1479
                String DbShort=queryDocu2.split(" ")[1];
1480
                epithetDistance= CdmUtils.modifiedDamerauLevenshteinDistance(inputShort,DbShort);
1481
		    }
1495
    private List<String> nameMatchingGenusPrefilter(String genusQuery, String tempGenusPhon, String tempGenus) {
1496
        // set a list with all names in DB starting with the first character of query
1497
		String initial= tempGenus.substring(0,1).toUpperCase() + "*";
1498
		List<String> tempGenusListNormal = dao.distinctGenusOrUninomial(initial, null, null);
1482 1499

  
1483
            int totalDist = part.getSecondResult() + epithetDistance;
1484
            part.setSecondResult(totalDist)  ;
1500
		//set a list with all genera in the database starting with the initial letter
1501
		//of the PHONETIC TRANSFORMATION query
1502
	    String initialPho = tempGenusPhon.substring(0,1).toUpperCase() + "*";
1503
	    List <String> tempGenusListPhon = new ArrayList<>();
1504
	    if (!initial.equals(initialPho)) {
1505
	        tempGenusListPhon = dao.distinctGenusOrUninomial(initialPho, null, null);
1506
	    }
1485 1507

  
1486
		// 6. species post-filter
1508
	    //add genera that have a phonetic match
1509
	    List<String> genusList = new ArrayList <>();
1510
		genusList.addAll(tempGenusListPhon);
1487 1511

  
1488
		    if (totalDist <= maxDisEpith) {
1489
                epithetList.add(part);
1490
		    }else if (half<maxDisEpith) {
1491
		        if ((tempEpith.substring(0,1).equals(epithetInDB.substring(0,1))
1492
		                && epithetDistance==2||epithetDistance==3)||
1493
		                (tempEpith.substring(0,3).equals(epithetInDB.substring(0,3))
1494
		                        && epithetDistance==4)) {
1495
		            epithetList.add(part);
1496
		        }
1497
		    }
1498
		}
1512
		// see word file Step 1. Rule 3.
1499 1513

  
1500
		Collections.sort(epithetList, (o1,o2)->o1.getSecondResult().compareTo(o2.getSecondResult()) );
1514
		for (String x:tempGenusListNormal) {
1501 1515

  
1502
		List <DoubleResult<TaxonNameParts, Integer>> exactResults = NameServiceImplementBelen.exactResults(epithetList);
1503
		List <DoubleResult<TaxonNameParts, Integer>> bestResults = NameServiceImplementBelen.bestResults(epithetList);
1516
		    if (Math.abs(x.length()-genusQuery.length()) <= 2) {
1504 1517

  
1505
		if(!exactResults.isEmpty()) {
1506
		    return exactResults;
1507
		} else {
1508
		    return bestResults;
1509
		}
1518
		        if(genusQuery.length()<5) {
1519

  
1520
		            if (genusQuery.substring(0,1).equals(x.substring(0,1)) ||
1521
		                    genusQuery.substring((genusQuery.length()-1),genusQuery.length()).equals(x.substring((x.length()-1),x.length()))) {
1522
		                genusList.add(x);
1523
		            }
1524
		        } else if (genusQuery.length()==5) {
1525
		            if (genusQuery.substring(0,2).equals(x.substring(0,2)) ||
1526
		                    genusQuery.substring((genusQuery.length()-3),genusQuery.length()).equals(x.substring((x.length()-3),x.length()))){
1527
		                genusList.add(x);
1528
		            }
1529
		        } else if (genusQuery.length()>5){
1530
		            if (genusQuery.substring(0,3).equals(x.substring(0,3)) ||
1531
		                    genusQuery.substring((genusQuery.length()-3),genusQuery.length()).equals(x.substring((x.length()-3),x.length()))){
1532
		                genusList.add(x);
1533
		            }
1534
		        }
1535
		    }
1510 1536
		}
1511
	}
1537
        return genusList;
1538
    }
1512 1539
}

Also available in: Unified diff