Revision 5f2f7f86
Added by Andreas Müller 11 months ago
cdmlib-services/src/main/java/eu/etaxonomy/cdm/api/service/NameServiceImpl.java | ||
---|---|---|
1311 | 1311 |
*/ |
1312 | 1312 |
//TODO work in progress |
1313 | 1313 |
@Override |
1314 |
public List<DoubleResult<TaxonNameParts, Integer>> findMatchingNames(String taxonName, Integer maxDistanceGenus, Integer maxDisEpith) { |
|
1314 |
public List<DoubleResult<TaxonNameParts, Integer>> findMatchingNames(String taxonName, |
|
1315 |
Integer maxDistanceGenus, Integer maxDisEpith) { |
|
1316 |
|
|
1317 |
if (maxDistanceGenus == null) { |
|
1318 |
maxDistanceGenus = 4; |
|
1319 |
} |
|
1315 | 1320 |
|
1316 | 1321 |
//0. Normalizing and parsing |
1317 | 1322 |
|
... | ... | |
1327 | 1332 |
// phonetic normalization of query (genus) |
1328 | 1333 |
|
1329 | 1334 |
String tempGenusPhon= NameServiceImplementBelen.replaceInitialCharacter(genusQuery); |
1330 |
String tempGenus = CdmUtilsBelen.normalized(tempGenusPhon);
|
|
1335 |
String normalizedGenusQuery = CdmUtilsBelen.normalized(tempGenusPhon);
|
|
1331 | 1336 |
|
1332 | 1337 |
|
1333 | 1338 |
//1. Genus pre-filter |
1334 | 1339 |
|
1335 |
// build a list of all names in the DB that start with the first character of the query |
|
1336 |
|
|
1337 |
String initial= tempGenus.substring(0,1).toUpperCase() + "*"; |
|
1338 |
List<String> tempGenusListNormal = dao.distinctGenusOrUninomial(initial, null, null); |
|
1339 |
|
|
1340 |
//build a list of all genera in the database that start with the initial letter of the PHONETICALLY TRANSFORMED query |
|
1341 |
String initialPho = tempGenusPhon.substring(0,1).toUpperCase()+"*"; |
|
1342 |
List <String> tempGenusListPhon=new ArrayList<>(); |
|
1343 |
if (!initial.equals(initialPho)) { |
|
1344 |
tempGenusListPhon = dao.distinctGenusOrUninomial(initialPho, null, null); |
|
1345 |
} |
|
1346 |
|
|
1347 |
//add genera that have a phonetic match |
|
1348 |
List<String> genusList= new ArrayList <>(); |
|
1349 |
genusList.addAll(tempGenusListPhon); |
|
1340 |
List<String> preFilteredGenusList = nameMatchingGenusPrefilter(genusQuery, tempGenusPhon, normalizedGenusQuery); |
|
1350 | 1341 |
|
1351 |
// see word file Step 1. Rule 3. |
|
1352 |
|
|
1353 |
for (String x:tempGenusListNormal) { |
|
1354 |
|
|
1355 |
if (Math.abs(x.length()-genusQuery.length())<=2) { |
|
1356 |
|
|
1357 |
if(genusQuery.length()<5) { |
|
1358 |
|
|
1359 |
if (genusQuery.substring(0,1).equals(x.substring(0,1)) || |
|
1360 |
genusQuery.substring((genusQuery.length()-1),genusQuery.length()).equals(x.substring((x.length()-1),x.length()))) { |
|
1361 |
genusList.add(x); |
|
1362 |
} |
|
1363 |
} else if (genusQuery.length()==5) { |
|
1364 |
if (genusQuery.substring(0,2).equals(x.substring(0,2)) || |
|
1365 |
genusQuery.substring((genusQuery.length()-3),genusQuery.length()).equals(x.substring((x.length()-3),x.length()))){ |
|
1366 |
genusList.add(x); |
|
1367 |
} |
|
1368 |
} else if (genusQuery.length()>5){ |
|
1369 |
if (genusQuery.substring(0,3).equals(x.substring(0,3)) || |
|
1370 |
genusQuery.substring((genusQuery.length()-3),genusQuery.length()).equals(x.substring((x.length()-3),x.length()))){ |
|
1371 |
genusList.add(x); |
|
1372 |
} |
|
1373 |
} |
|
1374 |
} |
|
1375 |
} |
|
1376 | 1342 |
|
1377 |
//2. comparison of genus |
|
1343 |
//create result list |
|
1344 |
List<DoubleResult<TaxonNameParts,Integer>> fullTaxonNamePartsList = new ArrayList<>(); |
|
1378 | 1345 |
|
1379 |
if (maxDistanceGenus==null) { |
|
1380 |
maxDistanceGenus=4; |
|
1381 |
} |
|
1346 |
for (String preFilteredGenus : preFilteredGenusList) { |
|
1382 | 1347 |
|
1383 |
String queryDocu; |
|
1384 |
List<DoubleResult<TaxonNameParts,Integer>> fullTaxonNamePartsList = new ArrayList<>(); |
|
1348 |
//2. comparison of genus |
|
1385 | 1349 |
|
1386 |
for (String genusNameInDB:genusList) { |
|
1387 |
String genusNameInDBNor=NameServiceImplementBelen.replaceInitialCharacter(genusNameInDB); |
|
1388 |
genusNameInDBNor=CdmUtilsBelen.normalized(genusNameInDBNor); |
|
1389 |
if (NameServiceImplementBelen.trimCommonChar(tempGenus, genusNameInDBNor).trim().isEmpty()) { |
|
1390 |
queryDocu=""; |
|
1391 |
} else { |
|
1392 |
queryDocu=NameServiceImplementBelen.trimCommonChar(tempGenus, genusNameInDBNor); |
|
1393 |
} |
|
1350 |
String genusNameInitCharReplaced = NameServiceImplementBelen.replaceInitialCharacter(preFilteredGenus); |
|
1351 |
String genusNameInDBNormalized = CdmUtilsBelen.normalized(genusNameInitCharReplaced); |
|
1394 | 1352 |
|
1395 |
if (queryDocu=="") { |
|
1396 |
distance = 0; |
|
1397 |
} else { |
|
1398 |
String inputShort= queryDocu.split(" ")[0]; |
|
1399 |
String DbShort=queryDocu.split(" ")[1]; |
|
1400 |
distance = CdmUtils.modifiedDamerauLevenshteinDistance(inputShort,DbShort); |
|
1401 |
} |
|
1353 |
distance = nameMatchingComputeDistance(normalizedGenusQuery, genusNameInDBNormalized); |
|
1402 | 1354 |
|
1403 |
//3. genus post-filter |
|
1404 |
int lengthTemp = genusQuery.length(); |
|
1405 |
int lengthDB=genusNameInDB.length(); |
|
1406 |
int half=Math.max(lengthTemp,lengthDB)/2; |
|
1355 |
//3. genus post-filter |
|
1407 | 1356 |
|
1408 |
//Genera that match in at least 50% of their characters are kept, i.e. if the genus length is 6 (or 7), at least 3 characters must match; in addition, the initial character must match in all cases where ED > 1 |
|
1409 |
if (distance <=maxDistanceGenus) { |
|
1410 |
List<TaxonNameParts> tempParts1 = dao.findTaxonNameParts(Optional.of(genusNameInDB),null, null, null, null, null, null, null, null); |
|
1411 |
for (TaxonNameParts namePart1: tempParts1) { |
|
1412 |
fullTaxonNamePartsList.add(new DoubleResult<TaxonNameParts, Integer>(namePart1, distance)); |
|
1413 |
} |
|
1414 |
} else if(half<maxDistanceGenus && tempGenus.substring(0,1).equals(genusNameInDBNor.substring(0,1))) { |
|
1415 |
List<TaxonNameParts> tempParts2 = dao.findTaxonNameParts(Optional.of(genusNameInDB),null, null, null, null, null, null, null, null); |
|
1416 |
for (TaxonNameParts namePart2: tempParts2) { |
|
1417 |
fullTaxonNamePartsList.add(new DoubleResult<TaxonNameParts, Integer>(namePart2, distance)); |
|
1418 |
} |
|
1419 |
} |
|
1357 |
nameMatchingPostGenusFilter(maxDistanceGenus, genusQuery, distance, normalizedGenusQuery, |
|
1358 |
fullTaxonNamePartsList, preFilteredGenus, genusNameInDBNormalized); |
|
1420 | 1359 |
} |
1421 | 1360 |
|
1422 | 1361 |
//if only genus is given |
... | ... | |
1435 | 1374 |
|
1436 | 1375 |
} else { |
1437 | 1376 |
|
1438 |
String tempEpithPhon= NameServiceImplementBelen.replaceInitialCharacter(epithetQuery); |
|
1439 |
String tempEpith= CdmUtilsBelen.normalized(tempEpithPhon); |
|
1440 |
|
|
1441 |
// 4. epithet pre-filter |
|
1442 |
List<DoubleResult<TaxonNameParts,Integer>> fullTaxonNamePartsList2 = new ArrayList<>(); |
|
1443 |
|
|
1444 |
for (DoubleResult<TaxonNameParts, Integer> nameX: fullTaxonNamePartsList) { |
|
1445 |
if (nameX.getFirstResult().getSpecificEpithet().length()-tempEpith.length()<=4) { |
|
1446 |
fullTaxonNamePartsList2.add(nameX); |
|
1447 |
fullTaxonNamePartsList=fullTaxonNamePartsList2; |
|
1448 |
} |
|
1449 |
} |
|
1450 |
// 5. comparison of epithet |
|
1451 |
if (maxDisEpith==null) { |
|
1452 |
maxDisEpith=4; |
|
1377 |
String tempEpithPhon = NameServiceImplementBelen.replaceInitialCharacter(epithetQuery); |
|
1378 |
String tempEpith = CdmUtilsBelen.normalized(tempEpithPhon); |
|
1379 |
|
|
1380 |
// 4. epithet pre-filter |
|
1381 |
List<DoubleResult<TaxonNameParts,Integer>> fullTaxonNamePartsList2 = new ArrayList<>(); |
|
1382 |
|
|
1383 |
for (DoubleResult<TaxonNameParts, Integer> nameX: fullTaxonNamePartsList) { |
|
1384 |
if (nameX.getFirstResult().getSpecificEpithet().length()-tempEpith.length()<=4) { |
|
1385 |
fullTaxonNamePartsList2.add(nameX); |
|
1386 |
fullTaxonNamePartsList=fullTaxonNamePartsList2; |
|
1387 |
} |
|
1388 |
} |
|
1389 |
|
|
1390 |
// 5. comparison of epithet |
|
1391 |
if (maxDisEpith==null) { |
|
1392 |
maxDisEpith=4; |
|
1393 |
} |
|
1394 |
|
|
1395 |
List <DoubleResult<TaxonNameParts, Integer>> epithetList = new ArrayList<>(); |
|
1396 |
String queryDocu2; |
|
1397 |
for (DoubleResult<TaxonNameParts, Integer> part: fullTaxonNamePartsList) { |
|
1398 |
|
|
1399 |
String epithetInDB = part.getFirstResult().getSpecificEpithet(); |
|
1400 |
int lengthEpithetInDB=epithetInDB.length(); |
|
1401 |
int lengthEpithetQuery=epithetQuery.length(); |
|
1402 |
int half=Math.max(lengthEpithetInDB,lengthEpithetQuery)/2; |
|
1403 |
|
|
1404 |
String epithetinDBNorm=NameServiceImplementBelen.replaceInitialCharacter(epithetInDB); |
|
1405 |
|
|
1406 |
// FIXME: there is an error here when the database entry has only a genus and no epithet |
|
1407 |
|
|
1408 |
epithetinDBNorm=CdmUtilsBelen.normalized(epithetinDBNorm); |
|
1409 |
if (NameServiceImplementBelen.trimCommonChar(tempEpith, epithetinDBNorm).trim().isEmpty()) { |
|
1410 |
queryDocu2=""; |
|
1411 |
} else { |
|
1412 |
queryDocu2=NameServiceImplementBelen.trimCommonChar(tempEpith, epithetinDBNorm); |
|
1413 |
} |
|
1414 |
|
|
1415 |
if (queryDocu2=="") { |
|
1416 |
epithetDistance=0; |
|
1417 |
} else { |
|
1418 |
String inputShort= queryDocu2.split(" ")[0]; |
|
1419 |
String DbShort=queryDocu2.split(" ")[1]; |
|
1420 |
epithetDistance= CdmUtils.modifiedDamerauLevenshteinDistance(inputShort,DbShort); |
|
1421 |
} |
|
1422 |
|
|
1423 |
int totalDist = part.getSecondResult() + epithetDistance; |
|
1424 |
part.setSecondResult(totalDist) ; |
|
1425 |
|
|
1426 |
// 6. species post-filter |
|
1427 |
|
|
1428 |
if (totalDist <= maxDisEpith) { |
|
1429 |
epithetList.add(part); |
|
1430 |
}else if (half<maxDisEpith) { |
|
1431 |
if ((tempEpith.substring(0,1).equals(epithetInDB.substring(0,1)) |
|
1432 |
&& epithetDistance==2||epithetDistance==3)|| |
|
1433 |
(tempEpith.substring(0,3).equals(epithetInDB.substring(0,3)) |
|
1434 |
&& epithetDistance==4)) { |
|
1435 |
epithetList.add(part); |
|
1436 |
} |
|
1437 |
} |
|
1438 |
} |
|
1439 |
|
|
1440 |
// 6b Infraspecific comparison (pre-filter, comparison, post-filter) |
|
1441 |
//TODO |
|
1442 |
|
|
1443 |
// 7. Result shaping |
|
1444 |
|
|
1445 |
|
|
1446 |
Collections.sort(epithetList, (o1,o2)->o1.getSecondResult().compareTo(o2.getSecondResult()) ); |
|
1447 |
|
|
1448 |
List <DoubleResult<TaxonNameParts, Integer>> exactResults = NameServiceImplementBelen.exactResults(epithetList); |
|
1449 |
List <DoubleResult<TaxonNameParts, Integer>> bestResults = NameServiceImplementBelen.bestResults(epithetList); |
|
1450 |
|
|
1451 |
if(!exactResults.isEmpty()) { |
|
1452 |
return exactResults; |
|
1453 |
} else { |
|
1454 |
return bestResults; |
|
1455 |
} |
|
1453 | 1456 |
} |
1457 |
} |
|
1454 | 1458 |
|
1455 |
List <DoubleResult<TaxonNameParts, Integer>> epithetList = new ArrayList<>();
|
|
1456 |
String queryDocu2;
|
|
1457 |
for (DoubleResult<TaxonNameParts, Integer> part: fullTaxonNamePartsList) {
|
|
1459 |
private void nameMatchingPostGenusFilter(Integer maxDistanceGenus, String genusQuery, int distance,
|
|
1460 |
String normalizedGenusQuery, List<DoubleResult<TaxonNameParts, Integer>> fullTaxonNamePartsList,
|
|
1461 |
String preFilteredGenus, String genusNameInDBNormalized) {
|
|
1458 | 1462 |
|
1459 |
String epithetInDB = part.getFirstResult().getSpecificEpithet(); |
|
1460 |
int lengthEpithetInDB=epithetInDB.length(); |
|
1461 |
int lengthEpithetQuery=epithetQuery.length(); |
|
1462 |
int half=Math.max(lengthEpithetInDB,lengthEpithetQuery)/2; |
|
1463 |
int genusQueryLength = genusQuery.length(); |
|
1464 |
int genusDBLength = preFilteredGenus.length(); |
|
1465 |
int halfLength = Math.max(genusQueryLength, genusDBLength)/2; |
|
1463 | 1466 |
|
1464 |
String epithetinDBNorm=NameServiceImplementBelen.replaceInitialCharacter(epithetInDB); |
|
1467 |
//Genera that match in at least 50% of their characters are kept, i.e. if the genus length is 6 (or 7), at least 3 characters must match; in addition, the initial character must match in all cases where ED > 1 |
|
1468 |
if (distance <= maxDistanceGenus) { |
|
1469 |
List<TaxonNameParts> tempParts1 = dao.findTaxonNameParts(Optional.of(preFilteredGenus), null, null, null, null, null, null, null, null); |
|
1470 |
for (TaxonNameParts namePart1: tempParts1) { |
|
1471 |
fullTaxonNamePartsList.add(new DoubleResult<TaxonNameParts, Integer>(namePart1, distance)); |
|
1472 |
} |
|
1473 |
} else if(halfLength < maxDistanceGenus && normalizedGenusQuery.substring(0,1).equals(genusNameInDBNormalized.substring(0,1))) { |
|
1474 |
List<TaxonNameParts> tempParts2 = dao.findTaxonNameParts(Optional.of(preFilteredGenus),null, null, null, null, null, null, null, null); |
|
1475 |
for (TaxonNameParts namePart2: tempParts2) { |
|
1476 |
fullTaxonNamePartsList.add(new DoubleResult<TaxonNameParts, Integer>(namePart2, distance)); |
|
1477 |
} |
|
1478 |
} |
|
1479 |
} |
|
1465 | 1480 |
|
1466 |
// FIXME: there is an error here when the database entry has only a genus and no epithet |
|
1481 |
private int nameMatchingComputeDistance(String tempGenus, String genusNameInDBNormalized) { |
|
1482 |
int distance; |
|
1483 |
String queryDocu = NameServiceImplementBelen.trimCommonChar(tempGenus, genusNameInDBNormalized); |
|
1467 | 1484 |
|
1468 |
epithetinDBNorm=CdmUtilsBelen.normalized(epithetinDBNorm); |
|
1469 |
if (NameServiceImplementBelen.trimCommonChar(tempEpith, epithetinDBNorm).trim().isEmpty()) { |
|
1470 |
queryDocu2=""; |
|
1471 |
} else { |
|
1472 |
queryDocu2=NameServiceImplementBelen.trimCommonChar(tempEpith, epithetinDBNorm); |
|
1473 |
} |
|
1485 |
if ("".equals(queryDocu)) { |
|
1486 |
distance = 0; |
|
1487 |
} else { |
|
1488 |
String inputShort= queryDocu.split(" ")[0]; |
|
1489 |
String DbShort=queryDocu.split(" ")[1]; |
|
1490 |
distance = CdmUtils.modifiedDamerauLevenshteinDistance(inputShort,DbShort); |
|
1491 |
} |
|
1492 |
return distance; |
|
1493 |
} |
|
1474 | 1494 |
|
1475 |
if (queryDocu2=="") { |
|
1476 |
epithetDistance=0; |
|
1477 |
} else { |
|
1478 |
String inputShort= queryDocu2.split(" ")[0]; |
|
1479 |
String DbShort=queryDocu2.split(" ")[1]; |
|
1480 |
epithetDistance= CdmUtils.modifiedDamerauLevenshteinDistance(inputShort,DbShort); |
|
1481 |
} |
|
1495 |
private List<String> nameMatchingGenusPrefilter(String genusQuery, String tempGenusPhon, String tempGenus) { |
|
1496 |
// build a list of all names in the DB that start with the first character of the query |
|
1497 |
String initial= tempGenus.substring(0,1).toUpperCase() + "*"; |
|
1498 |
List<String> tempGenusListNormal = dao.distinctGenusOrUninomial(initial, null, null); |
|
1482 | 1499 |
|
1483 |
int totalDist = part.getSecondResult() + epithetDistance; |
|
1484 |
part.setSecondResult(totalDist) ; |
|
1500 |
//set a list with all genera in the database starting with the initial letter |
|
1501 |
//of the PHONETIC TRANSFORMATION query |
|
1502 |
String initialPho = tempGenusPhon.substring(0,1).toUpperCase() + "*"; |
|
1503 |
List <String> tempGenusListPhon = new ArrayList<>(); |
|
1504 |
if (!initial.equals(initialPho)) { |
|
1505 |
tempGenusListPhon = dao.distinctGenusOrUninomial(initialPho, null, null); |
|
1506 |
} |
|
1485 | 1507 |
|
1486 |
// 6. species post-filter |
|
1508 |
//add genera that have a phonetic match |
|
1509 |
List<String> genusList = new ArrayList <>(); |
|
1510 |
genusList.addAll(tempGenusListPhon); |
|
1487 | 1511 |
|
1488 |
if (totalDist <= maxDisEpith) { |
|
1489 |
epithetList.add(part); |
|
1490 |
}else if (half<maxDisEpith) { |
|
1491 |
if ((tempEpith.substring(0,1).equals(epithetInDB.substring(0,1)) |
|
1492 |
&& epithetDistance==2||epithetDistance==3)|| |
|
1493 |
(tempEpith.substring(0,3).equals(epithetInDB.substring(0,3)) |
|
1494 |
&& epithetDistance==4)) { |
|
1495 |
epithetList.add(part); |
|
1496 |
} |
|
1497 |
} |
|
1498 |
} |
|
1512 |
// see word file Step 1. Rule 3. |
|
1499 | 1513 |
|
1500 |
Collections.sort(epithetList, (o1,o2)->o1.getSecondResult().compareTo(o2.getSecondResult()) );
|
|
1514 |
for (String x:tempGenusListNormal) {
|
|
1501 | 1515 |
|
1502 |
List <DoubleResult<TaxonNameParts, Integer>> exactResults = NameServiceImplementBelen.exactResults(epithetList); |
|
1503 |
List <DoubleResult<TaxonNameParts, Integer>> bestResults = NameServiceImplementBelen.bestResults(epithetList); |
|
1516 |
if (Math.abs(x.length()-genusQuery.length()) <= 2) { |
|
1504 | 1517 |
|
1505 |
if(!exactResults.isEmpty()) { |
|
1506 |
return exactResults; |
|
1507 |
} else { |
|
1508 |
return bestResults; |
|
1509 |
} |
|
1518 |
if(genusQuery.length()<5) { |
|
1519 |
|
|
1520 |
if (genusQuery.substring(0,1).equals(x.substring(0,1)) || |
|
1521 |
genusQuery.substring((genusQuery.length()-1),genusQuery.length()).equals(x.substring((x.length()-1),x.length()))) { |
|
1522 |
genusList.add(x); |
|
1523 |
} |
|
1524 |
} else if (genusQuery.length()==5) { |
|
1525 |
if (genusQuery.substring(0,2).equals(x.substring(0,2)) || |
|
1526 |
genusQuery.substring((genusQuery.length()-3),genusQuery.length()).equals(x.substring((x.length()-3),x.length()))){ |
|
1527 |
genusList.add(x); |
|
1528 |
} |
|
1529 |
} else if (genusQuery.length()>5){ |
|
1530 |
if (genusQuery.substring(0,3).equals(x.substring(0,3)) || |
|
1531 |
genusQuery.substring((genusQuery.length()-3),genusQuery.length()).equals(x.substring((x.length()-3),x.length()))){ |
|
1532 |
genusList.add(x); |
|
1533 |
} |
|
1534 |
} |
|
1535 |
} |
|
1510 | 1536 |
} |
1511 |
} |
|
1537 |
return genusList; |
|
1538 |
} |
|
1512 | 1539 |
} |
Also available in: Unified diff
ref #10178 some cleanup for name matching (not finished yet)