From: Belen Escobari Date: Tue, 30 May 2023 13:29:32 +0000 (+0200) Subject: ref #10178: new methods for normalization of names X-Git-Tag: 5.38.0^2~41 X-Git-Url: https://dev.e-taxonomy.eu/gitweb/cdmlib.git/commitdiff_plain/8ecd72152d2f6ccc5da39247230dabb8d13a1012?ds=inline ref #10178: new methods for normalization of names --- diff --git a/cdmlib-commons/src/main/java/eu/etaxonomy/cdm/common/CdmUtils.java b/cdmlib-commons/src/main/java/eu/etaxonomy/cdm/common/CdmUtils.java index e8bc1ea8ab..773bdb5629 100644 --- a/cdmlib-commons/src/main/java/eu/etaxonomy/cdm/common/CdmUtils.java +++ b/cdmlib-commons/src/main/java/eu/etaxonomy/cdm/common/CdmUtils.java @@ -589,5 +589,43 @@ public class CdmUtils { public static boolean isNullSafeEmpty(Collection collection) { return collection == null || collection.isEmpty(); } + + public static int modifiedDamerauLevenshteinDistance(String str1, String str2) { + if (str1 == str2) { + return 0; + } else if (str1.isEmpty()) { + return str2.length(); + } else if (str2.isEmpty()) { + return str1.length(); + } else if (str2.length() == 1 && str1.length() == 1 && str1 != str2) { + return 1; + } else { + + int[][] distanceMatrix = new int[str1.length() + 1][str2.length() + 1]; + + for (int i = 0; i <= str1.length(); i++) { + distanceMatrix[i][0] = i; + } + + for (int j = 0; j <= str2.length(); j++) { + distanceMatrix[0][j] = j; + } + + for (int i = 1; i <= str1.length(); i++) { + for (int j = 1; j <= str2.length(); j++) { + int cost = (str1.charAt(i - 1) == str2.charAt(j - 1)) ? 0 : 1; + distanceMatrix[i][j] = Math.min( + Math.min(distanceMatrix[i - 1][j] + 1, distanceMatrix[i][j - 1] + 1), + distanceMatrix[i - 1][j - 1] + cost); + + if (i > 1 && j > 1 && str1.charAt(i - 1) == str2.charAt(j - 2) + && str1.charAt(i - 2) == str2.charAt(j - 1)) { + distanceMatrix[i][j] = Math.min(distanceMatrix[i][j], distanceMatrix[i - 2][j - 2] + cost); + } + } + } + return distanceMatrix[str1.length()][str2.length()]; + } + } } diff --git a/cdmlib-commons/src/main/java/eu/etaxonomy/cdm/common/CdmUtilsBelen.java b/cdmlib-commons/src/main/java/eu/etaxonomy/cdm/common/CdmUtilsBelen.java new file mode 100644 index 0000000000..662b7cc430 --- /dev/null +++ b/cdmlib-commons/src/main/java/eu/etaxonomy/cdm/common/CdmUtilsBelen.java @@ -0,0 +1,92 @@ +package eu.etaxonomy.cdm.common; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +public class CdmUtilsBelen { + +// Trim white spaces + + public static String deleteEmptySpaces(String inputName) { + String outputName = inputName.replaceAll("\\s+", " ").trim(); + return outputName; + } + +// Replace characters with ASCII characters + + public static String replaceSpecialCharacters(String str) { + String output; + output = str.replaceAll("[áåâãàä]", "a"); + output = output.replaceAll("[éêèë]", "e"); + output = output.replaceAll("[ôõøòóö]", "o"); + output = output.replaceAll("[ìíîï]", "i"); + output = output.replaceAll("[üûúù]", "u"); + output = output.replaceAll("ñ", "n"); + output = output.replaceAll("ç", "c"); + return output; + } + +// Change lists to lowercase + + public static List listToLowerCase(List List) { + List lowerCaseList = new ArrayList<>(); + for (String x : List) { + lowerCaseList.add(x.toLowerCase()); + } + return lowerCaseList ; + } + +// Replace characters combinations that sound similar + + public static String soundalike(String inputName) { + String[][] soundalike = { + {"ae","e"}, + {"ia","a"}, + {"oe", "i"}, + {"oi", "a"}, + {"sc", "s"} + }; + for (int i = 0 ; i testList= new ArrayList<>(); + testList.add("NAME 1"); + testList.add("nAmE 2"); + Assert.assertEquals("name 1", CdmUtilsBelen.listToLowerCase(testList).get(0)); + Assert.assertEquals("name 2", CdmUtilsBelen.listToLowerCase(testList).get(1)); + } + + @Test + public void testSoundalike() { + String name = "ae ia oe oi sc"; + Assert.assertEquals("e a i a s", CdmUtilsBelen.soundalike(name)); + } + + @Test + public void testRemoveDuplicate() { + String name = "thiiss iss aa striiiing with duupliccaaaatess"; + Assert.assertEquals("this is a string with duplicates", CdmUtilsBelen.removeDuplicate(name)); + } + + @Test + public void testReplacerGenderEnding() { +// String name="is"; +// String name="us"; +// String name="ys"; +// String name="es"; +// String name="im"; + String name="as"; +// String name="um"; +// String name="os"; + Assert.assertEquals("a", CdmUtilsBelen.replacerGenderEnding(name)); + } +} diff --git a/cdmlib-commons/src/test/java/eu/etaxonomy/cdm/common/CdmUtilsTest.java b/cdmlib-commons/src/test/java/eu/etaxonomy/cdm/common/CdmUtilsTest.java index 8630671de3..247d790978 100644 --- a/cdmlib-commons/src/test/java/eu/etaxonomy/cdm/common/CdmUtilsTest.java +++ b/cdmlib-commons/src/test/java/eu/etaxonomy/cdm/common/CdmUtilsTest.java @@ -8,6 +8,7 @@ */ package eu.etaxonomy.cdm.common; +import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertNotNull; import java.io.IOException; @@ -131,4 +132,19 @@ public class CdmUtilsTest { Assert.assertEquals("Str1;Str3", CdmUtils.concat(";", str1, "", str3)); Assert.assertEquals("Str1; ;Str3", CdmUtils.concat(";", str1, " ", str3)); } + + @Test + public void testmodifiedDamerauLevenshteinDistance() { + + int distance = CdmUtils.modifiedDamerauLevenshteinDistance("Gynoxys asterotricha", "Gynxya asrerotciha"); + assertEquals(5,distance); + distance = CdmUtils.modifiedDamerauLevenshteinDistance("Gynoxys asterotricha", "Gynxsa axrerotciha"); + assertEquals(7,distance); + distance = CdmUtils.modifiedDamerauLevenshteinDistance("Gynoxys asterotricha", "Gynxyas asrerotciha"); + assertEquals(5,distance); + distance = CdmUtils.modifiedDamerauLevenshteinDistance("Gynoxys asterotricha", "Gynoxya asterotricha"); + assertEquals(1,distance); + distance = CdmUtils.modifiedDamerauLevenshteinDistance("Gynoxys asterotricha", "Gynoxys asterotricha"); + assertEquals(0,distance); + } } \ No newline at end of file diff --git a/cdmlib-services/src/main/java/eu/etaxonomy/cdm/api/service/NameServiceImpl.java b/cdmlib-services/src/main/java/eu/etaxonomy/cdm/api/service/NameServiceImpl.java index 5f2660ed03..1853790cb6 100644 --- a/cdmlib-services/src/main/java/eu/etaxonomy/cdm/api/service/NameServiceImpl.java +++ b/cdmlib-services/src/main/java/eu/etaxonomy/cdm/api/service/NameServiceImpl.java @@ -52,6 +52,7 @@ import eu.etaxonomy.cdm.api.service.search.QueryFactory; import eu.etaxonomy.cdm.api.service.search.SearchResult; import eu.etaxonomy.cdm.api.service.search.SearchResultBuilder; import eu.etaxonomy.cdm.api.util.TaxonNamePartsFilter; +import eu.etaxonomy.cdm.common.CdmUtils; import eu.etaxonomy.cdm.common.DoubleResult; import eu.etaxonomy.cdm.common.URI; import eu.etaxonomy.cdm.common.monitor.IProgressMonitor; @@ -1311,23 +1312,32 @@ public class NameServiceImpl @Override public List> findMatchingNames(String taxonName, int maxDistanceGenus, int maxDisEpith, int limit) { -// maxDistanceGenus=3;//the default value in Rees algorithm is 70% of the lenght. + //0. Normalizing and parsing - //1. name parsing. +// TODO Remove all qualifiers such as cf., aff., ?, , x, etc. - TaxonName name = (TaxonName) NonViralNameParserImpl.NewInstance().parseFullName(taxonName); + TaxonName name = (TaxonName) NonViralNameParserImpl.NewInstance().parseFullName(taxonName); String genusQuery = name.getGenusOrUninomial(); String epithetQuery = name.getSpecificEpithet(); + + //1. Genus pre-filter + String initial= genusQuery.substring(0,1) + "*"; + List tempGenusList = dao.distinctGenusOrUninomial(initial, null, null); //list of all genera in the database starting with the initial letter of the query + List genusList= new ArrayList <>(); // compare the length of query and the length of the database name. When the difference is less than the variable "limit", add the genus into the list - List genusList = dao.distinctGenusOrUninomial(initial, null, null); //list + for (String x:tempGenusList) { + if (Math.abs(x.length()-genusQuery.length())<=limit) { + genusList.add(x); + } + } List> fullTaxonNamePartsList = new ArrayList<>(); //2. comparison of: //genus for (String genusNameInDB : genusList) { - int distance = modifiedDamerauLevenshteinDistance(genusQuery, genusNameInDB); + int distance = CdmUtils.modifiedDamerauLevenshteinDistance(genusQuery, genusNameInDB); if (distance <= maxDistanceGenus) { List tempParts = dao.findTaxonNameParts(Optional.of(genusNameInDB),null, null, null, null, null, null, null, null); for (TaxonNameParts namePart: tempParts) { @@ -1339,14 +1349,13 @@ public class NameServiceImpl //add epithet distance List > epithetList = new ArrayList<>(); for (DoubleResult part: fullTaxonNamePartsList) { - int epithetDistance = modifiedDamerauLevenshteinDistance(epithetQuery, part.getFirstResult().getSpecificEpithet()); + int epithetDistance = CdmUtils.modifiedDamerauLevenshteinDistance(epithetQuery, part.getFirstResult().getSpecificEpithet()); if (epithetDistance <= maxDisEpith) { epithetList.add(part); part.setSecondResult(part.getSecondResult() + epithetDistance) ; // tempMap.add(part, fullTaxonNamePartsList.get(part) + epithetDistance); // need to check how the final distance is calculated }else { //do nothing - //tempMap.remove(part); } } @@ -1359,43 +1368,4 @@ public class NameServiceImpl // Map.Entry::getKey, Map.Entry::getValue, (e1, e2) -> e1, LinkedHashMap::new)); return epithetList.subList(0, Math.min(limit,epithetList.size())); } - - - public int modifiedDamerauLevenshteinDistance(String str1, String str2) { - if (str1 == str2) { - return 0; - } else if (str1.isEmpty()) { - return str2.length(); - } else if (str2.isEmpty()) { - return str1.length(); - } else if (str2.length() == 1 && str1.length() == 1 && str1 != str2) { - return 1; - } else { - - int[][] distanceMatrix = new int[str1.length() + 1][str2.length() + 1]; - - for (int i = 0; i <= str1.length(); i++) { - distanceMatrix[i][0] = i; - } - - for (int j = 0; j <= str2.length(); j++) { - distanceMatrix[0][j] = j; - } - - for (int i = 1; i <= str1.length(); i++) { - for (int j = 1; j <= str2.length(); j++) { - int cost = (str1.charAt(i - 1) == str2.charAt(j - 1)) ? 0 : 1; - distanceMatrix[i][j] = Math.min( - Math.min(distanceMatrix[i - 1][j] + 1, distanceMatrix[i][j - 1] + 1), - distanceMatrix[i - 1][j - 1] + cost); - - if (i > 1 && j > 1 && str1.charAt(i - 1) == str2.charAt(j - 2) - && str1.charAt(i - 2) == str2.charAt(j - 1)) { - distanceMatrix[i][j] = Math.min(distanceMatrix[i][j], distanceMatrix[i - 2][j - 2] + cost); - } - } - } - return distanceMatrix[str1.length()][str2.length()]; - } - } } \ No newline at end of file diff --git a/cdmlib-services/src/main/java/eu/etaxonomy/cdm/api/service/NameServiceImplementBelen.java b/cdmlib-services/src/main/java/eu/etaxonomy/cdm/api/service/NameServiceImplementBelen.java new file mode 100644 index 0000000000..8f06a5f58d --- /dev/null +++ b/cdmlib-services/src/main/java/eu/etaxonomy/cdm/api/service/NameServiceImplementBelen.java @@ -0,0 +1,76 @@ +package eu.etaxonomy.cdm.api.service; + +import java.util.ArrayList; +import java.util.List; + +public class NameServiceImplementBelen { + private String tempInputName; + private String tempDatabaseName; + private String shortenedInputName; + private String shortenedDatabaseName; + +// Phonetic changes performed ONLY on the initial characters of each String + + public String replaceInitialCharacter(String inp) { + String input=inp.toLowerCase(); + String output=""; + String[][] phoneticChange = { + {"ae","e"},{"cn","n"},{"ct","t"},{"cz","c"}, + {"dj","d"},{"ea","e"},{"eu","u"},{"gn","n"}, + {"kn","n"},{"mc","mac"},{"mn","n"},{"oe","e"}, + {"qu","q"},{"ph","f"},{"ps","s"},{"pt","t"}, + {"ts","s"},{"wr","r"},{"x","z"} + }; + for (int i = 0 ; i< phoneticChange.length; i++) { + if (input.startsWith(phoneticChange[i][0])){ + output= input.replaceFirst(phoneticChange[i][0], phoneticChange[i][1]); + break; + } + } + return output; + } + + +// trim common characters between query and document + + public List trimCommonChar(String inputName, String databaseName) { + + // trim common leading characters of query and document + + int inputNameLength = inputName.length(); + int databaseNameLength = databaseName.length(); + int largestString = Math.max(inputNameLength, databaseNameLength); + int i; + + for (i = 0; i < largestString; i++) { + if (i >= inputNameLength || i >= databaseNameLength || inputName.charAt(i) != databaseName.charAt(i)) { + // Stop iterating when the characters at the current position are not equal. + break; + } + } + + // Create temp names with common leading characters removed. + tempInputName = inputName.substring(i); + tempDatabaseName = databaseName.substring(i); + + List list= new ArrayList<>(); + + // trim common tailing characters between query and document + + int restantInputNameLenght = tempInputName.length(); + int restantDatabaseNameLenght = tempDatabaseName.length(); + int shortestString = Math.min(restantInputNameLenght, restantDatabaseNameLenght); + + for (int x = 0; x < shortestString; x++) { + if (tempInputName.charAt(restantInputNameLenght - x - 1) != tempDatabaseName + .charAt(restantDatabaseNameLenght - x - 1)) { + break; + } + shortenedInputName = tempInputName.substring(0, restantInputNameLenght - x - 1); + shortenedDatabaseName = tempDatabaseName.substring(0, restantDatabaseNameLenght - x - 1); + + } + list.add(shortenedInputName +" "+ shortenedDatabaseName); + return list; + } +} diff --git a/cdmlib-services/src/test/java/eu/etaxonomy/cdm/api/service/NameServiceImplTest.java b/cdmlib-services/src/test/java/eu/etaxonomy/cdm/api/service/NameServiceImplTest.java index cd205d6389..e80834e80e 100644 --- a/cdmlib-services/src/test/java/eu/etaxonomy/cdm/api/service/NameServiceImplTest.java +++ b/cdmlib-services/src/test/java/eu/etaxonomy/cdm/api/service/NameServiceImplTest.java @@ -1099,20 +1099,6 @@ public class NameServiceImplTest extends CdmTransactionalIntegrationTest { @Override public void createTestDataSet() throws FileNotFoundException {} - @Test - public void testmodifiedDamerauLevenshteinDistance() { - NameServiceImpl name = new NameServiceImpl(); - int distance = name.modifiedDamerauLevenshteinDistance("Gynoxys asterotricha", "Gynxya asrerotciha"); - assertEquals(5,distance); - distance = name.modifiedDamerauLevenshteinDistance("Gynoxys asterotricha", "Gynxsa axrerotciha"); - assertEquals(7,distance); - distance = name.modifiedDamerauLevenshteinDistance("Gynoxys asterotricha", "Gynxyas asrerotciha"); - assertEquals(5,distance); - distance = name.modifiedDamerauLevenshteinDistance("Gynoxys asterotricha", "Gynoxya asterotricha"); - assertEquals(1,distance); - distance = name.modifiedDamerauLevenshteinDistance("Gynoxys asterotricha", "Gynoxys asterotricha"); - assertEquals(0,distance); - } @Test @DataSet(loadStrategy=CleanSweepInsertLoadStrategy.class, value="NameServiceImplTest.testFindMatchingNames.xml") diff --git a/cdmlib-services/src/test/java/eu/etaxonomy/cdm/api/service/NameServiceImplementBelenTest.java b/cdmlib-services/src/test/java/eu/etaxonomy/cdm/api/service/NameServiceImplementBelenTest.java new file mode 100644 index 0000000000..e7a4940257 --- /dev/null +++ b/cdmlib-services/src/test/java/eu/etaxonomy/cdm/api/service/NameServiceImplementBelenTest.java @@ -0,0 +1,47 @@ +/** +* Copyright (C) 2023 EDIT +* European Distributed Institute of Taxonomy +* http://www.e-taxonomy.eu +* +* The contents of this file are subject to the Mozilla Public License Version 1.1 +* See LICENSE.TXT at the top of this package for the full license terms. +*/ +package eu.etaxonomy.cdm.api.service; + +import org.junit.Assert; +import org.junit.Test; + +/** + * @author andreabee90 + * @since 30.05.2023 + */ +public class NameServiceImplementBelenTest { + + @Test + public void testReplaceInitialCharacter() { + NameServiceImplementBelen test=new NameServiceImplementBelen(); + + String name = "euphorbia"; + Assert.assertEquals("uphorbia", test.replaceInitialCharacter(name)); + name = "Cnemidia"; + Assert.assertEquals("nemidia", test.replaceInitialCharacter(name)); + name = "Gnaphalium"; + Assert.assertEquals("naphalium", test.replaceInitialCharacter(name)); + name = "Philodendron"; + Assert.assertEquals("filodendron", test.replaceInitialCharacter(name)); + name = "Tsuga"; + Assert.assertEquals("suga", test.replaceInitialCharacter(name)); + name = "Czerniaevia"; + Assert.assertEquals("cerniaevia", test.replaceInitialCharacter(name)); + } + + @Test + public void testTrimCommonChar() { + NameServiceImplementBelen test=new NameServiceImplementBelen(); + String query ="this is a query string"; + String document = "this is a database string"; + + Assert.assertEquals("query", test.trimCommonChar(query, document).get(0).toString().split(" ")[0]); + Assert.assertEquals("database", test.trimCommonChar(query, document).get(0).toString().split(" ")[1]); + } +}