ref #10178: new methods for normalization of names
authorBelen Escobari <b.escobari@bo.berlin>
Tue, 30 May 2023 13:29:32 +0000 (15:29 +0200)
committerBelen Escobari <b.escobari@bo.berlin>
Tue, 30 May 2023 13:30:38 +0000 (15:30 +0200)
cdmlib-commons/src/main/java/eu/etaxonomy/cdm/common/CdmUtils.java
cdmlib-commons/src/main/java/eu/etaxonomy/cdm/common/CdmUtilsBelen.java [new file with mode: 0644]
cdmlib-commons/src/test/java/eu/etaxonomy/cdm/common/CdmUtilsBelenTest.java [new file with mode: 0644]
cdmlib-commons/src/test/java/eu/etaxonomy/cdm/common/CdmUtilsTest.java
cdmlib-services/src/main/java/eu/etaxonomy/cdm/api/service/NameServiceImpl.java
cdmlib-services/src/main/java/eu/etaxonomy/cdm/api/service/NameServiceImplementBelen.java [new file with mode: 0644]
cdmlib-services/src/test/java/eu/etaxonomy/cdm/api/service/NameServiceImplTest.java
cdmlib-services/src/test/java/eu/etaxonomy/cdm/api/service/NameServiceImplementBelenTest.java [new file with mode: 0644]

index e8bc1ea8abe92b9f61e13d6ce1a2827099657502..773bdb5629d232905fc57a86f61ece6d5201919c 100644 (file)
@@ -589,5 +589,43 @@ public class CdmUtils {
     public static boolean isNullSafeEmpty(Collection<?> collection) {
         return collection == null || collection.isEmpty();
     }
+    
+    /**
+     * Computes an edit distance between two strings that counts insertions,
+     * deletions and substitutions of single characters as well as swaps of
+     * two adjacent characters.
+     * <p>
+     * The swap step adds the substitution cost of the current character pair
+     * to the distance of the prefixes that end two characters earlier.
+     *
+     * @param str1 the first string
+     * @param str2 the second string
+     * @return 0 if both arguments are the same reference (including both
+     *         <code>null</code>) or have equal content, otherwise the number
+     *         of edit operations computed by the matrix below
+     * @throws NullPointerException if exactly one of the arguments is <code>null</code>
+     */
+    public static int modifiedDamerauLevenshteinDistance(String str1, String str2) {
+        // Compare content, not object identity: two equal strings that are
+        // distinct objects must still yield a distance of 0.
+        if (str1 == str2 || str1.equals(str2)) {
+            return 0;
+        }
+        if (str1.isEmpty()) {
+            return str2.length();
+        }
+        if (str2.isEmpty()) {
+            return str1.length();
+        }
+
+        final int length1 = str1.length();
+        final int length2 = str2.length();
+
+        // distanceMatrix[i][j] = distance between the first i characters of
+        // str1 and the first j characters of str2
+        int[][] distanceMatrix = new int[length1 + 1][length2 + 1];
+
+        // turning a prefix into the empty string costs one deletion per character
+        for (int i = 0; i <= length1; i++) {
+            distanceMatrix[i][0] = i;
+        }
+        // building a prefix from the empty string costs one insertion per character
+        for (int j = 0; j <= length2; j++) {
+            distanceMatrix[0][j] = j;
+        }
+
+        for (int i = 1; i <= length1; i++) {
+            for (int j = 1; j <= length2; j++) {
+                int cost = (str1.charAt(i - 1) == str2.charAt(j - 1)) ? 0 : 1;
+
+                int deletion = distanceMatrix[i - 1][j] + 1;
+                int insertion = distanceMatrix[i][j - 1] + 1;
+                int substitution = distanceMatrix[i - 1][j - 1] + cost;
+                distanceMatrix[i][j] = Math.min(Math.min(deletion, insertion), substitution);
+
+                // the last two characters of both prefixes are the same pair in swapped order
+                boolean isSwap = i > 1 && j > 1
+                        && str1.charAt(i - 1) == str2.charAt(j - 2)
+                        && str1.charAt(i - 2) == str2.charAt(j - 1);
+                if (isSwap) {
+                    distanceMatrix[i][j] = Math.min(distanceMatrix[i][j], distanceMatrix[i - 2][j - 2] + cost);
+                }
+            }
+        }
+        return distanceMatrix[length1][length2];
+    }
 
 }
diff --git a/cdmlib-commons/src/main/java/eu/etaxonomy/cdm/common/CdmUtilsBelen.java b/cdmlib-commons/src/main/java/eu/etaxonomy/cdm/common/CdmUtilsBelen.java
new file mode 100644 (file)
index 0000000..662b7cc
--- /dev/null
@@ -0,0 +1,92 @@
+package eu.etaxonomy.cdm.common;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+
+/**
+ * String helpers for normalizing names before they are compared.
+ * All methods are static and throw a NullPointerException for
+ * <code>null</code> arguments.
+ */
+public class CdmUtilsBelen {
+
+    /** Two-letter endings that {@link #replacerGenderEnding(String)} replaces by "a". */
+    private static final List<String> GENDER_ENDINGS =
+            Arrays.asList("is", "us", "ys", "es", "im", "as", "um", "os");
+
+    /** Pairs of {character combination, replacement} used by {@link #soundalike(String)}. */
+    private static final String[][] SOUNDALIKES = {
+            {"ae", "e"},
+            {"ia", "a"},
+            {"oe", "i"},
+            {"oi", "a"},
+            {"sc", "s"}
+    };
+
+    /**
+     * Collapses every run of whitespace into a single blank and removes
+     * leading and trailing whitespace.
+     *
+     * @param inputName the name to clean up
+     * @return the name with normalized whitespace
+     */
+    public static String deleteEmptySpaces(String inputName) {
+        return inputName.replaceAll("\\s+", " ").trim();
+    }
+
+    /**
+     * Replaces the lower case accented characters listed below by their
+     * plain ASCII base letter. Characters not listed are left untouched.
+     *
+     * @param str the text to convert
+     * @return the text with the listed characters replaced
+     */
+    public static String replaceSpecialCharacters(String str) {
+        String output;
+        output = str.replaceAll("[áåâãàä]", "a");
+        output = output.replaceAll("[éêèë]", "e");
+        output = output.replaceAll("[ôõøòóö]", "o");
+        output = output.replaceAll("[ìíîï]", "i");
+        output = output.replaceAll("[üûúù]", "u");
+        output = output.replaceAll("ñ", "n");
+        output = output.replaceAll("ç", "c");
+        return output;
+    }
+
+    /**
+     * Returns a new list holding the lower case form of every entry.
+     * The given list is not modified.
+     *
+     * @param names the strings to convert
+     * @return a new list with the lower-cased strings in the same order
+     */
+    public static List<String> listToLowerCase(List<String> names) {
+        List<String> lowerCaseList = new ArrayList<>(names.size());
+        for (String name : names) {
+            // Locale.ROOT keeps the result independent of the default locale
+            // of the JVM (e.g. the Turkish dotless i).
+            lowerCaseList.add(name.toLowerCase(java.util.Locale.ROOT));
+        }
+        return lowerCaseList;
+    }
+
+    /**
+     * Replaces character combinations that sound similar by a common
+     * representation. The replacements are applied one after the other in
+     * the order of {@link #SOUNDALIKES}, so the result of an earlier
+     * replacement can be changed again by a later one.
+     *
+     * @param inputName the name to convert
+     * @return the name with all listed combinations replaced
+     */
+    public static String soundalike(String inputName) {
+        String result = inputName;
+        for (String[] pair : SOUNDALIKES) {
+            result = result.replace(pair[0], pair[1]);
+        }
+        return result;
+    }
+
+    /**
+     * Reduces every run of identical consecutive characters to a single
+     * character.
+     *
+     * @param input the text to reduce
+     * @return the reduced text; the empty string is returned unchanged
+     */
+    public static String removeDuplicate(String input) {
+        if (input.isEmpty()) {
+            // nothing to reduce; also avoids reading a character of an empty string
+            return input;
+        }
+        StringBuilder output = new StringBuilder(input.length());
+        char previous = input.charAt(0);
+        output.append(previous);
+        for (int i = 1; i < input.length(); i++) {
+            char current = input.charAt(i);
+            if (current != previous) {
+                output.append(current);
+                previous = current;
+            }
+        }
+        return output.toString();
+    }
+
+    /**
+     * Normalizes the ending of a word so that gender differences are
+     * ignored: if the last two characters are one of
+     * {@link #GENDER_ENDINGS} they are replaced by "a".
+     *
+     * @param input the word to normalize
+     * @return the normalized word; input that is shorter than two characters
+     *         or has another ending is returned unchanged
+     */
+    public static String replacerGenderEnding(String input) {
+        if (input.length() < 2) {
+            // too short to carry one of the two-letter endings
+            return input;
+        }
+        int endingStart = input.length() - 2;
+        String lastTwoChar = input.substring(endingStart);
+        if (GENDER_ENDINGS.contains(lastTwoChar)) {
+            return input.substring(0, endingStart) + "a";
+        }
+        return input;
+    }
+}
diff --git a/cdmlib-commons/src/test/java/eu/etaxonomy/cdm/common/CdmUtilsBelenTest.java b/cdmlib-commons/src/test/java/eu/etaxonomy/cdm/common/CdmUtilsBelenTest.java
new file mode 100644 (file)
index 0000000..ea73f3c
--- /dev/null
@@ -0,0 +1,69 @@
+/**
+* Copyright (C) 2023 EDIT
+* European Distributed Institute of Taxonomy
+* http://www.e-taxonomy.eu
+*
+* The contents of this file are subject to the Mozilla Public License Version 1.1
+* See LICENSE.TXT at the top of this package for the full license terms.
+*/
+package eu.etaxonomy.cdm.common;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+/**
+ * @author andreabee90
+ * @since 30.05.2023
+ */
+public class CdmUtilsBelenTest {
+
+    /** Runs of whitespace are collapsed and the name is trimmed. */
+    @Test
+    public void testDeleteEmptySpaces() {
+        String name = "  Quercus  robur ";
+        Assert.assertEquals("Quercus robur", CdmUtilsBelen.deleteEmptySpaces(name));
+    }
+
+    /** Accented characters are replaced by their ASCII base letter. */
+    @Test
+    public void testReplaceSpecialCharacters() {
+        String name = "áåâãàêèëôõøòóöìíîïüûúùñç";
+        Assert.assertEquals("aaaaaeeeooooooiiiiuuuunc", CdmUtilsBelen.replaceSpecialCharacters(name));
+    }
+
+    /** Every entry is lower-cased and the order is kept. */
+    @Test
+    public void testListToLowerCase() {
+        List<String> testList = new ArrayList<>();
+        testList.add("NAME 1");
+        testList.add("nAmE 2");
+
+        List<String> result = CdmUtilsBelen.listToLowerCase(testList);
+
+        Assert.assertEquals(2, result.size());
+        Assert.assertEquals("name 1", result.get(0));
+        Assert.assertEquals("name 2", result.get(1));
+    }
+
+    /** Each soundalike combination is replaced by its counterpart. */
+    @Test
+    public void testSoundalike() {
+        String name = "ae ia oe oi sc";
+        Assert.assertEquals("e a i a s", CdmUtilsBelen.soundalike(name));
+    }
+
+    /** Runs of repeated characters are reduced to one character. */
+    @Test
+    public void testRemoveDuplicate() {
+        String name = "thiiss iss aa striiiing with duupliccaaaatess";
+        Assert.assertEquals("this is a string with duplicates", CdmUtilsBelen.removeDuplicate(name));
+    }
+
+    /** Every supported two-letter ending is replaced by "a". */
+    @Test
+    public void testReplacerGenderEnding() {
+        String[] endings = {"is", "us", "ys", "es", "im", "as", "um", "os"};
+        for (String ending : endings) {
+            Assert.assertEquals("ending '" + ending + "'", "a", CdmUtilsBelen.replacerGenderEnding(ending));
+        }
+    }
+}
index 8630671de3b99d6ac352b7ce5d5cdab3cf479958..247d79097859faa6c1675d625ad2fcfae75b5497 100644 (file)
@@ -8,6 +8,7 @@
  */
 package eu.etaxonomy.cdm.common;
 
+import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertNotNull;
 
 import java.io.IOException;
@@ -131,4 +132,19 @@ public class CdmUtilsTest {
         Assert.assertEquals("Str1;Str3", CdmUtils.concat(";", str1, "", str3));
         Assert.assertEquals("Str1; ;Str3", CdmUtils.concat(";", str1, " ", str3));
     }
+    
+    /**
+     * Compares one fixed query name with several variants and checks the
+     * distance reported for each of them.
+     */
+    @Test
+    public void testmodifiedDamerauLevenshteinDistance() {
+        final String query = "Gynoxys asterotricha";
+
+        assertEquals(5, CdmUtils.modifiedDamerauLevenshteinDistance(query, "Gynxya asrerotciha"));
+        assertEquals(7, CdmUtils.modifiedDamerauLevenshteinDistance(query, "Gynxsa axrerotciha"));
+        assertEquals(5, CdmUtils.modifiedDamerauLevenshteinDistance(query, "Gynxyas asrerotciha"));
+        assertEquals(1, CdmUtils.modifiedDamerauLevenshteinDistance(query, "Gynoxya asterotricha"));
+        // identical content must give a distance of zero
+        assertEquals(0, CdmUtils.modifiedDamerauLevenshteinDistance(query, "Gynoxys asterotricha"));
+    }
 }
\ No newline at end of file
index 5f2660ed035c27bce17756d2a1779617af17f393..1853790cb6d552fa616b0d67c9fbe8595c533b56 100644 (file)
@@ -52,6 +52,7 @@ import eu.etaxonomy.cdm.api.service.search.QueryFactory;
 import eu.etaxonomy.cdm.api.service.search.SearchResult;
 import eu.etaxonomy.cdm.api.service.search.SearchResultBuilder;
 import eu.etaxonomy.cdm.api.util.TaxonNamePartsFilter;
+import eu.etaxonomy.cdm.common.CdmUtils;
 import eu.etaxonomy.cdm.common.DoubleResult;
 import eu.etaxonomy.cdm.common.URI;
 import eu.etaxonomy.cdm.common.monitor.IProgressMonitor;
@@ -1311,23 +1312,32 @@ public class NameServiceImpl
     @Override
        public List<DoubleResult<TaxonNameParts, Integer>> findMatchingNames(String taxonName, int maxDistanceGenus, int maxDisEpith, int limit) {
 
-//        maxDistanceGenus=3;//the default value in Rees algorithm is 70% of the lenght.
+       //0. Normalizing and parsing
 
-        //1. name parsing.
+//     TODO Remove all qualifiers such as cf., aff., ?, <i>, x, etc.
 
-               TaxonName name = (TaxonName) NonViralNameParserImpl.NewInstance().parseFullName(taxonName);
+       TaxonName name = (TaxonName) NonViralNameParserImpl.NewInstance().parseFullName(taxonName);
                String genusQuery = name.getGenusOrUninomial();
                String epithetQuery = name.getSpecificEpithet();
+
+               //1. Genus pre-filter
+
                String initial= genusQuery.substring(0,1) + "*";
+               List<String> tempGenusList = dao.distinctGenusOrUninomial(initial, null, null); //list of all genera in the database starting with the initial letter of the query
+               List<String> genusList= new ArrayList <>(); // compare the length of query and the length of the database name. When the difference is less than the variable "limit", add the genus into the list
 
-               List<String> genusList = dao.distinctGenusOrUninomial(initial, null, null); //list
+               for (String x:tempGenusList) {
+                   if (Math.abs(x.length()-genusQuery.length())<=limit) {
+                       genusList.add(x);
+                   }
+               }
                List<DoubleResult<TaxonNameParts,Integer>> fullTaxonNamePartsList = new ArrayList<>();
 
                //2. comparison of:
 
                    //genus
                for (String genusNameInDB : genusList) {
-                   int distance = modifiedDamerauLevenshteinDistance(genusQuery, genusNameInDB);
+                   int distance = CdmUtils.modifiedDamerauLevenshteinDistance(genusQuery, genusNameInDB);
             if (distance <= maxDistanceGenus) {
                 List<TaxonNameParts> tempParts = dao.findTaxonNameParts(Optional.of(genusNameInDB),null, null, null, null, null, null, null, null);
                 for (TaxonNameParts namePart: tempParts) {
@@ -1339,14 +1349,13 @@ public class NameServiceImpl
                   //add epithet distance
                List <DoubleResult<TaxonNameParts, Integer>> epithetList = new ArrayList<>();
                for (DoubleResult<TaxonNameParts, Integer> part: fullTaxonNamePartsList) {
-                   int epithetDistance = modifiedDamerauLevenshteinDistance(epithetQuery, part.getFirstResult().getSpecificEpithet());
+                   int epithetDistance = CdmUtils.modifiedDamerauLevenshteinDistance(epithetQuery, part.getFirstResult().getSpecificEpithet());
             if (epithetDistance <= maxDisEpith) {
                 epithetList.add(part);
                 part.setSecondResult(part.getSecondResult() + epithetDistance)  ;
 //                tempMap.add(part, fullTaxonNamePartsList.get(part) + epithetDistance); // need to check how the final distance is calculated
             }else {
                 //do nothing
-
                 //tempMap.remove(part);
             }
                }
@@ -1359,43 +1368,4 @@ public class NameServiceImpl
 //                              Map.Entry::getKey, Map.Entry::getValue, (e1, e2) -> e1, LinkedHashMap::new));
         return epithetList.subList(0, Math.min(limit,epithetList.size()));
        }
-
-
-    public int modifiedDamerauLevenshteinDistance(String str1, String str2) {
-               if (str1 == str2) {
-                       return 0;
-               } else if (str1.isEmpty()) {
-                       return str2.length();
-               } else if (str2.isEmpty()) {
-                       return str1.length();
-               } else if (str2.length() == 1 && str1.length() == 1 && str1 != str2) {
-                       return 1;
-               } else {
-
-                       int[][] distanceMatrix = new int[str1.length() + 1][str2.length() + 1];
-
-                       for (int i = 0; i <= str1.length(); i++) {
-                               distanceMatrix[i][0] = i;
-                       }
-
-                       for (int j = 0; j <= str2.length(); j++) {
-                               distanceMatrix[0][j] = j;
-                       }
-
-                       for (int i = 1; i <= str1.length(); i++) {
-                               for (int j = 1; j <= str2.length(); j++) {
-                                       int cost = (str1.charAt(i - 1) == str2.charAt(j - 1)) ? 0 : 1;
-                                       distanceMatrix[i][j] = Math.min(
-                                                       Math.min(distanceMatrix[i - 1][j] + 1, distanceMatrix[i][j - 1] + 1),
-                                                       distanceMatrix[i - 1][j - 1] + cost);
-
-                                       if (i > 1 && j > 1 && str1.charAt(i - 1) == str2.charAt(j - 2)
-                                                       && str1.charAt(i - 2) == str2.charAt(j - 1)) {
-                                               distanceMatrix[i][j] = Math.min(distanceMatrix[i][j], distanceMatrix[i - 2][j - 2] + cost);
-                                       }
-                               }
-                       }
-                       return distanceMatrix[str1.length()][str2.length()];
-               }
-       }
 }
\ No newline at end of file
diff --git a/cdmlib-services/src/main/java/eu/etaxonomy/cdm/api/service/NameServiceImplementBelen.java b/cdmlib-services/src/main/java/eu/etaxonomy/cdm/api/service/NameServiceImplementBelen.java
new file mode 100644 (file)
index 0000000..8f06a5f
--- /dev/null
@@ -0,0 +1,76 @@
+package eu.etaxonomy.cdm.api.service;
+
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * Helpers that prepare a query name and a database name for comparison.
+ * The class holds no state; every call works on its arguments only.
+ */
+public class NameServiceImplementBelen {
+
+    /**
+     * Pairs of {initial characters, replacement} used by
+     * {@link #replaceInitialCharacter(String)}. The first matching pair wins.
+     */
+    private static final String[][] INITIAL_PHONETIC_CHANGES = {
+            {"ae", "e"}, {"cn", "n"}, {"ct", "t"}, {"cz", "c"},
+            {"dj", "d"}, {"ea", "e"}, {"eu", "u"}, {"gn", "n"},
+            {"kn", "n"}, {"mc", "mac"}, {"mn", "n"}, {"oe", "e"},
+            {"qu", "q"}, {"ph", "f"}, {"ps", "s"}, {"pt", "t"},
+            {"ts", "s"}, {"wr", "r"}, {"x", "z"}
+    };
+
+    /**
+     * Applies a phonetic change to the initial characters of a name only.
+     * The name is lower-cased first.
+     *
+     * @param inp the name to convert
+     * @return the lower-cased name with its initial characters replaced if
+     *         they match one of {@link #INITIAL_PHONETIC_CHANGES}; otherwise
+     *         the lower-cased name unchanged
+     * @throws NullPointerException if <code>inp</code> is <code>null</code>
+     */
+    public String replaceInitialCharacter(String inp) {
+        // Locale.ROOT keeps the result independent of the default locale of the JVM
+        String input = inp.toLowerCase(java.util.Locale.ROOT);
+        for (String[] change : INITIAL_PHONETIC_CHANGES) {
+            String initial = change[0];
+            if (input.startsWith(initial)) {
+                return change[1] + input.substring(initial.length());
+            }
+        }
+        // no phonetic change applies: keep the name instead of dropping it
+        return input;
+    }
+
+    /**
+     * Removes the characters that the query and the database name have in
+     * common at their start and, from what is left, the characters they
+     * have in common at their end.
+     *
+     * @param inputName the query name
+     * @param databaseName the name from the database
+     * @return a list with exactly one entry: the remainder of the query
+     *         name, a blank, and the remainder of the database name
+     * @throws NullPointerException if an argument is <code>null</code>
+     */
+    public List<String> trimCommonChar(String inputName, String databaseName) {
+
+        // length of the common leading part
+        int comparableLength = Math.min(inputName.length(), databaseName.length());
+        int leading = 0;
+        while (leading < comparableLength && inputName.charAt(leading) == databaseName.charAt(leading)) {
+            leading++;
+        }
+        String tempInputName = inputName.substring(leading);
+        String tempDatabaseName = databaseName.substring(leading);
+
+        // length of the common trailing part of what is left
+        int inputLength = tempInputName.length();
+        int databaseLength = tempDatabaseName.length();
+        int shortestLength = Math.min(inputLength, databaseLength);
+        int trailing = 0;
+        while (trailing < shortestLength
+                && tempInputName.charAt(inputLength - trailing - 1)
+                        == tempDatabaseName.charAt(databaseLength - trailing - 1)) {
+            trailing++;
+        }
+
+        // local results: nothing is left over from an earlier call and the
+        // remainders are defined even if no trailing character matches
+        String shortenedInputName = tempInputName.substring(0, inputLength - trailing);
+        String shortenedDatabaseName = tempDatabaseName.substring(0, databaseLength - trailing);
+
+        List<String> list = new ArrayList<>();
+        list.add(shortenedInputName + " " + shortenedDatabaseName);
+        return list;
+    }
+}
index cd205d6389babaabb6f4e16d33517251ebf674fd..e80834e80e233243f623ab14e9d2d61a504af7f5 100644 (file)
@@ -1099,20 +1099,6 @@ public class NameServiceImplTest extends CdmTransactionalIntegrationTest {
     @Override
     public void createTestDataSet() throws FileNotFoundException {}
 
-    @Test
-    public void testmodifiedDamerauLevenshteinDistance() {
-               NameServiceImpl name = new NameServiceImpl();
-       int distance = name.modifiedDamerauLevenshteinDistance("Gynoxys asterotricha", "Gynxya asrerotciha");
-       assertEquals(5,distance);
-       distance = name.modifiedDamerauLevenshteinDistance("Gynoxys asterotricha", "Gynxsa axrerotciha");
-       assertEquals(7,distance);
-       distance = name.modifiedDamerauLevenshteinDistance("Gynoxys asterotricha", "Gynxyas asrerotciha");
-       assertEquals(5,distance);
-       distance = name.modifiedDamerauLevenshteinDistance("Gynoxys asterotricha", "Gynoxya asterotricha");
-       assertEquals(1,distance);
-       distance = name.modifiedDamerauLevenshteinDistance("Gynoxys asterotricha", "Gynoxys asterotricha");
-       assertEquals(0,distance);
-    }
 
     @Test
     @DataSet(loadStrategy=CleanSweepInsertLoadStrategy.class, value="NameServiceImplTest.testFindMatchingNames.xml")
diff --git a/cdmlib-services/src/test/java/eu/etaxonomy/cdm/api/service/NameServiceImplementBelenTest.java b/cdmlib-services/src/test/java/eu/etaxonomy/cdm/api/service/NameServiceImplementBelenTest.java
new file mode 100644 (file)
index 0000000..e7a4940
--- /dev/null
@@ -0,0 +1,47 @@
+/**
+* Copyright (C) 2023 EDIT
+* European Distributed Institute of Taxonomy
+* http://www.e-taxonomy.eu
+*
+* The contents of this file are subject to the Mozilla Public License Version 1.1
+* See LICENSE.TXT at the top of this package for the full license terms.
+*/
+package eu.etaxonomy.cdm.api.service;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+/**
+ * @author andreabee90
+ * @since 30.05.2023
+ */
+public class NameServiceImplementBelenTest {
+
+    /** The initial characters of each name are replaced phonetically. */
+    @Test
+    public void testReplaceInitialCharacter() {
+        NameServiceImplementBelen service = new NameServiceImplementBelen();
+
+        // {name, expected result}
+        String[][] cases = {
+                {"euphorbia", "uphorbia"},
+                {"Cnemidia", "nemidia"},
+                {"Gnaphalium", "naphalium"},
+                {"Philodendron", "filodendron"},
+                {"Tsuga", "suga"},
+                {"Czerniaevia", "cerniaevia"}
+        };
+        for (String[] testCase : cases) {
+            Assert.assertEquals(testCase[1], service.replaceInitialCharacter(testCase[0]));
+        }
+    }
+
+    /** Common leading and trailing characters are removed from both names. */
+    @Test
+    public void testTrimCommonChar() {
+        NameServiceImplementBelen service = new NameServiceImplementBelen();
+        String query = "this is a query string";
+        String document = "this is a database string";
+
+        // the single list entry holds both remainders separated by a blank
+        String[] remainders = service.trimCommonChar(query, document).get(0).split(" ");
+
+        Assert.assertEquals("query", remainders[0]);
+        Assert.assertEquals("database", remainders[1]);
+    }
+}