public static boolean isNullSafeEmpty(Collection<?> collection) {
return collection == null || collection.isEmpty();
}
+
+    /**
+     * Computes the edit distance between two strings where insertion, deletion
+     * and substitution of a single character as well as the transposition of
+     * two adjacent characters each count as one edit (restricted
+     * Damerau-Levenshtein / optimal string alignment distance).
+     * <p>
+     * Strings are compared by content. A <code>null</code> argument is handled
+     * like the empty string.
+     *
+     * @param str1 the first string, may be <code>null</code>
+     * @param str2 the second string, may be <code>null</code>
+     * @return the distance between both strings, 0 if they have equal content
+     */
+    public static int modifiedDamerauLevenshteinDistance(String str1, String str2) {
+        String first = (str1 == null) ? "" : str1;
+        String second = (str2 == null) ? "" : str2;
+
+        // compare content, not references: equal strings need not be the same object
+        if (first.equals(second)) {
+            return 0;
+        }
+        if (first.isEmpty()) {
+            return second.length();
+        }
+        if (second.isEmpty()) {
+            return first.length();
+        }
+
+        int length1 = first.length();
+        int length2 = second.length();
+
+        // distanceMatrix[i][j] = distance between the first i characters of
+        // "first" and the first j characters of "second"
+        int[][] distanceMatrix = new int[length1 + 1][length2 + 1];
+
+        for (int i = 0; i <= length1; i++) {
+            distanceMatrix[i][0] = i;
+        }
+
+        for (int j = 0; j <= length2; j++) {
+            distanceMatrix[0][j] = j;
+        }
+
+        for (int i = 1; i <= length1; i++) {
+            for (int j = 1; j <= length2; j++) {
+                int cost = (first.charAt(i - 1) == second.charAt(j - 1)) ? 0 : 1;
+
+                // deletion, insertion, substitution
+                distanceMatrix[i][j] = Math.min(
+                        Math.min(distanceMatrix[i - 1][j] + 1, distanceMatrix[i][j - 1] + 1),
+                        distanceMatrix[i - 1][j - 1] + cost);
+
+                // transposition of the two adjacent characters
+                if (i > 1 && j > 1 && first.charAt(i - 1) == second.charAt(j - 2)
+                        && first.charAt(i - 2) == second.charAt(j - 1)) {
+                    distanceMatrix[i][j] = Math.min(distanceMatrix[i][j], distanceMatrix[i - 2][j - 2] + cost);
+                }
+            }
+        }
+        return distanceMatrix[length1][length2];
+    }
}
--- /dev/null
+package eu.etaxonomy.cdm.common;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+
+public class CdmUtilsBelen {
+
+    /**
+     * Character combinations that sound alike and their replacement,
+     * applied in this order by {@link #soundalike(String)}.
+     */
+    private static final String[][] SOUNDALIKES = {
+            {"ae", "e"},
+            {"ia", "a"},
+            {"oe", "i"},
+            {"oi", "a"},
+            {"sc", "s"}
+    };
+
+    /** Two-letter endings that {@link #replacerGenderEnding(String)} replaces by "a". */
+    private static final String[] GENDER_ENDINGS = { "is", "us", "ys", "es", "im", "as", "um", "os" };
+
+    /**
+     * Collapses every run of whitespace into a single blank and removes
+     * leading and trailing whitespace.
+     *
+     * @param inputName the name to clean, must not be <code>null</code>
+     * @return the name with normalized whitespace
+     */
+    public static String deleteEmptySpaces(String inputName) {
+        return inputName.replaceAll("\\s+", " ").trim();
+    }
+
+    /**
+     * Replaces accented lower-case letters by their unaccented ASCII letter.
+     * Upper-case letters and all other characters are left unchanged.
+     *
+     * @param str the string to convert, must not be <code>null</code>
+     * @return the string with the listed characters replaced
+     */
+    public static String replaceSpecialCharacters(String str) {
+        String output = str.replaceAll("[áåâãàä]", "a");
+        output = output.replaceAll("[éêèë]", "e");
+        output = output.replaceAll("[ôõøòóö]", "o");
+        output = output.replaceAll("[ìíîï]", "i");
+        output = output.replaceAll("[üûúù]", "u");
+        output = output.replaceAll("ñ", "n");
+        output = output.replaceAll("ç", "c");
+        return output;
+    }
+
+    /**
+     * Returns a new list holding the lower-case form of every element.
+     * The conversion is locale independent, so the result does not depend
+     * on the default locale of the JVM.
+     *
+     * @param list the strings to convert, must not be <code>null</code> nor
+     *             contain <code>null</code>
+     * @return a new list of the lower-cased strings in the same order
+     */
+    public static List<String> listToLowerCase(List<String> list) {
+        List<String> lowerCaseList = new ArrayList<>(list.size());
+        for (String element : list) {
+            lowerCaseList.add(element.toLowerCase(java.util.Locale.ROOT));
+        }
+        return lowerCaseList;
+    }
+
+    /**
+     * Replaces character combinations that sound similar by a common
+     * representation. The replacements are applied one after another,
+     * each on the result of the previous one.
+     *
+     * @param inputName the name to convert, must not be <code>null</code>
+     * @return the converted name
+     */
+    public static String soundalike(String inputName) {
+        String result = inputName;
+        for (String[] pair : SOUNDALIKES) {
+            result = result.replace(pair[0], pair[1]);
+        }
+        return result;
+    }
+
+    /**
+     * Reduces every run of the same consecutive character to a single
+     * occurrence of that character.
+     *
+     * @param input the string to reduce, must not be <code>null</code>,
+     *              may be empty
+     * @return the string without consecutive duplicates
+     */
+    public static String removeDuplicate(String input) {
+        StringBuilder output = new StringBuilder(input.length());
+        for (int i = 0; i < input.length(); i++) {
+            char current = input.charAt(i);
+            // keep a character only if it starts a new run
+            if (i == 0 || current != input.charAt(i - 1)) {
+                output.append(current);
+            }
+        }
+        return output.toString();
+    }
+
+    /**
+     * Normalizes the ending of a name by replacing one of the two-letter
+     * endings "is", "us", "ys", "es", "im", "as", "um" or "os" by "a".
+     * The comparison is case sensitive. Input shorter than two characters
+     * or with another ending is returned unchanged.
+     *
+     * @param input the name to normalize, must not be <code>null</code>
+     * @return the name with normalized ending
+     */
+    public static String replacerGenderEnding(String input) {
+        if (input.length() < 2) {
+            // too short to carry one of the two-letter endings
+            return input;
+        }
+        int endingStart = input.length() - 2;
+        String ending = input.substring(endingStart);
+        for (String genderEnding : GENDER_ENDINGS) {
+            if (ending.equals(genderEnding)) {
+                return input.substring(0, endingStart) + "a";
+            }
+        }
+        return input;
+    }
+}
--- /dev/null
+/**
+* Copyright (C) 2023 EDIT
+* European Distributed Institute of Taxonomy
+* http://www.e-taxonomy.eu
+*
+* The contents of this file are subject to the Mozilla Public License Version 1.1
+* See LICENSE.TXT at the top of this package for the full license terms.
+*/
+package eu.etaxonomy.cdm.common;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+/**
+ * @author andreabee90
+ * @since 30.05.2023
+ */
+public class CdmUtilsBelenTest {
+
+    /** Runs of whitespace are collapsed and surrounding whitespace is removed. */
+    @Test
+    public void testDeleteEmptySpaces() {
+        String name = "  Quercus    robur  ";
+        Assert.assertEquals("Quercus robur", CdmUtilsBelen.deleteEmptySpaces(name));
+    }
+
+    /** Accented lower-case letters are mapped to their ASCII letter. */
+    @Test
+    public void testReplaceSpecialCharacters() {
+        String name = "áåâãàêèëôõøòóöìíîïüûúùñç";
+        Assert.assertEquals("aaaaaeeeooooooiiiiuuuunc", CdmUtilsBelen.replaceSpecialCharacters(name));
+    }
+
+    /** Every element is lower-cased and the order is kept. */
+    @Test
+    public void testListToLowerCase() {
+        List<String> testList = new ArrayList<>();
+        testList.add("NAME 1");
+        testList.add("nAmE 2");
+
+        List<String> result = CdmUtilsBelen.listToLowerCase(testList);
+        Assert.assertEquals(2, result.size());
+        Assert.assertEquals("name 1", result.get(0));
+        Assert.assertEquals("name 2", result.get(1));
+    }
+
+    /** Each soundalike combination is replaced by its common representation. */
+    @Test
+    public void testSoundalike() {
+        String name = "ae ia oe oi sc";
+        Assert.assertEquals("e a i a s", CdmUtilsBelen.soundalike(name));
+    }
+
+    /** Consecutive duplicates are reduced to one character. */
+    @Test
+    public void testRemoveDuplicate() {
+        String name = "thiiss iss aa striiiing with duupliccaaaatess";
+        Assert.assertEquals("this is a string with duplicates", CdmUtilsBelen.removeDuplicate(name));
+    }
+
+    /** All supported endings are replaced by "a", other endings are kept. */
+    @Test
+    public void testReplacerGenderEnding() {
+        String[] endings = { "is", "us", "ys", "es", "im", "as", "um", "os" };
+        for (String ending : endings) {
+            Assert.assertEquals("ending '" + ending + "'", "a", CdmUtilsBelen.replacerGenderEnding(ending));
+        }
+
+        // the stem before the ending is kept
+        Assert.assertEquals("alba", CdmUtilsBelen.replacerGenderEnding("albus"));
+        Assert.assertEquals("alba", CdmUtilsBelen.replacerGenderEnding("album"));
+        // an unsupported ending leaves the name unchanged
+        Assert.assertEquals("alba", CdmUtilsBelen.replacerGenderEnding("alba"));
+    }
+}
*/
package eu.etaxonomy.cdm.common;
+import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotNull;
import java.io.IOException;
Assert.assertEquals("Str1;Str3", CdmUtils.concat(";", str1, "", str3));
Assert.assertEquals("Str1; ;Str3", CdmUtils.concat(";", str1, " ", str3));
}
+
+    /**
+     * Checks the distance between a reference name and several misspelled
+     * variants of it, down to the identical name.
+     */
+    @Test
+    public void testmodifiedDamerauLevenshteinDistance() {
+        String reference = "Gynoxys asterotricha";
+
+        assertEquals(5, CdmUtils.modifiedDamerauLevenshteinDistance(reference, "Gynxya asrerotciha"));
+        assertEquals(7, CdmUtils.modifiedDamerauLevenshteinDistance(reference, "Gynxsa axrerotciha"));
+        assertEquals(5, CdmUtils.modifiedDamerauLevenshteinDistance(reference, "Gynxyas asrerotciha"));
+        assertEquals(1, CdmUtils.modifiedDamerauLevenshteinDistance(reference, "Gynoxya asterotricha"));
+        assertEquals(0, CdmUtils.modifiedDamerauLevenshteinDistance(reference, "Gynoxys asterotricha"));
+    }
}
\ No newline at end of file
import eu.etaxonomy.cdm.api.service.search.SearchResult;
import eu.etaxonomy.cdm.api.service.search.SearchResultBuilder;
import eu.etaxonomy.cdm.api.util.TaxonNamePartsFilter;
+import eu.etaxonomy.cdm.common.CdmUtils;
import eu.etaxonomy.cdm.common.DoubleResult;
import eu.etaxonomy.cdm.common.URI;
import eu.etaxonomy.cdm.common.monitor.IProgressMonitor;
@Override
public List<DoubleResult<TaxonNameParts, Integer>> findMatchingNames(String taxonName, int maxDistanceGenus, int maxDisEpith, int limit) {
-// maxDistanceGenus=3;//the default value in Rees algorithm is 70% of the lenght.
+ //0. Normalizing and parsing
- //1. name parsing.
+// TODO Remove all qualifiers such as cf., aff., ?, <i>, x, etc.
- TaxonName name = (TaxonName) NonViralNameParserImpl.NewInstance().parseFullName(taxonName);
+ TaxonName name = (TaxonName) NonViralNameParserImpl.NewInstance().parseFullName(taxonName);
String genusQuery = name.getGenusOrUninomial();
String epithetQuery = name.getSpecificEpithet();
+
+ //1. Genus pre-filter
+
String initial= genusQuery.substring(0,1) + "*";
+ List<String> tempGenusList = dao.distinctGenusOrUninomial(initial, null, null); //list of all genera in the database starting with the initial letter of the query
+ List<String> genusList= new ArrayList <>(); // compare the length of query and the length of the database name. When the difference is less than the variable "limit", add the genus into the list
- List<String> genusList = dao.distinctGenusOrUninomial(initial, null, null); //list
+ for (String x:tempGenusList) {
+ if (Math.abs(x.length()-genusQuery.length())<=limit) {
+ genusList.add(x);
+ }
+ }
List<DoubleResult<TaxonNameParts,Integer>> fullTaxonNamePartsList = new ArrayList<>();
//2. comparison of:
//genus
for (String genusNameInDB : genusList) {
- int distance = modifiedDamerauLevenshteinDistance(genusQuery, genusNameInDB);
+ int distance = CdmUtils.modifiedDamerauLevenshteinDistance(genusQuery, genusNameInDB);
if (distance <= maxDistanceGenus) {
List<TaxonNameParts> tempParts = dao.findTaxonNameParts(Optional.of(genusNameInDB),null, null, null, null, null, null, null, null);
for (TaxonNameParts namePart: tempParts) {
//add epithet distance
List <DoubleResult<TaxonNameParts, Integer>> epithetList = new ArrayList<>();
for (DoubleResult<TaxonNameParts, Integer> part: fullTaxonNamePartsList) {
- int epithetDistance = modifiedDamerauLevenshteinDistance(epithetQuery, part.getFirstResult().getSpecificEpithet());
+ int epithetDistance = CdmUtils.modifiedDamerauLevenshteinDistance(epithetQuery, part.getFirstResult().getSpecificEpithet());
if (epithetDistance <= maxDisEpith) {
epithetList.add(part);
part.setSecondResult(part.getSecondResult() + epithetDistance) ;
// tempMap.add(part, fullTaxonNamePartsList.get(part) + epithetDistance); // need to check how the final distance is calculated
}else {
//do nothing
-
//tempMap.remove(part);
}
}
// Map.Entry::getKey, Map.Entry::getValue, (e1, e2) -> e1, LinkedHashMap::new));
return epithetList.subList(0, Math.min(limit,epithetList.size()));
}
-
-
- public int modifiedDamerauLevenshteinDistance(String str1, String str2) {
- if (str1 == str2) {
- return 0;
- } else if (str1.isEmpty()) {
- return str2.length();
- } else if (str2.isEmpty()) {
- return str1.length();
- } else if (str2.length() == 1 && str1.length() == 1 && str1 != str2) {
- return 1;
- } else {
-
- int[][] distanceMatrix = new int[str1.length() + 1][str2.length() + 1];
-
- for (int i = 0; i <= str1.length(); i++) {
- distanceMatrix[i][0] = i;
- }
-
- for (int j = 0; j <= str2.length(); j++) {
- distanceMatrix[0][j] = j;
- }
-
- for (int i = 1; i <= str1.length(); i++) {
- for (int j = 1; j <= str2.length(); j++) {
- int cost = (str1.charAt(i - 1) == str2.charAt(j - 1)) ? 0 : 1;
- distanceMatrix[i][j] = Math.min(
- Math.min(distanceMatrix[i - 1][j] + 1, distanceMatrix[i][j - 1] + 1),
- distanceMatrix[i - 1][j - 1] + cost);
-
- if (i > 1 && j > 1 && str1.charAt(i - 1) == str2.charAt(j - 2)
- && str1.charAt(i - 2) == str2.charAt(j - 1)) {
- distanceMatrix[i][j] = Math.min(distanceMatrix[i][j], distanceMatrix[i - 2][j - 2] + cost);
- }
- }
- }
- return distanceMatrix[str1.length()][str2.length()];
- }
- }
}
\ No newline at end of file
--- /dev/null
+package eu.etaxonomy.cdm.api.service;
+
+import java.util.ArrayList;
+import java.util.List;
+
+public class NameServiceImplementBelen {
+
+    /**
+     * Initial character sequences and their phonetic replacement, used by
+     * {@link #replaceInitialCharacter(String)}.
+     */
+    private static final String[][] INITIAL_PHONETIC_CHANGES = {
+            {"ae","e"},{"cn","n"},{"ct","t"},{"cz","c"},
+            {"dj","d"},{"ea","e"},{"eu","u"},{"gn","n"},
+            {"kn","n"},{"mc","mac"},{"mn","n"},{"oe","e"},
+            {"qu","q"},{"ph","f"},{"ps","s"},{"pt","t"},
+            {"ts","s"},{"wr","r"},{"x","z"}
+    };
+
+    /**
+     * Applies a phonetic change to the initial characters of a name. The name
+     * is lower-cased first (locale independent); the first rule whose key is
+     * a prefix of the name is applied. The rest of the name is not changed.
+     *
+     * @param inp the name to convert, must not be <code>null</code>
+     * @return the lower-cased name with its initial characters replaced, or
+     *         the lower-cased name itself if no rule applies
+     */
+    public String replaceInitialCharacter(String inp) {
+        String input = inp.toLowerCase(java.util.Locale.ROOT);
+        for (String[] change : INITIAL_PHONETIC_CHANGES) {
+            if (input.startsWith(change[0])) {
+                return change[1] + input.substring(change[0].length());
+            }
+        }
+        // no rule applies: keep the name instead of discarding it
+        return input;
+    }
+
+    /**
+     * Removes the leading characters common to both names and afterwards the
+     * trailing characters common to the remainders.
+     *
+     * @param inputName the query name, must not be <code>null</code>
+     * @param databaseName the name from the database, must not be <code>null</code>
+     * @return a list with exactly one element: the trimmed query name and the
+     *         trimmed database name separated by a single blank
+     */
+    public List<String> trimCommonChar(String inputName, String databaseName) {
+
+        // length of the common leading part
+        int shortestName = Math.min(inputName.length(), databaseName.length());
+        int commonPrefix = 0;
+        while (commonPrefix < shortestName
+                && inputName.charAt(commonPrefix) == databaseName.charAt(commonPrefix)) {
+            commonPrefix++;
+        }
+
+        String remainingInput = inputName.substring(commonPrefix);
+        String remainingDatabase = databaseName.substring(commonPrefix);
+
+        // length of the common trailing part of the remainders
+        int inputLength = remainingInput.length();
+        int databaseLength = remainingDatabase.length();
+        int shortestRemainder = Math.min(inputLength, databaseLength);
+        int commonSuffix = 0;
+        while (commonSuffix < shortestRemainder
+                && remainingInput.charAt(inputLength - commonSuffix - 1)
+                        == remainingDatabase.charAt(databaseLength - commonSuffix - 1)) {
+            commonSuffix++;
+        }
+
+        // also correct without any common suffix: the remainders are returned as they are
+        String shortenedInput = remainingInput.substring(0, inputLength - commonSuffix);
+        String shortenedDatabase = remainingDatabase.substring(0, databaseLength - commonSuffix);
+
+        List<String> list = new ArrayList<>();
+        list.add(shortenedInput + " " + shortenedDatabase);
+        return list;
+    }
+}
@Override
public void createTestDataSet() throws FileNotFoundException {}
- @Test
- public void testmodifiedDamerauLevenshteinDistance() {
- NameServiceImpl name = new NameServiceImpl();
- int distance = name.modifiedDamerauLevenshteinDistance("Gynoxys asterotricha", "Gynxya asrerotciha");
- assertEquals(5,distance);
- distance = name.modifiedDamerauLevenshteinDistance("Gynoxys asterotricha", "Gynxsa axrerotciha");
- assertEquals(7,distance);
- distance = name.modifiedDamerauLevenshteinDistance("Gynoxys asterotricha", "Gynxyas asrerotciha");
- assertEquals(5,distance);
- distance = name.modifiedDamerauLevenshteinDistance("Gynoxys asterotricha", "Gynoxya asterotricha");
- assertEquals(1,distance);
- distance = name.modifiedDamerauLevenshteinDistance("Gynoxys asterotricha", "Gynoxys asterotricha");
- assertEquals(0,distance);
- }
@Test
@DataSet(loadStrategy=CleanSweepInsertLoadStrategy.class, value="NameServiceImplTest.testFindMatchingNames.xml")
--- /dev/null
+/**
+* Copyright (C) 2023 EDIT
+* European Distributed Institute of Taxonomy
+* http://www.e-taxonomy.eu
+*
+* The contents of this file are subject to the Mozilla Public License Version 1.1
+* See LICENSE.TXT at the top of this package for the full license terms.
+*/
+package eu.etaxonomy.cdm.api.service;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+/**
+ * @author andreabee90
+ * @since 30.05.2023
+ */
+public class NameServiceImplementBelenTest {
+
+    /** Names starting with a phonetic combination get their initial characters replaced. */
+    @Test
+    public void testReplaceInitialCharacter() {
+        NameServiceImplementBelen service = new NameServiceImplementBelen();
+
+        // {name, expected result}
+        String[][] namesAndExpected = {
+                {"euphorbia", "uphorbia"},
+                {"Cnemidia", "nemidia"},
+                {"Gnaphalium", "naphalium"},
+                {"Philodendron", "filodendron"},
+                {"Tsuga", "suga"},
+                {"Czerniaevia", "cerniaevia"}
+        };
+        for (String[] nameAndExpected : namesAndExpected) {
+            Assert.assertEquals(nameAndExpected[1], service.replaceInitialCharacter(nameAndExpected[0]));
+        }
+    }
+
+    /** Common leading and trailing characters of query and document are removed. */
+    @Test
+    public void testTrimCommonChar() {
+        NameServiceImplementBelen service = new NameServiceImplementBelen();
+        String query = "this is a query string";
+        String document = "this is a database string";
+
+        // the single list element holds both trimmed names separated by a blank
+        String[] trimmedNames = service.trimCommonChar(query, document).get(0).split(" ");
+        Assert.assertEquals("query", trimmedNames[0]);
+        Assert.assertEquals("database", trimmedNames[1]);
+    }
+}