public static boolean isNullSafeEmpty(Collection<?> collection) {
return collection == null || collection.isEmpty();
}
-
- public static int modifiedDamerauLevenshteinDistance(String str1, String str2) {
- if (str1 == str2) {
- return 0;
- } else if (str1.isEmpty()) {
- return str2.length();
- } else if (str2.isEmpty()) {
- return str1.length();
- } else if (str2.length() == 1 && str1.length() == 1 && str1 != str2) {
- return 1;
- } else {
-
- int[][] distanceMatrix = new int[str1.length() + 1][str2.length() + 1];
-
- for (int i = 0; i <= str1.length(); i++) {
- distanceMatrix[i][0] = i;
- }
-
- for (int j = 0; j <= str2.length(); j++) {
- distanceMatrix[0][j] = j;
- }
-
- for (int i = 1; i <= str1.length(); i++) {
- for (int j = 1; j <= str2.length(); j++) {
- int cost = (str1.charAt(i - 1) == str2.charAt(j - 1)) ? 0 : 1;
- distanceMatrix[i][j] = Math.min(
- Math.min(distanceMatrix[i - 1][j] + 1, distanceMatrix[i][j - 1] + 1),
- distanceMatrix[i - 1][j - 1] + cost);
-
- if (i > 1 && j > 1 && str1.charAt(i - 1) == str2.charAt(j - 2)
- && str1.charAt(i - 2) == str2.charAt(j - 1)) {
- distanceMatrix[i][j] = Math.min(distanceMatrix[i][j], distanceMatrix[i - 2][j - 2] + cost);
- }
- }
- }
- return distanceMatrix[str1.length()][str2.length()];
- }
- }
}
+++ /dev/null
-package eu.etaxonomy.cdm.common;
-
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.List;
-
-public class CdmUtilsBelen {
-
-// Trim white spaces
-
- public static String deleteEmptySpaces(String inputName) {
- String outputName = inputName.replaceAll("\\s+", " ").trim();
- return outputName;
- }
-
-// Replace characters with ASCII characters
-
- public static String replaceSpecialCharacters(String str) {
- String output;
- output = str.replaceAll("[áåâãàä]", "a");
- output = output.replaceAll("[éêèë]", "e");
- output = output.replaceAll("[ôõøòóö]", "o");
- output = output.replaceAll("[ìíîï]", "i");
- output = output.replaceAll("[üûúù]", "u");
- output = output.replaceAll("ñ", "n");
- output = output.replaceAll("ç", "c");
- return output;
- }
-
-// Change lists to lowercase
-
- public static List <String> listToLowerCase(List<String> List) {
- List <String> lowerCaseList = new ArrayList<>();
- for (String x : List) {
- lowerCaseList.add(x.toLowerCase());
- }
- return lowerCaseList ;
- }
-
-// Replace characters combinations that sound similar
-
- public static String soundalike(String inputName) {
- String[][] soundalike = {
- {"ae","e"},
- {"ia","a"},
- {"oe", "i"},
- {"oi", "a"},
- {"sc", "s"}
- };
- for (int i = 0 ; i<soundalike.length;i++) {
- if (inputName.contains(soundalike[i][0])) {
- inputName = inputName.replace(soundalike[i][0],soundalike[i][1]);
- }
- }
- return inputName;
- }
-
-// Remove duplicated letters
-
- public static String removeDuplicate(String input) {
- char [] temp= input.toCharArray();
- int lenght=temp.length;
-
- int index = 0;
- int p;
- for (int i = 0; i < lenght- 1; i++) {
- p = i + 1;
- if (!(temp[i] == temp[p])) {
- temp[index++] = temp[i];
- }
- }
- String output = String.valueOf(Arrays.copyOf(temp, index));
- output= output+ temp[lenght- 1];
- return output;
- }
-
-// normalize ending ignoring gender issues
-
- public static String replacerGenderEnding(String input) {
-
- String firstPart= input.substring(0, input.length() - 2);
- String lastTwoChar = input.substring((input.length() - 2), input.length());
- String[] endingChar = new String[] { "is", "us", "ys", "es", "im", "as", "um", "os" };
- for (String i : endingChar) {
- if (lastTwoChar.contains(i)) {
- lastTwoChar = lastTwoChar.replace(i, "a");
- }
- }
- String output = firstPart + lastTwoChar;
- return output;
- }
-
- public static String normalize(String str) {
- String result;
-
- result=CdmUtilsBelen.replaceSpecialCharacters(str);
- result=CdmUtilsBelen.soundalike(result);
- result=CdmUtilsBelen.replacerGenderEnding(result);
-// tempGenus =NameServiceImplementBelen.replaceInitialCharacter(tempGenus);
- result=CdmUtilsBelen.removeDuplicate(result);
-
- return result;
- }
-}
--- /dev/null
+package eu.etaxonomy.cdm.common;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+
+public class NameMatchingUtils {
+
+    /**
+     * Upper-cases a name string and removes elements that are irrelevant for
+     * name matching: a bracketed second token (e.g. a subgenus like
+     * <code>(something)</code>) and the qualifiers aff., subsp., sp., spp. and cf.
+     * (each with or without trailing dot) when they stand between two other tokens.
+     * <BR>
+     * Strings consisting of less than 2 tokens are only upper-cased and
+     * whitespace-normalized.
+     *
+     * @param str the name string, must not be <code>null</code>
+     * @return the upper-cased string without the extra elements
+     */
+    public static String removeExtraElements(String str) {
+        final String[] extraElements = {
+                " AFF. ", " AFF ", " SUBSP. ", " SUBSP ",
+                " SP ", " SP. ", " SPP. ", " SPP ",
+                " CF. ", " CF "};
+
+        // normalize whitespace first so tokens are separated by exactly one blank
+        str = deleteEmptySpaces(str.toUpperCase());
+
+        String[] tokens = str.split(" ");
+        // guard against single token input which has no second token to inspect
+        if (tokens.length > 1 && tokens[1].contains("(")) {
+            // cut out only the second token itself, not other occurrences of the same text
+            int secondTokenStart = tokens[0].length() + 1;
+            str = str.substring(0, secondTokenStart)
+                    + str.substring(secondTokenStart + tokens[1].length());
+            str = deleteEmptySpaces(str);
+        }
+
+        for (String extraElement : extraElements) {
+            str = str.replace(extraElement, " ");
+        }
+        return str;
+    }
+
+// Delete HTML ampersand
+
+ public static String removeHTMLAmpersand (String str) {
+ if (str == null || str.isEmpty() || str.trim().isEmpty()) {
+ return "";
+ } else {
+ str = str.trim();
+
+ if (str.contains("&")) {
+ str = str.replace("&", "&");
+ }
+ if (str.contains("&")) {
+ str = str.replace("&", "&");
+ }
+
+ if (str.contains("<") && str.contains(">")) {
+ String firstStrPart = null;
+ String secondStrPart = null;
+
+ while (str.contains("<") && str.contains(">")) {
+ firstStrPart = str.substring(0, str.indexOf("<"));
+ secondStrPart = str.substring(str.indexOf(">") + 1);
+ str= (firstStrPart + " " + secondStrPart).replace(" ", " ");
+ }
+ }
+ return str;
+ }
+ }
+    /**
+     * Replaces every run of whitespace by a single blank and removes
+     * leading and trailing whitespace.
+     *
+     * @param inputName the string to clean, must not be <code>null</code>
+     * @return the whitespace-normalized string
+     */
+    public static String deleteEmptySpaces(String inputName) {
+        return inputName.replaceAll("\\s+", " ").trim();
+    }
+
+    /**
+     * Upper-cases the given string and replaces the accented characters
+     * listed below by their plain ASCII base character.
+     *
+     * @param str the string to convert, must not be <code>null</code>
+     * @return the upper-cased string with the listed characters replaced
+     */
+    public static String replaceSpecialCharacters(String str) {
+        // {character class, replacement}, applied in the given order
+        final String[][] replacements = {
+                {"[ÁÅÂÃÀÄ]", "A"},
+                {"[ÉÊÈË]", "E"},
+                {"[ÔÕØÒÓÖ]", "O"},
+                {"[ÌÍÎÏ]", "I"},
+                {"[ÜÛÚÙ]", "U"},
+                {"Ñ", "N"},
+                {"Ç", "C"}
+        };
+        String result = str.toUpperCase();
+        for (String[] replacement : replacements) {
+            result = result.replaceAll(replacement[0], replacement[1]);
+        }
+        return result;
+    }
+
+    /**
+     * Creates a new list holding the upper-cased version of each element
+     * of the given list, in the same order. The given list is not modified.
+     *
+     * @param List the strings to convert
+     * @return a new list with the upper-cased strings
+     */
+    public static List <String> listToUpperCase(List<String> List) {
+        List<String> converted = new ArrayList<>();
+        List.forEach(element -> converted.add(element.toUpperCase()));
+        return converted;
+    }
+
+    /**
+     * Upper-cases the given string and applies a phonetic replacement to its
+     * beginning ONLY. The rules are checked in the order given below and only
+     * the first rule that matches is applied.
+     *
+     * @param input the string to convert, must not be <code>null</code>
+     * @return the upper-cased string with the initial characters replaced
+     */
+    public static String replaceInitialCharacter(String input) {
+        // {initial characters, replacement}
+        final String[][] initialReplacements = {
+                {"AE", "E"},
+                {"CN", "N"},
+                {"CT", "T"},
+                {"CZ", "C"},
+                {"DJ", "D"},
+                {"EA", "E"},
+                {"EU", "U"},
+                {"GN", "N"},
+                {"KN", "N"},
+                {"MC", "MAC"},
+                {"MN", "N"},
+                {"OE", "E"},
+                {"QU", "Q"},
+                {"PH", "F"},
+                {"PS", "S"},
+                {"PT", "T"},
+                {"TS", "S"},
+                {"WR", "R"},
+                {"X", "Z"}
+        };
+        String upperCased = input.toUpperCase();
+        for (String[] replacement : initialReplacements) {
+            String prefix = replacement[0];
+            if (upperCased.startsWith(prefix)) {
+                return replacement[1] + upperCased.substring(prefix.length());
+            }
+        }
+        return upperCased;
+    }
+
+    /**
+     * Upper-cases the given string and replaces character combinations that
+     * sound similar by a common representation. The replacements are applied
+     * one after another in the order given below, each on the result of the
+     * previous one.
+     *
+     * @param inputName the string to convert, must not be <code>null</code>
+     * @return the upper-cased string with all listed combinations replaced
+     */
+    public static String soundalike(String inputName) {
+        // {character combination, replacement}
+        final String[][] similarSounds = {
+                {"AE", "E"},
+                {"IA", "A"},
+                {"OE", "I"},
+                {"OI", "A"},
+                {"SC", "S"}
+        };
+        String result = inputName.toUpperCase();
+        for (String[] similarSound : similarSounds) {
+            // replace leaves the string untouched if the combination does not occur
+            result = result.replace(similarSound[0], similarSound[1]);
+        }
+        return result;
+    }
+
+    /**
+     * Collapses every run of identical consecutive characters into a single
+     * character, e.g. "aabbbc" becomes "abc". The comparison is case sensitive.
+     *
+     * @param input the string to convert, must not be <code>null</code>
+     * @return the string without consecutive duplicates; the empty string
+     *         is returned unchanged
+     */
+    public static String removeDuplicate(String input) {
+        // an empty string has no last character to append
+        if (input.isEmpty()) {
+            return input;
+        }
+        StringBuilder result = new StringBuilder(input.length());
+        char previous = input.charAt(0);
+        result.append(previous);
+        for (int i = 1; i < input.length(); i++) {
+            char current = input.charAt(i);
+            if (current != previous) {
+                result.append(current);
+                previous = current;
+            }
+        }
+        return result.toString();
+    }
+
+    /**
+     * Upper-cases the given string and normalizes its ending so that gender
+     * differences are ignored: if the last 2 characters are one of
+     * IS, US, YS, ES, IM, AS, UM or OS they are replaced by a single "A".
+     *
+     * @param input the string to convert, must not be <code>null</code>
+     * @return the upper-cased string with normalized ending; strings with
+     *         less than 2 characters are only upper-cased
+     */
+    public static String replaceGenderEnding(String input) {
+        final String[] genderEndings = {"IS", "US", "YS", "ES", "IM", "AS", "UM", "OS"};
+
+        String upperCased = input.toUpperCase();
+        // too short to have a 2 character ending
+        if (upperCased.length() < 2) {
+            return upperCased;
+        }
+        int stemLength = upperCased.length() - 2;
+        String ending = upperCased.substring(stemLength);
+        for (String genderEnding : genderEndings) {
+            if (ending.equals(genderEnding)) {
+                return upperCased.substring(0, stemLength) + "A";
+            }
+        }
+        return upperCased;
+    }
+
+    /**
+     * Basic normalization of a name part: upper-cases the string and
+     * replaces special characters by ASCII characters
+     * (see {@link #replaceSpecialCharacters(String)}).
+     *
+     * @param str the string to normalize, must not be <code>null</code>
+     * @return the normalized string
+     */
+    public static String normalize(String str) {
+        return NameMatchingUtils.replaceSpecialCharacters(str.toUpperCase());
+    }
+
+    /**
+     * Phonetic normalization of a name part. Applies, in this order:
+     * replacement of initial characters, soundalike replacement,
+     * removal of duplicated letters and normalization of the gender ending.
+     *
+     * @param str the string to normalize, must not be <code>null</code>
+     * @return the phonetically normalized (upper-case) string
+     */
+    public static String nearMatch(String str) {
+        String initialReplaced = replaceInitialCharacter(str);
+        String soundalikeReplaced = soundalike(initialReplaced);
+        String deduplicated = removeDuplicate(soundalikeReplaced);
+        return replaceGenderEnding(deduplicated);
+    }
+
+    /**
+     * Computes the edit distance between 2 strings where insertion, deletion
+     * and substitution of a character as well as the transposition of
+     * 2 adjacent characters are counted as edit operations.
+     * The comparison is case sensitive.
+     *
+     * @param str1 the first string
+     * @param str2 the second string
+     * @return the distance; 0 if both strings have equal content
+     * @throws NullPointerException if exactly one of the parameters is <code>null</code>
+     */
+    public static int modifiedDamerauLevenshteinDistance(String str1, String str2) {
+        // compare by content; reference comparison (==) alone fails for
+        // equal strings that are not the same object
+        if (str1 == str2 || (str1 != null && str1.equals(str2))) {
+            return 0;
+        }
+        if (str1.isEmpty()) {
+            return str2.length();
+        }
+        if (str2.isEmpty()) {
+            return str1.length();
+        }
+
+        int length1 = str1.length();
+        int length2 = str2.length();
+        int[][] distanceMatrix = new int[length1 + 1][length2 + 1];
+
+        // distance to the empty string is the number of characters
+        for (int i = 0; i <= length1; i++) {
+            distanceMatrix[i][0] = i;
+        }
+        for (int j = 0; j <= length2; j++) {
+            distanceMatrix[0][j] = j;
+        }
+
+        for (int i = 1; i <= length1; i++) {
+            for (int j = 1; j <= length2; j++) {
+                int cost = (str1.charAt(i - 1) == str2.charAt(j - 1)) ? 0 : 1;
+                // deletion, insertion, substitution
+                distanceMatrix[i][j] = Math.min(
+                        Math.min(distanceMatrix[i - 1][j] + 1, distanceMatrix[i][j - 1] + 1),
+                        distanceMatrix[i - 1][j - 1] + cost);
+
+                // transposition of 2 adjacent characters
+                if (i > 1 && j > 1 && str1.charAt(i - 1) == str2.charAt(j - 2)
+                        && str1.charAt(i - 2) == str2.charAt(j - 1)) {
+                    distanceMatrix[i][j] = Math.min(distanceMatrix[i][j], distanceMatrix[i - 2][j - 2] + cost);
+                }
+            }
+        }
+        return distanceMatrix[length1][length2];
+    }
+}
+++ /dev/null
-/**
-* Copyright (C) 2023 EDIT
-* European Distributed Institute of Taxonomy
-* http://www.e-taxonomy.eu
-*
-* The contents of this file are subject to the Mozilla Public License Version 1.1
-* See LICENSE.TXT at the top of this package for the full license terms.
-*/
-package eu.etaxonomy.cdm.common;
-
-import java.util.ArrayList;
-import java.util.List;
-
-import org.junit.Assert;
-import org.junit.Test;
-
-/**
- * @author andreabee90
- * @since 30.05.2023
- */
-public class CdmUtilsBelenTest {
-
- @Test
- public void testDeleteEmptySpaces() {
- String name= " Quercus robur ";
- Assert.assertEquals("Quercus robur", CdmUtilsBelen.deleteEmptySpaces(name));
-
- }
-
- @Test
- public void testReplaceSpecialCharacters() {
- String name= "áåâãàêèëôõøòóöìíîïüûúùñç";
- Assert.assertEquals("aaaaaeeeooooooiiiiuuuunc",CdmUtilsBelen.replaceSpecialCharacters(name));
- }
-
- @Test
- public void testListToLowerCase() {
- List <String> testList= new ArrayList<>();
- testList.add("NAME 1");
- testList.add("nAmE 2");
- Assert.assertEquals("name 1", CdmUtilsBelen.listToLowerCase(testList).get(0));
- Assert.assertEquals("name 2", CdmUtilsBelen.listToLowerCase(testList).get(1));
- }
-
- @Test
- public void testSoundalike() {
- String name = "ae ia oe oi sc";
- Assert.assertEquals("e a i a s", CdmUtilsBelen.soundalike(name));
- }
-
- @Test
- public void testRemoveDuplicate() {
- String name = "thiiss iss aa striiiing with duupliccaaaatess";
- Assert.assertEquals("this is a string with duplicates", CdmUtilsBelen.removeDuplicate(name));
- }
-
- @Test
- public void testReplacerGenderEnding() {
-// String name="is";
-// String name="us";
-// String name="ys";
-// String name="es";
-// String name="im";
- String name="as";
-// String name="um";
-// String name="os";
- Assert.assertEquals("a", CdmUtilsBelen.replacerGenderEnding(name));
- }
-}
Assert.assertEquals("Str1; ;Str3", CdmUtils.concat(";", str1, " ", str3));
}
- @Test
- public void testmodifiedDamerauLevenshteinDistance() {
-
- int distance = CdmUtils.modifiedDamerauLevenshteinDistance("Gynoxys asterotricha", "Gynxya asrerotciha");
- assertEquals(5,distance);
- distance = CdmUtils.modifiedDamerauLevenshteinDistance("Gynoxys asterotricha", "Gynxsa axrerotciha");
- assertEquals(7,distance);
- distance = CdmUtils.modifiedDamerauLevenshteinDistance("Gynoxys asterotricha", "Gynxyas asrerotciha");
- assertEquals(5,distance);
- distance = CdmUtils.modifiedDamerauLevenshteinDistance("Gynoxys asterotricha", "Gynoxya asterotricha");
- assertEquals(1,distance);
- distance = CdmUtils.modifiedDamerauLevenshteinDistance("Gynoxys asterotricha", "Gynoxys asterotricha");
- assertEquals(0,distance);
- }
}
\ No newline at end of file
--- /dev/null
+/**
+* Copyright (C) 2023 EDIT
+* European Distributed Institute of Taxonomy
+* http://www.e-taxonomy.eu
+*
+* The contents of this file are subject to the Mozilla Public License Version 1.1
+* See LICENSE.TXT at the top of this package for the full license terms.
+*/
+package eu.etaxonomy.cdm.common;
+
+import static org.junit.Assert.assertEquals;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+/**
+ * Unit tests for the static string helper methods of {@link NameMatchingUtils}.
+ *
+ * @author andreabee90
+ * @since 30.05.2023
+ */
+public class NameMatchingUtilsTest {
+
+ // qualifiers (cf.) and a bracketed second token are removed, the result is upper-case
+ @Test
+ public void removeExtraElements() {
+ String name = "Quercus cf. robur";
+ Assert.assertEquals("QUERCUS ROBUR", NameMatchingUtils.removeExtraElements(name));
+ name = "Quercus (something) robur";
+ Assert.assertEquals("QUERCUS ROBUR", NameMatchingUtils.removeExtraElements(name));
+ name = "Quercus (something) cf. robur";
+ Assert.assertEquals("QUERCUS ROBUR", NameMatchingUtils.removeExtraElements(name));
+ name = "Quercus (cf.) robur";
+ Assert.assertEquals("QUERCUS ROBUR", NameMatchingUtils.removeExtraElements(name));
+ }
+
+
+ // leading, trailing and repeated whitespace is removed
+ @Test
+ public void testDeleteEmptySpaces() {
+ String name = " Quercus robur ";
+ Assert.assertEquals("Quercus robur", NameMatchingUtils.deleteEmptySpaces(name));
+
+ }
+
+ // accented characters are replaced by their upper-case ASCII base character
+ @Test
+ public void testReplaceSpecialCharacters() {
+ String name = "áåâãàêèëôõøòóöìíîïüûúùñç";
+ Assert.assertEquals("AAAAAEEEOOOOOOIIIIUUUUNC",NameMatchingUtils.replaceSpecialCharacters(name));
+ }
+
+ // each list element is upper-cased, order is kept
+ @Test
+ public void testListToUpperCase() {
+ List <String> testList = new ArrayList<>();
+ testList.add("NAME 1");
+ testList.add("nAmE 2");
+ Assert.assertEquals("NAME 1", NameMatchingUtils.listToUpperCase(testList).get(0));
+ Assert.assertEquals("NAME 2", NameMatchingUtils.listToUpperCase(testList).get(1));
+ }
+
+ // phonetic replacement is applied to the beginning of the string only
+ @Test
+ public void testReplaceInitialCharacter() {
+
+ String name = "euphorbia";
+ Assert.assertEquals("UPHORBIA", NameMatchingUtils.replaceInitialCharacter(name));
+ name = "Cnemidia";
+ Assert.assertEquals("NEMIDIA", NameMatchingUtils.replaceInitialCharacter(name));
+ name = "Gnaphalium";
+ Assert.assertEquals("NAPHALIUM", NameMatchingUtils.replaceInitialCharacter(name));
+ name = "Philodendron";
+ Assert.assertEquals("FILODENDRON", NameMatchingUtils.replaceInitialCharacter(name));
+ name = "Tsuga";
+ Assert.assertEquals("SUGA", NameMatchingUtils.replaceInitialCharacter(name));
+ name = "Czerniaevia";
+ Assert.assertEquals("CERNIAEVIA", NameMatchingUtils.replaceInitialCharacter(name));
+ }
+
+ // every soundalike combination is replaced, the result is upper-case
+ @Test
+ public void testSoundalike() {
+ String name = "ae ia oe oi sc";
+ Assert.assertEquals("E A I A S", NameMatchingUtils.soundalike(name));
+ }
+
+ // runs of identical consecutive characters are collapsed, case is kept
+ @Test
+ public void testRemoveDuplicate() {
+ String name = "thiiss iss aa striiiing with duupliccaaaatess";
+ Assert.assertEquals("this is a string with duplicates", NameMatchingUtils.removeDuplicate(name));
+ }
+
+ // the 2 character gender ending is replaced by "A"
+ @Test
+ public void testReplacerGenderEnding() {
+ String name="Qas";
+ Assert.assertEquals("QA", NameMatchingUtils.replaceGenderEnding(name));
+ // the remaining endings below are not covered by an assertion yet
+// String name="is";
+// String name="us";
+// String name="ys";
+// String name="es";
+// String name="im";
+// String name="um";
+// String name="os";
+ }
+
+ // distances for misspelled variants of the same name, 0 for the identical name
+ @Test
+ public void testmodifiedDamerauLevenshteinDistance() {
+
+ int distance = NameMatchingUtils.modifiedDamerauLevenshteinDistance("Gynoxys asterotricha", "Gynxya asrerotciha");
+ assertEquals(5,distance);
+ distance = NameMatchingUtils.modifiedDamerauLevenshteinDistance("Gynoxys asterotricha", "Gynxsa axrerotciha");
+ assertEquals(7,distance);
+ distance = NameMatchingUtils.modifiedDamerauLevenshteinDistance("Gynoxys asterotricha", "Gynxyas asrerotciha");
+ assertEquals(5,distance);
+ distance = NameMatchingUtils.modifiedDamerauLevenshteinDistance("Gynoxys asterotricha", "Gynoxya asterotricha");
+ assertEquals(1,distance);
+ distance = NameMatchingUtils.modifiedDamerauLevenshteinDistance("Gynoxys asterotricha", "Gynoxys asterotricha");
+ assertEquals(0,distance);
+ }
+}
\ No newline at end of file
--- /dev/null
+/**
+* Copyright (C) 2023 EDIT
+* European Distributed Institute of Taxonomy
+* http://www.e-taxonomy.eu
+*
+* The contents of this file are subject to the Mozilla Public License Version 1.1
+* See LICENSE.TXT at the top of this package for the full license terms.
+*/
+package eu.etaxonomy.cdm.api.service;
+
+import java.util.List;
+
+import eu.etaxonomy.cdm.common.DoubleResult;
+import eu.etaxonomy.cdm.persistence.dto.TaxonNameParts;
+
+/**
+ * Service interface for fuzzy matching of a taxon name string against
+ * the names stored in the database.
+ *
+ * @author andreabee90
+ * @since 11.07.2023
+ */
+public interface INameMatchingService {
+
+ /**
+ * Finds the names that match the given name string.
+ *
+ * @param taxonName the name string to search for
+ * @param maxDistanceGenus the maximum distance for the genus part;
+ * NOTE(review): {@link NameMatchingServiceImpl} uses 4 if <code>null</code> is passed
+ * @param maxDisEpith the maximum distance used when filtering on the epithet;
+ * NOTE(review): {@link NameMatchingServiceImpl} uses 4 if <code>null</code> is passed
+ * @return the matching name parts, each together with its computed distance
+ */
+ public List<DoubleResult<TaxonNameParts, Integer>> findMatchingNames(String taxonName,
+ Integer maxDistanceGenus, Integer maxDisEpith);
+}
* Checks whether the name can be deleted if the taxon with taxonUuid will be deleted, too
*/
public DeleteResult isDeletable(UUID nameUuid, DeleteConfiguratorBase config, UUID taxonUuid);
-
- public List<DoubleResult<TaxonNameParts, Integer>> findMatchingNames(String taxonName, Integer maxDistanceGenus, Integer maxDisEpith);
}
--- /dev/null
+/**
+* Copyright (C) 2023 EDIT
+* European Distributed Institute of Taxonomy
+* http://www.e-taxonomy.eu
+*
+* The contents of this file are subject to the Mozilla Public License Version 1.1
+* See LICENSE.TXT at the top of this package for the full license terms.
+*/
+package eu.etaxonomy.cdm.api.service;
+
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+import java.util.Optional;
+
+import org.springframework.beans.factory.annotation.Autowired;
+import org.springframework.stereotype.Service;
+import org.springframework.transaction.annotation.Transactional;
+
+import eu.etaxonomy.cdm.common.NameMatchingUtils;
+import eu.etaxonomy.cdm.common.DoubleResult;
+import eu.etaxonomy.cdm.model.name.TaxonName;
+import eu.etaxonomy.cdm.persistence.dao.initializer.IBeanInitializer;
+import eu.etaxonomy.cdm.persistence.dao.name.ITaxonNameDao;
+import eu.etaxonomy.cdm.persistence.dto.TaxonNameParts;
+import eu.etaxonomy.cdm.strategy.parser.NonViralNameParserImpl;
+
+/**
+ * @author andreabee90
+ * @since 11.07.2023
+ */
+@Service
+@Transactional(readOnly = true)
+public class NameMatchingServiceImpl
+// extends IdentifiableServiceBase<TaxonName,ITaxonNameDao>
+ implements INameMatchingService {
+
+ @Autowired
+ // @Qualifier("defaultBeanInitializer")
+ protected IBeanInitializer defaultBeanInitializer;
+
+ @Autowired
+ private ITaxonNameDao nameDao;
+
+//***************************** CONSTRUCTOR **********************************/
+
+ public NameMatchingServiceImpl(){}
+
+//********************* METHODS ***********************************************//
+
+ /**
+ * This is an implementation of the Taxamatch algorithm built by Tony Rees.
+ * It employs a custom Modified Damerau-Levenshtein Distance algorithm,
+ * see also https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0107510
+ * <BR>
+ * The name string is parsed into genus and specific epithet. Genera from the
+ * database are pre-filtered, compared to the phonetically normalized genus of
+ * the query and post-filtered. If the query contains an epithet the same is
+ * done for the epithets of the remaining names and the epithet distance is
+ * added to the genus distance.
+ *
+ * @param taxonName the name string to search for
+ * @param maxDisGenus the maximum distance for the genus, 4 is used if <code>null</code>
+ * @param maxDisEpith the maximum distance used by the epithet post-filter,
+ * 4 is used if <code>null</code>
+ * @return the results having distance 0 if there are any, otherwise the results
+ * having a distance of 1 to 4; in both cases ordered by ascending distance
+ */
+ //TODO work in progress
+ public List<DoubleResult<TaxonNameParts, Integer>> findMatchingNames(String taxonName,
+ Integer maxDisGenus, Integer maxDisEpith) {
+
+ // only one (total) distance should be used.
+
+ // apply default distances
+ if (maxDisGenus == null) {
+ maxDisGenus = 4;
+ }
+
+ if (maxDisEpith == null) {
+ maxDisEpith = 4;
+ }
+
+
+ //0. Parsing and Normalizing
+
+// TODO? Remove all qualifiers such as cf., aff., ?, <i>, x, etc. from the whole input string
+// taxonName=CdmUtilsBelen.removeExtraElements(taxonName);
+// taxonName=CdmUtilsBelen.removeHTMLAmpersand(taxonName);
+
+ TaxonName name = (TaxonName) NonViralNameParserImpl.NewInstance().parseFullName(taxonName);
+
+ String genusQuery = name.getGenusOrUninomial();
+ String epithetQuery = name.getSpecificEpithet();
+// String infraGenericQuery = name.getInfraGenericEpithet();
+
+ int genusComputedDistance = 0;
+ int epithetComputedDistance = 0;
+
+ String normalizedGenusQuery = NameMatchingUtils.normalize(genusQuery);
+
+ //* phonetic normalization of query (genus)
+ /* this method corresponds to the near match function of Rees 2007
+ * it includes phonetic matches (replace initial characters, soundalike changes, gender endings)
+ */
+
+ String phoneticNormalizedGenusQuery = NameMatchingUtils.nearMatch(normalizedGenusQuery);
+
+
+ //1. Genus pre-filter
+
+ // note: the pre-filter works on the genus as parsed, not on the normalized genus
+ List<String> preFilteredGenusList = prefilterGenus(genusQuery);
+
+ //create result list
+ List<DoubleResult<TaxonNameParts,Integer>> fullTaxonNamePartsList = new ArrayList<>();
+
+ for (String preFilteredGenus : preFilteredGenusList) {
+
+ //2. comparison of genus
+
+ String genusNameInDBNormalized = NameMatchingUtils.normalize(preFilteredGenus);
+ String phoneticNormalizedGenusInDB = NameMatchingUtils.nearMatch(genusNameInDBNormalized);
+
+ genusComputedDistance = nameMatchingComputeDistance(phoneticNormalizedGenusQuery, phoneticNormalizedGenusInDB);
+
+ //3. genus post-filter
+
+ // adds the name parts of all names of an accepted genus to fullTaxonNamePartsList
+ postfilterGenus(maxDisGenus, genusQuery, genusComputedDistance, phoneticNormalizedGenusQuery,
+ fullTaxonNamePartsList, preFilteredGenus, phoneticNormalizedGenusInDB);
+ }
+
+ //if only genus is given
+
+ if (epithetQuery==null) {
+ // order by ascending distance
+ Collections.sort(fullTaxonNamePartsList, (o1,o2) -> o1.getSecondResult().compareTo(o2.getSecondResult()));
+
+ List <DoubleResult<TaxonNameParts, Integer>> exactResults = exactResults(fullTaxonNamePartsList);
+ List <DoubleResult<TaxonNameParts, Integer>> bestResults = bestResults(fullTaxonNamePartsList);
+
+ // exact results take precedence over near matches
+ if(!exactResults.isEmpty()) {
+ return exactResults;
+ } else {
+ return bestResults;
+ }
+
+ } else {
+
+ String normalizedEphitetQuery = NameMatchingUtils.normalize(epithetQuery);
+ String phoneticNormalizedEpithetQuery = NameMatchingUtils.nearMatch(normalizedEphitetQuery);
+
+ // 4. epithet pre-filter
+
+ fullTaxonNamePartsList = prefilterEpithet(fullTaxonNamePartsList, normalizedEphitetQuery);
+
+ List <DoubleResult<TaxonNameParts, Integer>> epithetList = new ArrayList<>();
+ for (DoubleResult<TaxonNameParts, Integer> part: fullTaxonNamePartsList) {
+
+ String epithetInDB = part.getFirstResult().getSpecificEpithet();
+ String epithetNameInDBNormalized = NameMatchingUtils.normalize(epithetInDB);
+ String phoneticNormalizedEpithetNameInDB = NameMatchingUtils.nearMatch(epithetNameInDBNormalized);
+
+ // 5. comparison of epithet
+ epithetComputedDistance = nameMatchingComputeDistance(phoneticNormalizedEpithetQuery, phoneticNormalizedEpithetNameInDB);
+ // the distance stored for the name becomes the sum of genus and epithet distance
+ int totalDist = part.getSecondResult() + epithetComputedDistance;
+ part.setSecondResult(totalDist) ;
+
+ // FIXME there is an error here when the database name has only a genus and no epithet
+
+ // 6. species post-filter
+
+ postfilterEpithet(maxDisEpith, epithetQuery, epithetComputedDistance, normalizedEphitetQuery, epithetList, part,
+ epithetInDB, totalDist);
+ }
+
+ // 6b Infraspecific comparison (pre-filter, comparison, post-filter)
+ //TODO
+
+ // 7. Result shaping
+
+ //-------------------CONTINUE HERE------------------
+
+ // order by ascending total distance
+ Collections.sort(epithetList, (o1,o2) -> o1.getSecondResult().compareTo(o2.getSecondResult()) );
+
+ List <DoubleResult<TaxonNameParts, Integer>> exactResults = exactResults(epithetList);
+ List <DoubleResult<TaxonNameParts, Integer>> bestResults = bestResults(epithetList);
+
+ // exact results take precedence over near matches
+ if(!exactResults.isEmpty()) {
+ return exactResults;
+ } else {
+ return bestResults;
+ }
+ }
+ }
+
+    /**
+     * Removes the characters both parameters have in common at their
+     * beginning and, from what is left, the characters they have in common
+     * at their end.
+     *<BR>
+     * Returns the remainder of the query name and the remainder of the
+     * db name, separated by a single blank. Returns empty string if both
+     * remainders are equal.
+     */
+    public static String trimCommonChar(String queryName, String dbName) {
+
+        // length of the common leading part
+        int maxPrefixLength = Math.min(queryName.length(), dbName.length());
+        int prefixLength = 0;
+        while (prefixLength < maxPrefixLength
+                && queryName.charAt(prefixLength) == dbName.charAt(prefixLength)) {
+            prefixLength++;
+        }
+        String queryRest = queryName.substring(prefixLength);
+        String dbRest = dbName.substring(prefixLength);
+
+        // length of the common trailing part of what is left
+        int maxSuffixLength = Math.min(queryRest.length(), dbRest.length());
+        int suffixLength = 0;
+        while (suffixLength < maxSuffixLength
+                && queryRest.charAt(queryRest.length() - 1 - suffixLength)
+                        == dbRest.charAt(dbRest.length() - 1 - suffixLength)) {
+            suffixLength++;
+        }
+        String queryCore = queryRest.substring(0, queryRest.length() - suffixLength);
+        String dbCore = dbRest.substring(0, dbRest.length() - suffixLength);
+
+        return queryCore.equals(dbCore) ? "" : queryCore + " " + dbCore;
+    }
+
+    /**
+     * Computes the modified Damerau-Levenshtein distance of 2 strings after
+     * their common leading and trailing characters have been removed.
+     *
+     * @param strQuery the string from the query
+     * @param strDB the string from the database
+     * @return the distance, 0 if nothing is left after trimming
+     */
+    private int nameMatchingComputeDistance(String strQuery, String strDB) {
+        String trimmedStrings = trimCommonChar(strQuery, strDB);
+        if (trimmedStrings.isEmpty()) {
+            return 0;
+        }
+        // Limit -1 keeps a trailing empty string. Without it the array has only
+        // 1 element if nothing is left of the DB string (e.g. "ABCD" vs. "ABC")
+        // and accessing the second element throws an exception.
+        String[] remainders = trimmedStrings.split(" ", -1);
+        return NameMatchingUtils.modifiedDamerauLevenshteinDistance(remainders[0], remainders[1]);
+    }
+
+    /**
+     * Compares the first (or last if backwards = true) number of characters
+     * of the 2 strings.
+     * @param str1 the first string
+     * @param str2 the second string
+     * @param count count of characters to compare
+     * @param backwards if true comparison starts from the end of the words
+     * @return <code>true</code> if the compared characters are equal,
+     *   <code>false</code> if not or if one of the strings has less than
+     *   <code>count</code> characters
+     */
+    private boolean characterMatches(String str1, String str2, int count, boolean backwards) {
+        // regionMatches returns false instead of throwing an exception
+        // if a string is shorter than count
+        if (backwards) {
+            return str1.regionMatches(str1.length() - count, str2, str2.length() - count, count);
+        }
+        return str1.regionMatches(0, str2, 0, count);
+    }
+
+    /**
+     * Genus pre-filter. Selects those genera from the database whose length
+     * differs from the query length by 2 at most and which share a number
+     * of leading or trailing characters with the query (rule 1c).
+     *
+     * @param genusQuery the genus of the query
+     * @return the genera passing the pre-filter
+     */
+    private List<String> prefilterGenus(String genusQuery) {
+
+        List<String> matchingGenera = new ArrayList<>();
+
+        // get a list with all genus/uninomial in the DB
+        List<String> allGenera = nameDao.distinctGenusOrUninomial("*", null, null);
+
+        // TODO implement rule 1a: if phonetic match add to result
+
+        //TODO rule 1b requires fetching of species epithets. We need further discussion if we
+        // want to do this in the same way or how the semantics of this rule can be implemented
+        // in the best way.
+
+        // see Rees algorithm rule 1c
+        for (String candidate : allGenera) {
+            // skip if already in result list or if the length differs too much
+            if (matchingGenera.contains(candidate)
+                    || Math.abs(candidate.length() - genusQuery.length()) > 2) {
+                continue;
+            }
+            int queryLength = genusQuery.length();
+            // rule 1c.1 (length < 5):  1 leading or 1 trailing character
+            // rule 1c.2 (length = 5):  2 leading or 3 trailing characters
+            // rule 1c.3 (length > 5):  3 leading or 3 trailing characters
+            int leadingCount = queryLength < 5 ? 1 : (queryLength == 5 ? 2 : 3);
+            int trailingCount = queryLength < 5 ? 1 : 3;
+
+            if (characterMatches(genusQuery, candidate, leadingCount, false)
+                    || characterMatches(genusQuery, candidate, trailingCount, true)) {
+                matchingGenera.add(candidate);
+            }
+        }
+        return matchingGenera;
+    }
+
+    /**
+     * Genus post-filter. If the genus is accepted the name parts of all names
+     * having this genus are loaded and added to the result list together
+     * with the given distance.
+     * <BR>
+     * A genus is accepted if the distance does not exceed the maximum distance
+     * or if half of the length of the longer genus is smaller than the maximum
+     * distance and the first characters of the 2 normalized genera are equal.
+     * <BR>
+     * NOTE(review): the original comment stated "Genera that match in at least
+     * 50% are kept, i.e., if genus length = 6 (or 7) then at least 3 characters
+     * must match AND the initial character must match in all cases where ED > 1".
+     * The second condition compares the half length to the maximum distance,
+     * not to the computed distance - confirm that this is intended.
+     */
+    private void postfilterGenus(Integer maxDistanceGenus, String genusQuery, int distance,
+            String normalizedGenusQuery, List<DoubleResult<TaxonNameParts, Integer>> fullTaxonNamePartsList,
+            String preFilteredGenus, String genusNameInDBNormalized) {
+
+        int halfLength = Math.max(genusQuery.length(), preFilteredGenus.length()) / 2;
+
+        boolean isAccepted = distance <= maxDistanceGenus
+                || (halfLength < maxDistanceGenus
+                        && normalizedGenusQuery.substring(0,1).equals(genusNameInDBNormalized.substring(0,1)));
+        if (!isAccepted) {
+            return;
+        }
+
+        List<TaxonNameParts> namePartsOfGenus = nameDao.findTaxonNameParts(
+                Optional.of(preFilteredGenus), null, null, null, null, null, null, null, null);
+        for (TaxonNameParts nameParts : namePartsOfGenus) {
+            fullTaxonNamePartsList.add(new DoubleResult<TaxonNameParts, Integer>(nameParts, distance));
+        }
+    }
+
+    /**
+     * Epithet pre-filter. Keeps those names whose specific epithet is at most
+     * 4 characters longer than the epithet of the query. Names without a
+     * specific epithet are removed as there is nothing to compare the
+     * queried epithet with.
+     * <BR>
+     * NOTE(review): epithets being shorter than the query always pass as the
+     * length difference is not taken as absolute value (the genus pre-filter
+     * uses the absolute value) - confirm that this is intended.
+     *
+     * @param fullTaxonNamePartsList the names to filter, the list is not modified
+     * @param normalizedEphitetQuery the normalized epithet of the query
+     * @return a new list holding the names passing the filter, empty if there are none
+     */
+    private List<DoubleResult<TaxonNameParts, Integer>> prefilterEpithet(
+            List<DoubleResult<TaxonNameParts, Integer>> fullTaxonNamePartsList, String normalizedEphitetQuery) {
+
+        List<DoubleResult<TaxonNameParts, Integer>> filteredList = new ArrayList<>();
+        for (DoubleResult<TaxonNameParts, Integer> fullTaxonNameParts : fullTaxonNamePartsList) {
+            String epithetInDB = fullTaxonNameParts.getFirstResult().getSpecificEpithet();
+            // names having only a genus caused an exception here and in the later comparison
+            if (epithetInDB == null || epithetInDB.isEmpty()) {
+                continue;
+            }
+            if (epithetInDB.length() - normalizedEphitetQuery.length() <= 4) {
+                filteredList.add(fullTaxonNameParts);
+            }
+        }
+        // always return the filtered list; previously the unfiltered input
+        // was returned if no name passed the filter
+        return filteredList;
+    }
+
+    /**
+     * Species post-filter. Adds the given name to the epithet list if
+     * <ul>
+     * <li>the total distance does not exceed the maximum distance or</li>
+     * <li>half of the length of the longer epithet is smaller than the maximum
+     *   distance and either the epithet distance is 2 or 3 and the first
+     *   characters match, or the epithet distance is 4 and the first
+     *   3 characters match.</li>
+     * </ul>
+     * Leading characters are compared ignoring case as the normalized query
+     * is upper-case while the epithet from the database is not.
+     *
+     * @param maxDisEpith the maximum distance
+     * @param epithetQuery the epithet of the query as parsed
+     * @param epithetDistance the computed distance of the epithets
+     * @param normalizedEphitetQuery the normalized (upper-case) epithet of the query
+     * @param epithetList the result list the name is added to
+     * @param part the name to check
+     * @param epithetInDB the specific epithet of the name to check
+     * @param totalDist the sum of genus and epithet distance
+     */
+    private void postfilterEpithet(Integer maxDisEpith, String epithetQuery, int epithetDistance,
+            String normalizedEphitetQuery, List<DoubleResult<TaxonNameParts, Integer>> epithetList,
+            DoubleResult<TaxonNameParts, Integer> part, String epithetInDB, int totalDist) {
+
+        if (totalDist <= maxDisEpith) {
+            epithetList.add(part);
+            return;
+        }
+
+        int halfLength = Math.max(epithetInDB.length(), epithetQuery.length()) / 2;
+        if (halfLength >= maxDisEpith) {
+            return;
+        }
+
+        // regionMatches returns false (instead of throwing) for epithets shorter than the compared region
+        boolean firstCharMatches = normalizedEphitetQuery.regionMatches(true, 0, epithetInDB, 0, 1);
+        boolean firstThreeCharsMatch = normalizedEphitetQuery.regionMatches(true, 0, epithetInDB, 0, 3);
+
+        // explicit brackets: without them a distance of 3 was accepted
+        // no matter if the first characters matched
+        if ((firstCharMatches && (epithetDistance == 2 || epithetDistance == 3))
+                || (firstThreeCharsMatch && epithetDistance == 4)) {
+            epithetList.add(part);
+        }
+    }
+
+    //TODO check!!!
+
+    /**
+     * Returns those elements of the given list having a distance of 0.
+     *
+     * @param list the results to filter, the list is not modified
+     * @return a new list with the exact results, in the same order
+     */
+    public static List <DoubleResult<TaxonNameParts, Integer>> exactResults (List <DoubleResult<TaxonNameParts, Integer>> list){
+        List<DoubleResult<TaxonNameParts, Integer>> zeroDistanceResults = new ArrayList<>();
+        list.forEach(candidate -> {
+            if (candidate.getSecondResult() == 0) {
+                zeroDistanceResults.add(candidate);
+            }
+        });
+        return zeroDistanceResults;
+    }
+
+ public static List <DoubleResult<TaxonNameParts, Integer>> bestResults (List <DoubleResult<TaxonNameParts, Integer>> list){
+ List <DoubleResult<TaxonNameParts, Integer>> bestResults = new ArrayList<>();
+ for (DoubleResult<TaxonNameParts, Integer> best:list) {
+ if (best.getSecondResult()==1||best.getSecondResult()==2||best.getSecondResult()==3||best.getSecondResult()==4){
+ bestResults.add(best);
+ }
+ }
+ return bestResults;
+ }
+}
\ No newline at end of file
import eu.etaxonomy.cdm.api.service.search.SearchResultBuilder;
import eu.etaxonomy.cdm.api.util.TaxonNamePartsFilter;
import eu.etaxonomy.cdm.common.CdmUtils;
-import eu.etaxonomy.cdm.common.CdmUtilsBelen;
+import eu.etaxonomy.cdm.common.NameMatchingUtils;
import eu.etaxonomy.cdm.common.DoubleResult;
import eu.etaxonomy.cdm.common.URI;
import eu.etaxonomy.cdm.common.monitor.IProgressMonitor;
@Autowired
// @Qualifier("defaultBeanInitializer")
protected IBeanInitializer defaultBeanInitializer;
+
+ /**
+  * Assigns the given name DAO to the <code>dao</code> field of this service.
+  * The setter overrides a supertype method and carries the
+  * <code>@Autowired</code> annotation.
+  *
+  * @param dao the taxon name DAO to be used by this service
+  */
+ @Override
+ @Autowired
+ protected void setDao(ITaxonNameDao dao) {
+ this.dao = dao;
+ }
//***************************** CONSTRUCTOR **********************************/
return results;
}
-
- @Override
- @Autowired
- protected void setDao(ITaxonNameDao dao) {
- this.dao = dao;
- }
-
@Override
public Pager<HybridRelationship> getHybridNames(INonViralName name, HybridRelationshipType type, Integer pageSize, Integer pageNumber, List<OrderHint> orderHints, List<String> propertyPaths) {
Integer numberOfResults = dao.countHybridNames(name, type);
M bestMatching = matchingList.iterator().next();
return bestMatching;
}
-
- /* This is a implementation of the Taxamatch algorithm built by Tony Rees.
- * It employs a custom Modified Damerau-Levenshtein Distance algorithm
- * see also https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0107510
- */
- //TODO work in progress
- @Override
- public List<DoubleResult<TaxonNameParts, Integer>> findMatchingNames(String taxonName,
- Integer maxDistanceGenus, Integer maxDisEpith) {
-
- if (maxDistanceGenus == null) {
- maxDistanceGenus = 4;
- }
-
- //0. Normalizing and parsing
-
-// TODO? Remove all qualifiers such as cf., aff., ?, <i>, x, etc.
-
- TaxonName name = (TaxonName) NonViralNameParserImpl.NewInstance().parseFullName(taxonName);
-
- String genusQuery = name.getGenusOrUninomial();
- String epithetQuery = name.getSpecificEpithet();
- int distance=0;
- int epithetDistance=0;
-
- // phonetic normalization of query (genus)
-
- String initCharReplacedQuery = NameServiceImplementBelen.replaceInitialCharacter(genusQuery);
- String normalizedGenusQuery = CdmUtilsBelen.normalize(initCharReplacedQuery);
-
-
- //1. Genus pre-filter
-
- List<String> preFilteredGenusList = nameMatchingGenusPrefilter(genusQuery, initCharReplacedQuery, normalizedGenusQuery);
-
-
- //create result list
- List<DoubleResult<TaxonNameParts,Integer>> fullTaxonNamePartsList = new ArrayList<>();
-
- for (String preFilteredGenus : preFilteredGenusList) {
-
- //2. comparison of genus
-
- String genusNameInitCharReplaced = NameServiceImplementBelen.replaceInitialCharacter(preFilteredGenus);
- String genusNameInDBNormalized = CdmUtilsBelen.normalize(genusNameInitCharReplaced);
-
- distance = nameMatchingComputeDistance(normalizedGenusQuery, genusNameInDBNormalized);
-
- //3. genus post-filter
-
- nameMatchingPostGenusFilter(maxDistanceGenus, genusQuery, distance, normalizedGenusQuery,
- fullTaxonNamePartsList, preFilteredGenus, genusNameInDBNormalized);
- }
-
- //if only genus is given
-
- if (epithetQuery==null) {
- Collections.sort(fullTaxonNamePartsList, (o1,o2)->o1.getSecondResult().compareTo(o2.getSecondResult()));
-
- List <DoubleResult<TaxonNameParts, Integer>> exactResults = NameServiceImplementBelen.exactResults(fullTaxonNamePartsList);
- List <DoubleResult<TaxonNameParts, Integer>> bestResults = NameServiceImplementBelen.bestResults(fullTaxonNamePartsList);
-
- if(!exactResults.isEmpty()) {
- return exactResults;
- } else {
- return bestResults;
- }
-
- } else {
-
- String tempEpithPhon = NameServiceImplementBelen.replaceInitialCharacter(epithetQuery);
- String tempEpith = CdmUtilsBelen.normalize(tempEpithPhon);
-
- // 4. epithet pre-filter
- List<DoubleResult<TaxonNameParts,Integer>> fullTaxonNamePartsList2 = new ArrayList<>();
-
- for (DoubleResult<TaxonNameParts, Integer> nameX: fullTaxonNamePartsList) {
- if (nameX.getFirstResult().getSpecificEpithet().length()-tempEpith.length()<=4) {
- fullTaxonNamePartsList2.add(nameX);
- fullTaxonNamePartsList=fullTaxonNamePartsList2;
- }
- }
-
- // 5. comparison of epithet
- if (maxDisEpith==null) {
- maxDisEpith=4;
- }
-
- List <DoubleResult<TaxonNameParts, Integer>> epithetList = new ArrayList<>();
- String queryDocu2;
- for (DoubleResult<TaxonNameParts, Integer> part: fullTaxonNamePartsList) {
-
- String epithetInDB = part.getFirstResult().getSpecificEpithet();
- int lengthEpithetInDB=epithetInDB.length();
- int lengthEpithetQuery=epithetQuery.length();
- int half=Math.max(lengthEpithetInDB,lengthEpithetQuery)/2;
-
- String epithetinDBNorm=NameServiceImplementBelen.replaceInitialCharacter(epithetInDB);
-
- ///aqui hay error cuando la base solo tiene genero sin epiteto
-
- epithetinDBNorm=CdmUtilsBelen.normalize(epithetinDBNorm);
- if (NameServiceImplementBelen.trimCommonChar(tempEpith, epithetinDBNorm).trim().isEmpty()) {
- queryDocu2="";
- } else {
- queryDocu2=NameServiceImplementBelen.trimCommonChar(tempEpith, epithetinDBNorm);
- }
-
- if (queryDocu2=="") {
- epithetDistance=0;
- } else {
- String inputShort= queryDocu2.split(" ")[0];
- String DbShort=queryDocu2.split(" ")[1];
- epithetDistance= CdmUtils.modifiedDamerauLevenshteinDistance(inputShort,DbShort);
- }
-
- int totalDist = part.getSecondResult() + epithetDistance;
- part.setSecondResult(totalDist) ;
-
- // 6. species post-filter
-
- if (totalDist <= maxDisEpith) {
- epithetList.add(part);
- }else if (half<maxDisEpith) {
- if ((tempEpith.substring(0,1).equals(epithetInDB.substring(0,1))
- && epithetDistance==2||epithetDistance==3)||
- (tempEpith.substring(0,3).equals(epithetInDB.substring(0,3))
- && epithetDistance==4)) {
- epithetList.add(part);
- }
- }
- }
-
- // 6b Infraspecific comparison (pre-filter, comparison, post-filter)
- //TODO
-
- // 7. Result shaping
-
-
- Collections.sort(epithetList, (o1,o2)->o1.getSecondResult().compareTo(o2.getSecondResult()) );
-
- List <DoubleResult<TaxonNameParts, Integer>> exactResults = NameServiceImplementBelen.exactResults(epithetList);
- List <DoubleResult<TaxonNameParts, Integer>> bestResults = NameServiceImplementBelen.bestResults(epithetList);
-
- if(!exactResults.isEmpty()) {
- return exactResults;
- } else {
- return bestResults;
- }
- }
- }
-
- private void nameMatchingPostGenusFilter(Integer maxDistanceGenus, String genusQuery, int distance,
- String normalizedGenusQuery, List<DoubleResult<TaxonNameParts, Integer>> fullTaxonNamePartsList,
- String preFilteredGenus, String genusNameInDBNormalized) {
-
- int genusQueryLength = genusQuery.length();
- int genusDBLength = preFilteredGenus.length();
- int halfLength = Math.max(genusQueryLength, genusDBLength)/2;
-
- //Genera that match in at least 50% are kept. i.e., if genus length = 6(or7) then at least 3 characters must match AND the initial character must match in all cases where ED >1
- if (distance <= maxDistanceGenus) {
- List<TaxonNameParts> tempParts1 = dao.findTaxonNameParts(Optional.of(preFilteredGenus), null, null, null, null, null, null, null, null);
- for (TaxonNameParts namePart1: tempParts1) {
- fullTaxonNamePartsList.add(new DoubleResult<TaxonNameParts, Integer>(namePart1, distance));
- }
- } else if(halfLength < maxDistanceGenus && normalizedGenusQuery.substring(0,1).equals(genusNameInDBNormalized.substring(0,1))) {
- List<TaxonNameParts> tempParts2 = dao.findTaxonNameParts(Optional.of(preFilteredGenus),null, null, null, null, null, null, null, null);
- for (TaxonNameParts namePart2: tempParts2) {
- fullTaxonNamePartsList.add(new DoubleResult<TaxonNameParts, Integer>(namePart2, distance));
- }
- }
- }
-
- private int nameMatchingComputeDistance(String tempGenus, String genusNameInDBNormalized) {
- int distance;
- String queryDocu = NameServiceImplementBelen.trimCommonChar(tempGenus, genusNameInDBNormalized);
-
- if ("".equals(queryDocu)) {
- distance = 0;
- } else {
- String inputShort= queryDocu.split(" ")[0];
- String DbShort=queryDocu.split(" ")[1];
- distance = CdmUtils.modifiedDamerauLevenshteinDistance(inputShort,DbShort);
- }
- return distance;
- }
-
- private List<String> nameMatchingGenusPrefilter(String genusQuery, String initCharReplacedQuery, String normalizedGenusQuery) {
-
- List<String> genusResultList = new ArrayList <>();
-
- // get a list with all genus/uninomial in the DB
- String initial= "*";
- List<String> genusListDB = dao.distinctGenusOrUninomial(initial, null, null);
-
- // TODO implement rule 1a
- for (String genusDB: genusListDB) {
- //TODO
- //if phonetic match add to result
- }
-
- //TODO rule 1b requires fetching of species epithets. We need further discussion if we
- // want to do this in the same way or how the semantics of this rule can be implemented
- // in the best way.
-
- // see Rees algorithm rule 1c
- for (String genusDB: genusListDB) {
- //check if already in result list
- if (genusResultList.contains(genusDB)) {
- continue;
- }
- if (Math.abs(genusDB.length()-genusQuery.length()) <= 2) {
-
- if(genusQuery.length()<5) {
- // rule 1c.1
- if ( characterMatches(genusQuery, genusDB, 1, false) ||
- characterMatches(genusQuery, genusDB, 1, true)) {
- genusResultList.add(genusDB);
- }
- } else if (genusQuery.length()==5) {
- // rule 1c.2
- if (characterMatches(genusQuery, genusDB, 2, false) ||
- characterMatches(genusQuery, genusDB, 3, true)){
- genusResultList.add(genusDB);
- }
- } else if (genusQuery.length()>5){
- // rule 1c.3
- if (characterMatches(genusQuery, genusDB, 3, false) ||
- characterMatches(genusQuery, genusDB, 3, true)){
- genusResultList.add(genusDB);
- }
- }
- }
- }
- return genusResultList;
- }
-
- /**
- * Compares the first (or last if backwards = true) number of characters
- * of the 2 strings.
- * @param count count of characters to compare
- * @param backwards if true comparison starts from the end of the words
- */
- private boolean characterMatches(String str1, String str2, int count, boolean backwards) {
- if (!backwards) {
- return str1.substring(0,count).equals(str2.substring(0,count)) ;
- }else {
- return str1.substring((str1.length()-count),str1.length()).equals(str2.substring((str2.length()-count),str2.length()));
- }
- }
}
\ No newline at end of file
+++ /dev/null
-/**
-* Copyright (C) 2023 EDIT
-* European Distributed Institute of Taxonomy
-* http://www.e-taxonomy.eu
-*
-* The contents of this file are subject to the Mozilla Public License Version 1.1
-* See LICENSE.TXT at the top of this package for the full license terms.
-*/
-package eu.etaxonomy.cdm.api.service;
-
-import java.util.ArrayList;
-import java.util.List;
-
-import org.springframework.stereotype.Service;
-import org.springframework.transaction.annotation.Transactional;
-
-import eu.etaxonomy.cdm.common.DoubleResult;
-import eu.etaxonomy.cdm.persistence.dto.TaxonNameParts;
-
-@Service
-@Transactional(readOnly = true)
-public class NameServiceImplementBelen {
-
-
-// Phonetic changes performed ONLY on the initial characters of each String
-
- public static String replaceInitialCharacter(String inp) {
- String input=inp.toLowerCase();
- String output=input;
- String[][] phoneticChange = {
- {"ae","e"},{"cn","n"},{"ct","t"},{"cz","c"},
- {"dj","d"},{"ea","e"},{"eu","u"},{"gn","n"},
- {"kn","n"},{"mc","mac"},{"mn","n"},{"oe","e"},
- {"qu","q"},{"ph","f"},{"ps","s"},{"pt","t"},
- {"ts","s"},{"wr","r"},{"x","z"}
- };
- for (int i = 0 ; i< phoneticChange.length; i++) {
- if (input.startsWith(phoneticChange[i][0])){
- output= input.replaceFirst(phoneticChange[i][0], phoneticChange[i][1]);
- break;
- }
- }
- return output;
- }
-
- /**
- * Deletes common characters at the beginning and end of both parameters.
- * Returns the space separated concatenation of the remaining strings.
- *<BR>
- * Returns empty string if input strings are equal.
- */
- public static String trimCommonChar(String inputName, String databaseName) {
-
- String shortenedInputName="";
- String shortenedDatabaseName="";
- String tempInputName;
- String tempDatabaseName;
- // trim common leading characters of query and document
-
- int inputNameLength = inputName.length();
- int databaseNameLength = databaseName.length();
- int largestString = Math.max(inputNameLength, databaseNameLength);
- int i;
-
- for (i = 0; i < largestString; i++) {
- if (i >= inputNameLength || i >= databaseNameLength || inputName.charAt(i) != databaseName.charAt(i)) {
- // Stop iterating when the characters at the current position are not equal.
- break;
- }
- }
-
- // Create temp names with common leading characters removed.
- tempInputName = inputName.substring(i);
- tempDatabaseName = databaseName.substring(i);
-
- // trim common tailing characters between query and document
-
- int restantInputNameLenght = tempInputName.length();
- int restantDatabaseNameLenght = tempDatabaseName.length();
- int shortestString = Math.min(restantInputNameLenght, restantDatabaseNameLenght);
- int x;
- for (x = 0; x < shortestString; x++) {
- if (tempInputName.charAt(restantInputNameLenght - x - 1) != tempDatabaseName
- .charAt(restantDatabaseNameLenght - x - 1)) {
- break;
- }
-
- }
- shortenedInputName = tempInputName.substring(0, restantInputNameLenght - x);
- shortenedDatabaseName = tempDatabaseName.substring(0, restantDatabaseNameLenght - x);
-
- if (shortenedInputName.equals(shortenedDatabaseName)) {
- return "";
- }else {
- return shortenedInputName +" "+ shortenedDatabaseName;
- }
- }
-
- public static List <DoubleResult<TaxonNameParts, Integer>> exactResults (List <DoubleResult<TaxonNameParts, Integer>> list){
- List <DoubleResult<TaxonNameParts, Integer>> exactResults = new ArrayList<>();
- for (DoubleResult<TaxonNameParts, Integer> best:list) {
- if (best.getSecondResult()==0){
- exactResults.add(best);
- }
- }
- return exactResults;
- }
-
- public static List <DoubleResult<TaxonNameParts, Integer>> bestResults (List <DoubleResult<TaxonNameParts, Integer>> list){
- List <DoubleResult<TaxonNameParts, Integer>> bestResults = new ArrayList<>();
- for (DoubleResult<TaxonNameParts, Integer> best:list) {
- if (best.getSecondResult()==1||best.getSecondResult()==2||best.getSecondResult()==3||best.getSecondResult()==4){
- bestResults.add(best);
- }
- }
- return bestResults;
- }
-}
--- /dev/null
+/**
+* Copyright (C) 2023 EDIT
+* European Distributed Institute of Taxonomy
+* http://www.e-taxonomy.eu
+*
+* The contents of this file are subject to the Mozilla Public License Version 1.1
+* See LICENSE.TXT at the top of this package for the full license terms.
+*/
+package eu.etaxonomy.cdm.api.service;
+
+import java.io.FileNotFoundException;
+import java.util.List;
+
+import org.junit.Assert;
+import org.junit.Test;
+import org.unitils.dbunit.annotation.DataSet;
+import org.unitils.spring.annotation.SpringBeanByType;
+
+import eu.etaxonomy.cdm.common.DoubleResult;
+import eu.etaxonomy.cdm.persistence.dto.TaxonNameParts;
+import eu.etaxonomy.cdm.test.integration.CdmTransactionalIntegrationTest;
+import eu.etaxonomy.cdm.test.unitils.CleanSweepInsertLoadStrategy;
+
+/**
+ * Tests for the name matching service: the static helper
+ * {@link NameMatchingServiceImpl#trimCommonChar(String, String)} and
+ * <code>findMatchingNames</code> of the injected {@link INameMatchingService},
+ * the latter running against the data set
+ * <code>NameMatchingServiceImplTest.testFindMatchingNames.xml</code>.
+ *
+ * @author andreabee90
+ * @since 11.07.2023
+ */
+public class NameMatchingServiceImplTest extends CdmTransactionalIntegrationTest {
+
+    @SpringBeanByType
+    private INameMatchingService nameMatchingService;
+
+    @Test
+    public void testTrimCommonChar() {
+
+        String query = "Nectandra";
+        String document = "Nectalisma";
+
+        // the result holds the remainder of the query and of the document, separated by a space
+        String[] remainders = NameMatchingServiceImpl.trimCommonChar(query, document).split(" ");
+        Assert.assertEquals("ndr", remainders[0]);
+        Assert.assertEquals("lism", remainders[1]);
+
+        Assert.assertEquals("Equal input should return empty result",
+                "", NameMatchingServiceImpl.trimCommonChar(query, query));
+    }
+
+    @Test
+    @DataSet(loadStrategy=CleanSweepInsertLoadStrategy.class, value="NameMatchingServiceImplTest.testFindMatchingNames.xml")
+    public void testFindingMatchingNames () {
+
+        List<DoubleResult<TaxonNameParts, Integer>> matches;
+
+        // if the query has an exact match on the DB, return the exact match
+        matches = findMatches("Nectandra magnoliifolia", 1);
+        assertMatch(matches.get(0), "Nectandra", "magnoliifolia", 20,
+                "10989f63-c52f-4704-9574-2cc0676afe01", 0);
+
+        matches = findMatches("Nectandra surinamensis", 2);
+        assertMatch(matches.get(0), "Nectandra", "surinamensis", 27,
+                "b184664e-798b-4b50-8807-2163a4de796c", 0);
+        assertMatch(matches.get(1), "Nectandra", "surinamensis", 28,
+                "b9c8c3ba-bc78-4229-ae7d-b3f7bf23ec85", 0);
+
+        // if the query does not have an exact match on the DB, return the best matches
+        matches = findMatches("Nectendra nigre", 2);
+        assertMatch(matches.get(0), "Nectandra", "nigra", 21,
+                "cae90b7a-5deb-4838-940f-f85bb685286e", 2);
+        assertMatch(matches.get(1), "Nectandra", "nigrita", 22,
+                "8ad82243-b902-4eb6-990d-59774454b6e7", 4);
+
+        matches = findMatches("Bectendra nigri", 2);
+        assertMatch(matches.get(0), "Nectandra", "nigra", 21,
+                "cae90b7a-5deb-4838-940f-f85bb685286e", 3);
+        assertMatch(matches.get(1), "Nectandra", "nigrita", 22,
+                "8ad82243-b902-4eb6-990d-59774454b6e7", 4);
+
+        // if the query does not include an epithet
+        matches = findMatches("Nectandra", 20);
+        assertMatch(matches.get(0), "Nectandra", "abortiens", 10,
+                "6dbd41d1-fe13-4d9c-bb58-31f051c2c384", 0);
+
+        // here the second entry of the result list is checked
+        matches = findMatches("Nectondra", 20);
+        assertMatch(matches.get(1), "Nectandra", "acuminata", 11,
+                "f9e9c13f-5fa5-48d3-88cf-712c921a099e", 1);
+    }
+
+    /**
+     * Runs the name matching with default distances (both <code>null</code>)
+     * and asserts the size of the result list.
+     */
+    private List<DoubleResult<TaxonNameParts, Integer>> findMatches(String inputName, int expectedCount) {
+        List<DoubleResult<TaxonNameParts, Integer>> matches =
+                nameMatchingService.findMatchingNames(inputName, null, null);
+        Assert.assertEquals(expectedCount, matches.size());
+        return matches;
+    }
+
+    /**
+     * Asserts genus, epithet, id and uuid of the matched name as well as its distance.
+     */
+    private static void assertMatch(DoubleResult<TaxonNameParts, Integer> match, String expectedGenus,
+            String expectedEpithet, int expectedNameId, String expectedNameUuid, int expectedDistance) {
+        TaxonNameParts nameParts = match.getFirstResult();
+        Assert.assertEquals(expectedGenus, nameParts.getGenusOrUninomial());
+        Assert.assertEquals(expectedEpithet, nameParts.getSpecificEpithet());
+        Assert.assertEquals(expectedNameId, (int) nameParts.getTaxonNameId());
+        Assert.assertEquals(expectedNameUuid, nameParts.getTaxonNameUuid().toString());
+        Assert.assertEquals(expectedDistance, (int) match.getSecondResult());
+    }
+
+    @Override
+    public void createTestDataSet() throws FileNotFoundException {}
+}
\ No newline at end of file
@Override
public void createTestDataSet() throws FileNotFoundException {}
-
-
- @Test
- @DataSet(loadStrategy=CleanSweepInsertLoadStrategy.class, value="NameServiceImplTest.testFindMatchingNames.xml")
- public void testFindingMatchingNames () {
- String inputName;
- List<DoubleResult<TaxonNameParts, Integer>> matchResult;
- DoubleResult<TaxonNameParts, Integer> matchRes;
-
- // if the query has an exact match on the DB, return the exact match
- inputName = "Nectandra magnoliifolia";
- matchResult = nameService.findMatchingNames(inputName, null, null);
- Assert.assertEquals(1, matchResult.size());
- matchRes= matchResult.get(0);
- Assert.assertEquals("Nectandra", matchRes.getFirstResult().getGenusOrUninomial());
- Assert.assertEquals("magnoliifolia", matchRes.getFirstResult().getSpecificEpithet());
- Assert.assertEquals(20, (int)matchRes.getFirstResult().getTaxonNameId());
- Assert.assertEquals("10989f63-c52f-4704-9574-2cc0676afe01", matchRes.getFirstResult().getTaxonNameUuid().toString());
- Assert.assertEquals(0,(int) matchRes.getSecondResult());
-
- inputName = "Nectandra surinamensis";
- matchResult = nameService.findMatchingNames(inputName, null, null);
- Assert.assertEquals(2, matchResult.size());
- matchRes= matchResult.get(0);
- Assert.assertEquals("Nectandra", matchRes.getFirstResult().getGenusOrUninomial());
- Assert.assertEquals("surinamensis", matchRes.getFirstResult().getSpecificEpithet());
- Assert.assertEquals(27, (int) matchRes.getFirstResult().getTaxonNameId());
- Assert.assertEquals("b184664e-798b-4b50-8807-2163a4de796c", matchRes.getFirstResult().getTaxonNameUuid().toString());
- Assert.assertEquals(0,(int) matchRes.getSecondResult());
-
- matchRes= matchResult.get(1);
- Assert.assertEquals("Nectandra", matchRes.getFirstResult().getGenusOrUninomial());
- Assert.assertEquals("surinamensis", matchRes.getFirstResult().getSpecificEpithet());
- Assert.assertEquals(28, (int) matchRes.getFirstResult().getTaxonNameId());
- Assert.assertEquals("b9c8c3ba-bc78-4229-ae7d-b3f7bf23ec85", matchRes.getFirstResult().getTaxonNameUuid().toString());
- Assert.assertEquals(0,(int) matchRes.getSecondResult());
-
-
- // if the query does not have an exact match on the DB, return the best matches
-
- inputName = "Nectendra nigre";
- matchResult = nameService.findMatchingNames(inputName, null, null);
- Assert.assertEquals(2, matchResult.size());
- matchRes= matchResult.get(0);
- Assert.assertEquals("Nectandra", matchRes.getFirstResult().getGenusOrUninomial());
- Assert.assertEquals("nigra", matchRes.getFirstResult().getSpecificEpithet());
- Assert.assertEquals(21, (int)matchRes.getFirstResult().getTaxonNameId());
- Assert.assertEquals("cae90b7a-5deb-4838-940f-f85bb685286e", matchRes.getFirstResult().getTaxonNameUuid().toString());
- Assert.assertEquals(2,(int) matchRes.getSecondResult());
-
- matchRes= matchResult.get(1);
- Assert.assertEquals("Nectandra", matchRes.getFirstResult().getGenusOrUninomial());
- Assert.assertEquals("nigrita", matchRes.getFirstResult().getSpecificEpithet());
- Assert.assertEquals(22, (int)matchRes.getFirstResult().getTaxonNameId());
- Assert.assertEquals("8ad82243-b902-4eb6-990d-59774454b6e7", matchRes.getFirstResult().getTaxonNameUuid().toString());
- Assert.assertEquals(4,(int) matchRes.getSecondResult());
-
- // if the query does not include an epithet
-
- inputName = "Nectandra";
- matchResult = nameService.findMatchingNames(inputName, null, null);
- Assert.assertEquals(20, matchResult.size());
- matchRes= matchResult.get(0);
- Assert.assertEquals("Nectandra", matchRes.getFirstResult().getGenusOrUninomial());
- Assert.assertEquals("abortiens", matchRes.getFirstResult().getSpecificEpithet());
- Assert.assertEquals(10, (int)matchRes.getFirstResult().getTaxonNameId());
- Assert.assertEquals("6dbd41d1-fe13-4d9c-bb58-31f051c2c384", matchRes.getFirstResult().getTaxonNameUuid().toString());
- Assert.assertEquals(0,(int) matchRes.getSecondResult());
-
- inputName = "Nectondra";
- matchResult = nameService.findMatchingNames(inputName, null, null);
- Assert.assertEquals(20, matchResult.size());
- matchRes= matchResult.get(1);
- Assert.assertEquals("Nectandra", matchRes.getFirstResult().getGenusOrUninomial());
- Assert.assertEquals("acuminata", matchRes.getFirstResult().getSpecificEpithet());
- Assert.assertEquals(11, (int)matchRes.getFirstResult().getTaxonNameId());
- Assert.assertEquals("f9e9c13f-5fa5-48d3-88cf-712c921a099e", matchRes.getFirstResult().getTaxonNameUuid().toString());
- Assert.assertEquals(1,(int) matchRes.getSecondResult());
-
-
- }
}
\ No newline at end of file
+++ /dev/null
-/**
-* Copyright (C) 2023 EDIT
-* European Distributed Institute of Taxonomy
-* http://www.e-taxonomy.eu
-*
-* The contents of this file are subject to the Mozilla Public License Version 1.1
-* See LICENSE.TXT at the top of this package for the full license terms.
-*/
-package eu.etaxonomy.cdm.api.service;
-
-import org.junit.Assert;
-import org.junit.Test;
-
-/**
- * @author andreabee90
- * @since 30.05.2023
- */
-public class NameServiceImplementBelenTest {
-
- @Test
- public void testReplaceInitialCharacter() {
-
-
- String name = "euphorbia";
- Assert.assertEquals("uphorbia", NameServiceImplementBelen.replaceInitialCharacter(name));
- name = "Cnemidia";
- Assert.assertEquals("nemidia", NameServiceImplementBelen.replaceInitialCharacter(name));
- name = "Gnaphalium";
- Assert.assertEquals("naphalium", NameServiceImplementBelen.replaceInitialCharacter(name));
- name = "Philodendron";
- Assert.assertEquals("filodendron", NameServiceImplementBelen.replaceInitialCharacter(name));
- name = "Tsuga";
- Assert.assertEquals("suga", NameServiceImplementBelen.replaceInitialCharacter(name));
- name = "Czerniaevia";
- Assert.assertEquals("cerniaevia", NameServiceImplementBelen.replaceInitialCharacter(name));
- }
-
- @Test
- public void testTrimCommonChar() {
-
- String query ="Nectandra";
- String document = "Nectalisma";
-
- Assert.assertEquals("ndr", NameServiceImplementBelen.trimCommonChar(query, document).split(" ")[0]);
- Assert.assertEquals("lism", NameServiceImplementBelen.trimCommonChar(query, document).split(" ")[1]);
-
- Assert.assertEquals("Equal input should return empty result",
- "", NameServiceImplementBelen.trimCommonChar(query, query) );
- }
-}
\ No newline at end of file