From fda35d40a4141ad22957475e05043cf6b6c5c9fe Mon Sep 17 00:00:00 2001 From: =?utf8?q?Andreas=20M=C3=BCller?= Date: Thu, 1 Feb 2024 22:50:59 +0100 Subject: [PATCH] ref#10446 normalize to IPNI standard --- .../cdm/io/wfo/out/WfoBackboneExport.java | 29 +++++++++------- .../out/WfoBackboneExportConfigurator.java | 9 +++++ .../cache/agent/TeamDefaultCacheStrategy.java | 26 +++++++++++++- .../agent/TeamDefaultCacheStrategyTest.java | 34 +++++++++++++++++++ 4 files changed, 85 insertions(+), 13 deletions(-) diff --git a/cdmlib-io/src/main/java/eu/etaxonomy/cdm/io/wfo/out/WfoBackboneExport.java b/cdmlib-io/src/main/java/eu/etaxonomy/cdm/io/wfo/out/WfoBackboneExport.java index 906c8dcb56..c888fa221a 100644 --- a/cdmlib-io/src/main/java/eu/etaxonomy/cdm/io/wfo/out/WfoBackboneExport.java +++ b/cdmlib-io/src/main/java/eu/etaxonomy/cdm/io/wfo/out/WfoBackboneExport.java @@ -63,6 +63,7 @@ import eu.etaxonomy.cdm.model.taxon.TaxonBase; import eu.etaxonomy.cdm.model.taxon.TaxonNode; import eu.etaxonomy.cdm.model.taxon.TaxonNodeStatus; import eu.etaxonomy.cdm.model.term.IdentifierType; +import eu.etaxonomy.cdm.strategy.cache.agent.TeamDefaultCacheStrategy; /** * Classification or taxon tree exporter into WFO Backbone format. @@ -639,7 +640,8 @@ public class WfoBackboneExport //authorship //TODO 3 handle empty authorship cache warning - csvLine[table.getIndex(WfoBackboneExportTable.NAME_AUTHORSHIP)] = normalizedAuthor(name); + csvLine[table.getIndex(WfoBackboneExportTable.NAME_AUTHORSHIP)] + = normalizedAuthor(state, name); //family (use familystr if provided, otherwise try to compute from the family taxon String familyStr = state.getFamilyStr(); @@ -707,17 +709,19 @@ public class WfoBackboneExport return wfoId; } - //TODO 2 make it public somewhere in author formatter - private String normalizedAuthor(TaxonName name) { - if (isBlank(name.getAuthorshipCache())) { + private String normalizedAuthor(WfoBackboneExportState state, TaxonName name) { + if (name == null) { return null; + } else if (state.getConfig().isNormalizeAuthorsToIpniStandard()) { + return TeamDefaultCacheStrategy.removeWhitespaces(name.getAuthorshipCache()); + } else { + String result = name.getAuthorshipCache(); + if (result == null) { + return null; + }else { + return result.replaceAll("\\s+", " ").trim(); + } } - String result = name.getAuthorshipCache(); - result = result.replaceAll("\\.\\s+", ".") - .replaceAll("\\.\\&", ". &") - .replaceAll("\\.ex\\s+", ". ex ") - ; - return result; } private Set getOrthographicVariants(TaxonName name) { @@ -759,8 +763,9 @@ public class WfoBackboneExport //authorship, take from mainname if it does not exist //TODO 3 take from csvLine of both names - if (isBlank(normalizedAuthor(name))) { - csvLine[table.getIndex(WfoBackboneExportTable.NAME_AUTHORSHIP)] = normalizedAuthor(mainName); + if (isBlank(normalizedAuthor(state, name))) { + csvLine[table.getIndex(WfoBackboneExportTable.NAME_AUTHORSHIP)] + = normalizedAuthor(state, mainName); } //nom. ref, take from main name if it does not exist diff --git a/cdmlib-io/src/main/java/eu/etaxonomy/cdm/io/wfo/out/WfoBackboneExportConfigurator.java b/cdmlib-io/src/main/java/eu/etaxonomy/cdm/io/wfo/out/WfoBackboneExportConfigurator.java index 5276a58216..be9580c718 100644 --- a/cdmlib-io/src/main/java/eu/etaxonomy/cdm/io/wfo/out/WfoBackboneExportConfigurator.java +++ b/cdmlib-io/src/main/java/eu/etaxonomy/cdm/io/wfo/out/WfoBackboneExportConfigurator.java @@ -40,6 +40,8 @@ public class WfoBackboneExportConfigurator private String sourceLinkBaseUrl = null; + private boolean normalizeAuthorsToIpniStandard = true; + private static final WfoBackboneExportTransformer transformer = new WfoBackboneExportTransformer(); //************************* FACTORY ******************************/ @@ -147,4 +149,11 @@ public class WfoBackboneExportConfigurator public void setSourceLinkBaseUrl(String sourceLinkBaseUrl) { this.sourceLinkBaseUrl = sourceLinkBaseUrl; } + + public boolean isNormalizeAuthorsToIpniStandard() { + return normalizeAuthorsToIpniStandard; + } + public void setNormalizeAuthorsToIpniStandard(boolean normalizeAuthorsToIpniStandard) { + this.normalizeAuthorsToIpniStandard = normalizeAuthorsToIpniStandard; + } } \ No newline at end of file diff --git a/cdmlib-model/src/main/java/eu/etaxonomy/cdm/strategy/cache/agent/TeamDefaultCacheStrategy.java b/cdmlib-model/src/main/java/eu/etaxonomy/cdm/strategy/cache/agent/TeamDefaultCacheStrategy.java index 8d3cc75089..0615374e70 100644 --- a/cdmlib-model/src/main/java/eu/etaxonomy/cdm/strategy/cache/agent/TeamDefaultCacheStrategy.java +++ b/cdmlib-model/src/main/java/eu/etaxonomy/cdm/strategy/cache/agent/TeamDefaultCacheStrategy.java @@ -22,7 +22,9 @@ import eu.etaxonomy.cdm.strategy.StrategyBase; /** * @author AM */ -public class TeamDefaultCacheStrategy extends StrategyBase implements INomenclaturalAuthorCacheStrategy { +public class TeamDefaultCacheStrategy + extends StrategyBase + implements INomenclaturalAuthorCacheStrategy { private static final long serialVersionUID = 8375295443642690479L; @SuppressWarnings("unused") @@ -220,4 +222,26 @@ public class TeamDefaultCacheStrategy extends StrategyBase implements INomenclat return str + ET_AL_TEAM_CONCATINATION_ABBREV + "al."; } + /** + * Removes the whitespaces in an authorship string + * to be compliant with IPNI abbreviated authorship + * standard. + * @param authorship + * @return the authorship without certain whitespaces + */ + public static String removeWhitespaces(String authorship) { + if (authorship == null) { + return null; + } + String result = authorship + .replaceAll("\\.\\s+", ".") //remove whitespace after "." + .replaceAll("\\.\\&", ". &") //... but add whitespace before "&" + .replaceAll("\\.ex\\s", ". ex ") //...and add whitespace between "." and "ex " + .replaceAll("\\s+", " ") //replace multiple whitespaces by a single one + .replaceAll("\\s+,", ",") //remove whitespaces before "," + .trim() //trim + ; + return result; + } + } \ No newline at end of file diff --git a/cdmlib-model/src/test/java/eu/etaxonomy/cdm/strategy/cache/agent/TeamDefaultCacheStrategyTest.java b/cdmlib-model/src/test/java/eu/etaxonomy/cdm/strategy/cache/agent/TeamDefaultCacheStrategyTest.java index e1f96f4775..21def97780 100644 --- a/cdmlib-model/src/test/java/eu/etaxonomy/cdm/strategy/cache/agent/TeamDefaultCacheStrategyTest.java +++ b/cdmlib-model/src/test/java/eu/etaxonomy/cdm/strategy/cache/agent/TeamDefaultCacheStrategyTest.java @@ -239,4 +239,38 @@ public class TeamDefaultCacheStrategyTest { person1.setGivenName("O."); Assert.assertEquals("team1 title cache should be P1FN, O.", "P1FN, O.", team1.getTitleCache()); } + + @Test + public final void testRemoveWhitespaces() { + String author = null; + Assert.assertEquals(null, TeamDefaultCacheStrategy.removeWhitespaces(author)); + + author = " "; + Assert.assertEquals("", TeamDefaultCacheStrategy.removeWhitespaces(author)); + + author = "Mill. "; + Assert.assertEquals("Mill.", TeamDefaultCacheStrategy.removeWhitespaces(author)); + + author = " Miller "; + Assert.assertEquals("Result should always be trimed", "Miller", TeamDefaultCacheStrategy.removeWhitespaces(author)); + + author = "A. Mill."; + Assert.assertEquals("A.Mill.", TeamDefaultCacheStrategy.removeWhitespaces(author)); + + author = "A. Mill."; + Assert.assertEquals("A.Mill.", TeamDefaultCacheStrategy.removeWhitespaces(author)); + + author = "A. Mill."; + Assert.assertEquals("A.Mill.", TeamDefaultCacheStrategy.removeWhitespaces(author)); + + author = "A. Mill. & B. Kohl.-Haber"; + Assert.assertEquals("A.Mill. & B.Kohl.-Haber", TeamDefaultCacheStrategy.removeWhitespaces(author)); + + author = "A. Mill. ,J. N. Bohl. f.& B. Kohl.-Haber"; + Assert.assertEquals("A.Mill.,J.N.Bohl.f. & B.Kohl.-Haber", TeamDefaultCacheStrategy.removeWhitespaces(author)); + + author = " (Ab. ex CD. , All , Bet & J.Vall.) A. Mill.ex Kohl."; + Assert.assertEquals("(Ab. ex CD., All, Bet & J.Vall.) A.Mill. ex Kohl.", TeamDefaultCacheStrategy.removeWhitespaces(author)); + + } } -- 2.34.1