From 5e52d4ec10f1bf3fa1ce10019c55cbf0aadfad80 Mon Sep 17 00:00:00 2001 From: Andreas Kohlbecker Date: Thu, 15 Sep 2016 17:32:37 +0200 Subject: [PATCH] ref #6026 solving problems in date and specimenType parsers, also implementing tests --- .../cdm/io/iapt/IAPTExcelImport.java | 35 ++++++--- .../etaxonomy/cdm/io/iapt/IAPTImportTest.java | 76 +++++++++++++++++++ 2 files changed, 99 insertions(+), 12 deletions(-) create mode 100644 app-import/src/test/java/eu/etaxonomy/cdm/io/iapt/IAPTImportTest.java diff --git a/app-import/src/main/java/eu/etaxonomy/cdm/io/iapt/IAPTExcelImport.java b/app-import/src/main/java/eu/etaxonomy/cdm/io/iapt/IAPTExcelImport.java index 4793a0c0..3e32113a 100644 --- a/app-import/src/main/java/eu/etaxonomy/cdm/io/iapt/IAPTExcelImport.java +++ b/app-import/src/main/java/eu/etaxonomy/cdm/io/iapt/IAPTExcelImport.java @@ -88,11 +88,11 @@ public class IAPTExcelImport extends Simp Pattern.compile("^(?\\p{L}+\\.?)\\s(?[0-9]{1,2})(?:st|rd|th)?\\.?,?\\s(?(?:1[7,8,9])?[0-9]{2})$"), // full date like April 12, 1969 or april 12th 1999 Pattern.compile("^(?\\p{L}+\\.?),?\\s?(?(?:1[7,8,9])?[0-9]{2})$"), // April 99 or April, 1999 or Apr. 12 Pattern.compile("^(?[0-9]{1,2})([\\.\\-/])(\\s?)(?[0-1]?[0-9])\\2\\3(?(?:1[7,8,9])?[0-9]{2})$"), // full date like 12.04.1969 or 12. 04. 1969 or 12/04/1969 or 12-04-1969 - Pattern.compile("^(?[0-9]{1,2})([\\.\\-/])(?[IVX]{1,2})\\2(?(?:1[7,8,9])?[0-9]{2})$"), // full date like 12-VI-1969 - Pattern.compile("^(?:(?[0-9]{1,2})(?:\\sde)\\s)(?\\p{L}+)\\sde\\s(?(?:1[7,8,9])?[0-9]{2})$"), // full and partial date like 12 de Enero de 1999 or Enero de 1999 + Pattern.compile("^(?[0-9]{1,2})([\\.\\-/])(?[IVX]{1,2})\\2(?(?:1[7,8,9])?[0-9]{2})$"), // full date like 12-VI-1969 + Pattern.compile("^(?:(?[0-9]{1,2})(?:\\sde)\\s)?(?\\p{L}+)\\sde\\s(?(?:1[7,8,9])?[0-9]{2})$"), // full and partial date like 12 de Enero de 1999 or Enero de 1999 Pattern.compile("^(?[0-1]?[0-9])([\\.\\-/])(?(?:1[7,8,9])?[0-9]{2})$"), // partial date like 04.1969 or 04/1969 or 04-1969 Pattern.compile("^(?(?:1[7,8,9])?[0-9]{2})([\\.\\-/])(?[0-1]?[0-9])$"),// partial date like 1999-04 - Pattern.compile("^(?[IVX]{1,2})([\\.\\-/])(?(?:1[7,8,9])?[0-9]{2})$"), // partial date like VI-1969 + Pattern.compile("^(?[IVX]{1,2})([\\.\\-/])(?(?:1[7,8,9])?[0-9]{2})$"), // partial date like VI-1969 Pattern.compile("^(?[0-9]{1,2})(?:[\\./]|th|rd|st)?\\s(?\\p{L}+\\.?),?\\s?(?(?:1[7,8,9])?[0-9]{2})$"), // full date like 12. April 1969 or april 1999 or 22 Dec.1999 }; private static final Pattern typeSpecimenSplitPattern = Pattern.compile("^(?:\"*[Tt]ype: (?.*?))(?:[Hh]olotype:(?.*?)\\.?)?(?:[Ii]sotype[^:]*:(?.*)\\.?)?\\.?$"); @@ -111,7 +111,8 @@ public class IAPTExcelImport extends Simp private static final Pattern[] specimenTypePatterns = new Pattern[]{ Pattern.compile("^(?[A-Z]+|CPC Micropaleontology Lab\\.?)\\s+(?:\\((?.*[^\\)])\\))(?.*)?$"), // like: GAUF (Gansu Agricultural University) No. 1207-1222 Pattern.compile("^(?[A-Z]+|CPC Micropaleontology Lab\\.?)\\s+(?:Coll\\.\\s(?[^\\.,;]*)(.))(?.*)?$"), // like KASSEL Coll. Krasske, Praep. DII 78 - Pattern.compile("^(?:Coll\\.\\s.*?)\\s(?Praep\\..*)?$"), // like Coll. Lange-Bertalot, Bot. Inst., Univ. Frankfurt/Main, Germany Praep. Neukaledonien OTL 62 + Pattern.compile("^(?Coll\\.\\s.*?)\\s+(?(Praep|slide).*)?$"), // like Coll. Lange-Bertalot, Bot. Inst., Univ. Frankfurt/Main, Germany Praep. Neukaledonien OTL 62 + // Pattern.compile("^.*(?Praep.*)$"), // like Coll. Lange-Bertalot, Bot. Inst., Univ. Frankfurt/Main, Germany Praep. Neukaledonien OTL 62 Pattern.compile("^(?[A-Z]+)(?:\\s+(?.*))?$"), // identifies the Collection code and takes the rest as accessionNumber if any }; @@ -171,6 +172,11 @@ public class IAPTExcelImport extends Simp private Reference bookVariedadesTradicionales = null; + /** + * HACK for unit simple testing + */ + boolean _testMode = System.getProperty("TEST_MODE") != null; + private Taxon makeTaxon(HashMap record, SimpleExcelTaxonImportState state, TaxonNode higherTaxonNode, boolean isFossil) { @@ -555,7 +561,7 @@ public class IAPTExcelImport extends Simp return fieldUnit; } - private Partial parseDate(String regNumber, String dateStr) { + protected Partial parseDate(String regNumber, String dateStr) { Partial pupDate = null; boolean parseError = false; @@ -700,11 +706,12 @@ public class IAPTExcelImport extends Simp * @param regNumber * @return */ - private DerivedUnit parseSpecimenType(FieldUnit fieldUnit, TypesName typeName, Collection collection, String text, String regNumber) { + protected DerivedUnit parseSpecimenType(FieldUnit fieldUnit, TypesName typeName, Collection collection, String text, String regNumber) { DerivedUnit specimen = null; String collectionCode = null; + String collectionTitle = null; String subCollectionStr = null; String instituteStr = null; String accessionNumber = null; @@ -732,21 +739,23 @@ public class IAPTExcelImport extends Simp for (Pattern p : specimenTypePatterns) { Matcher m = p.matcher(text); if (m.matches()) { - // collection code is mandatory + // collection code or collectionTitle is mandatory try { collectionCode = m.group("colCode"); } catch (IllegalArgumentException e){ // match group colCode not found } + try { - subCollectionStr = m.group("subCollection"); + instituteStr = m.group("institute"); } catch (IllegalArgumentException e){ - // match group subCollection not found + // match group col_name not found } + try { - instituteStr = m.group("institute"); + subCollectionStr = m.group("subCollection"); } catch (IllegalArgumentException e){ - // match group col_name not found + // match group subCollection not found } try { accessionNumber = m.group("accNumber"); @@ -947,7 +956,9 @@ public class IAPTExcelImport extends Simp collection.setSuperCollection(superCollection); } collectionMap.put(key, collection); - getCollectionService().save(collection); + if(!_testMode) { + getCollectionService().save(collection); + } } return collection; diff --git a/app-import/src/test/java/eu/etaxonomy/cdm/io/iapt/IAPTImportTest.java b/app-import/src/test/java/eu/etaxonomy/cdm/io/iapt/IAPTImportTest.java new file mode 100644 index 00000000..2bdc5f0e --- /dev/null +++ b/app-import/src/test/java/eu/etaxonomy/cdm/io/iapt/IAPTImportTest.java @@ -0,0 +1,76 @@ +package eu.etaxonomy.cdm.io.iapt; + +import eu.etaxonomy.cdm.model.occurrence.Collection; +import eu.etaxonomy.cdm.model.occurrence.FieldUnit; +import org.junit.Before; +import org.junit.Test; +import org.springframework.util.Assert; + +/** + * Created by andreas on 9/15/16. + */ +public class IAPTImportTest { + + IAPTExcelImport importer = null; + + @Before + public void setup(){ + System.getProperties().put("TEST_MODE", "1"); + importer = new IAPTExcelImport(); + } + + @Test + public void testDateParser(){ + + String[] dateStrings = new String[]{ + "April 12, 1969", + "april 12th 1999", + "April 99", + "April, 1999", + "Apr. 12", + "12.04.1969", + "12. 04. 1969", + "12/04/1969", + "12-04-1969", + "12 de Enero de 1999", + "Enero de 1999", + "04.1969", + "04/1969", + "04-1969", + "1999-04", + "VI-1969", + "12-VI-1969", + "12. April 1969", + "april 1999", + "22 Dec.1999" + }; + + for (String d: dateStrings) { + Assert.notNull(importer.parseDate("0", d), "Could not parse " + d); + } + } + + @Test + public void testSpecimentTypeParser(){ + + FieldUnit fu = FieldUnit.NewInstance(); + Collection collection = null; + + String[] typeStrings = new String[]{ + "Coll. Lange-Bertalot, Bot. Inst., Univ. Frankfurt/Main, Germany Praep. No. Eu-PL 72", + "LE 1700b-114", + "AD 99530159", + "STU P 1425", + "GAUF (Gansu Agricultural University) No. 1207-1222", + "KASSEL Coll. Krasske, Praep. DII 78", + "Coll. Lange-Bertalot, Botanisches Institut, Frankfurt am Main slide Eh-B 91", + "Coll. Østrup, Botan. Museum Copenhagen, Dänemark Praep. 3944", + + + }; + for (String t: typeStrings) { + Assert.notNull(importer.parseSpecimenType(fu, IAPTExcelImport.TypesName.holotype, collection, t, "0"), "Could not parse: " + t); + } + + } +} -- 2.34.1