ref #6026 improving specimenType parsers: better recognition of 'Coll. something...
[cdmlib-apps.git] / app-import / src / main / java / eu / etaxonomy / cdm / io / iapt / IAPTExcelImport.java
index f9a2362fddf255e75022d058ef2b78b5095797dd..80d98addc6fc13bb49402e3778e91116ea7659ca 100644 (file)
@@ -10,6 +10,7 @@
 package eu.etaxonomy.cdm.io.iapt;
 
 import eu.etaxonomy.cdm.api.facade.DerivedUnitFacade;
+import eu.etaxonomy.cdm.api.service.pager.Pager;
 import eu.etaxonomy.cdm.common.CdmUtils;
 import eu.etaxonomy.cdm.io.mexico.SimpleExcelTaxonImport;
 import eu.etaxonomy.cdm.io.mexico.SimpleExcelTaxonImportState;
@@ -87,14 +88,14 @@ public class IAPTExcelImport<CONFIG extends IAPTImportConfigurator> extends Simp
             Pattern.compile("^(?<monthName>\\p{L}+\\.?)\\s(?<day>[0-9]{1,2})(?:st|rd|th)?\\.?,?\\s(?<year>(?:1[7,8,9])?[0-9]{2})$"), // full date like April 12, 1969 or april 12th 1999
             Pattern.compile("^(?<monthName>\\p{L}+\\.?),?\\s?(?<year>(?:1[7,8,9])?[0-9]{2})$"), // April 99 or April, 1999 or Apr. 12
             Pattern.compile("^(?<day>[0-9]{1,2})([\\.\\-/])(\\s?)(?<month>[0-1]?[0-9])\\2\\3(?<year>(?:1[7,8,9])?[0-9]{2})$"), // full date like 12.04.1969 or 12. 04. 1969 or 12/04/1969 or 12-04-1969
-            Pattern.compile("^(?<day>[0-9]{1,2})([\\.\\-/])(?<month>[IVX]{1,2})\\2(?<year>(?:1[7,8,9])?[0-9]{2})$"), // full date like 12-VI-1969
-            Pattern.compile("^(?:(?<day>[0-9]{1,2})(?:\\sde)\\s)(?<monthName>\\p{L}+)\\sde\\s(?<year>(?:1[7,8,9])?[0-9]{2})$"), // full and partial date like 12 de Enero de 1999 or Enero de 1999
+            Pattern.compile("^(?<day>[0-9]{1,2})([\\.\\-/])(?<monthName>[IVX]{1,2})\\2(?<year>(?:1[7,8,9])?[0-9]{2})$"), // full date like 12-VI-1969
+            Pattern.compile("^(?:(?<day>[0-9]{1,2})(?:\\sde)\\s)?(?<monthName>\\p{L}+)\\sde\\s(?<year>(?:1[7,8,9])?[0-9]{2})$"), // full and partial date like 12 de Enero de 1999 or Enero de 1999
             Pattern.compile("^(?<month>[0-1]?[0-9])([\\.\\-/])(?<year>(?:1[7,8,9])?[0-9]{2})$"), // partial date like 04.1969 or 04/1969 or 04-1969
             Pattern.compile("^(?<year>(?:1[7,8,9])?[0-9]{2})([\\.\\-/])(?<month>[0-1]?[0-9])$"),//  partial date like 1999-04
-            Pattern.compile("^(?<month>[IVX]{1,2})([\\.\\-/])(?<year>(?:1[7,8,9])?[0-9]{2})$"), // partial date like VI-1969
+            Pattern.compile("^(?<monthName>[IVX]{1,2})([\\.\\-/])(?<year>(?:1[7,8,9])?[0-9]{2})$"), // partial date like VI-1969
             Pattern.compile("^(?<day>[0-9]{1,2})(?:[\\./]|th|rd|st)?\\s(?<monthName>\\p{L}+\\.?),?\\s?(?<year>(?:1[7,8,9])?[0-9]{2})$"), // full date like 12. April 1969 or april 1999 or 22 Dec.1999
         };
-    private static final Pattern typeSpecimenSplitPattern =  Pattern.compile("^(?:\"*[Tt]ype: (?<fieldUnit>.*?))(?:[Hh]olotype:(?<holotype>.*?)\\.?)?(?:[Ii]sotype[^:]*:(?<isotype>.*)\\.?)?\\.?$");
+    private static final Pattern typeSpecimenSplitPattern =  Pattern.compile("^(?:\"*[Tt]ype: (?<fieldUnit>.*?))(?:[Hh]olotype:(?<holotype>.*?)\\.?)?(?:[Ii]sotype.*?[:\\(](?<isotype>.*)\\.?)?\\.?$");
 
     private static final Pattern typeNameBasionymPattern =  Pattern.compile("\\([Bb]asionym\\s?\\:\\s?(?<basionymName>[^\\)]*).*$");
     private static final Pattern typeNameNotePattern =  Pattern.compile("\\[([^\\[]*)"); // matches the inner of '[...]'
@@ -110,7 +111,8 @@ public class IAPTExcelImport<CONFIG extends IAPTImportConfigurator> extends Simp
     private static final Pattern[] specimenTypePatterns = new Pattern[]{
             Pattern.compile("^(?<colCode>[A-Z]+|CPC Micropaleontology Lab\\.?)\\s+(?:\\((?<institute>.*[^\\)])\\))(?<accNumber>.*)?$"), // like: GAUF (Gansu Agricultural University) No. 1207-1222
             Pattern.compile("^(?<colCode>[A-Z]+|CPC Micropaleontology Lab\\.?)\\s+(?:Coll\\.\\s(?<subCollection>[^\\.,;]*)(.))(?<accNumber>.*)?$"), // like KASSEL Coll. Krasske, Praep. DII 78
-            Pattern.compile("^(?:Coll\\.\\s(?<subCollection>[^\\.,;]*)(.))(?<institute>.*?)(?<accNumber>Praep\\..*)?$"), // like Coll. Lange-Bertalot, Bot. Inst., Univ. Frankfurt/Main, Germany Praep. Neukaledonien OTL 62
+            Pattern.compile("^(?:in\\s)?(?<institute>[Cc]oll\\.\\s.*?)(?:\\s+(?<accNumber>(Praep\\.|slide|No\\.|Inv\\. Nr\\.|Nr\\.).*))?$"), // like Coll. Lange-Bertalot, Bot. Inst., Univ. Frankfurt/Main, Germany Praep. Neukaledonien OTL 62
+            Pattern.compile("^(?<institute>Inst\\.\\s.*?)\\s+(?<accNumber>N\\s.*)?$"), // like Inst. Geological Sciences, Acad. Sci. Belarus, Minsk N 212 A
             Pattern.compile("^(?<colCode>[A-Z]+)(?:\\s+(?<accNumber>.*))?$"), // identifies the Collection code and takes the rest as accessionNumber if any
     };
 
@@ -170,6 +172,11 @@ public class IAPTExcelImport<CONFIG extends IAPTImportConfigurator> extends Simp
 
     private Reference bookVariedadesTradicionales = null;
 
+    /**
+     * HACK for simple unit testing
+     */
+    boolean _testMode = System.getProperty("TEST_MODE") != null;
+
     private Taxon makeTaxon(HashMap<String, String> record, SimpleExcelTaxonImportState<CONFIG> state,
                             TaxonNode higherTaxonNode, boolean isFossil) {
 
@@ -196,6 +203,7 @@ public class IAPTExcelImport<CONFIG extends IAPTImportConfigurator> extends Simp
         Partial pupDate = null;
 
         boolean restoreOriginalReference = false;
+        boolean nameIsValid = true;
 
         // preprocess nomRef: separate citation, reference detail, publishing date
         if(!StringUtils.isEmpty(nomRefStr)){
@@ -257,10 +265,18 @@ public class IAPTExcelImport<CONFIG extends IAPTImportConfigurator> extends Simp
         if(!StringUtils.isEmpty(notesTxt)){
             notesTxt = notesTxt.replace("Notes: ", "").trim();
             taxonName.addAnnotation(Annotation.NewInstance(notesTxt, AnnotationType.EDITORIAL(), Language.DEFAULT()));
+            nameIsValid = false;
+
         }
         if(!StringUtils.isEmpty(caveats)){
             caveats = caveats.replace("Caveats: ", "").trim();
             taxonName.addAnnotation(Annotation.NewInstance(caveats, annotationTypeCaveats(), Language.DEFAULT()));
+            nameIsValid = false;
+        }
+
+        if(nameIsValid){
+            // Status is always considered valid if neither notes nor caveats are set
+            taxonName.addStatus(NomenclaturalStatus.NewInstance(NomenclaturalStatusType.VALID()));
         }
 
         getNameService().save(taxonName);
@@ -331,11 +347,26 @@ public class IAPTExcelImport<CONFIG extends IAPTImportConfigurator> extends Simp
         }
 
         getTaxonService().save(taxon);
+
+        if(taxonName.getRank().equals(Rank.SPECIES()) || taxonName.getRank().isLower(Rank.SPECIES())){
+            // try to find the genus; it should have been imported already, since genera come first in the import file
+            Taxon genus = ((IAPTImportState)state).getGenusTaxonMap().get(taxonName.getGenusOrUninomial());
+            if(genus != null){
+                higherTaxonNode = genus.getTaxonNodes().iterator().next();
+            } else {
+                logger.info(csvReportLine(regNumber, "Parent genus not found for", nameStr));
+            }
+        }
+
         if(higherTaxonNode != null){
             higherTaxonNode.addChildTaxon(taxon, null, null);
             getTaxonNodeService().save(higherTaxonNode);
         }
 
+        if(taxonName.getRank().isGenus()){
+            ((IAPTImportState)state).getGenusTaxonMap().put(taxonName.getGenusOrUninomial(), taxon);
+        }
+
         return taxon;
     }
 
@@ -530,7 +561,7 @@ public class IAPTExcelImport<CONFIG extends IAPTImportConfigurator> extends Simp
         return fieldUnit;
     }
 
-    private Partial parseDate(String regNumber, String dateStr) {
+    protected Partial parseDate(String regNumber, String dateStr) {
 
         Partial pupDate = null;
         boolean parseError = false;
@@ -675,11 +706,12 @@ public class IAPTExcelImport<CONFIG extends IAPTImportConfigurator> extends Simp
      * @param regNumber
      * @return
      */
-    private DerivedUnit parseSpecimenType(FieldUnit fieldUnit, TypesName typeName, Collection collection, String text, String regNumber) {
+    protected DerivedUnit parseSpecimenType(FieldUnit fieldUnit, TypesName typeName, Collection collection, String text, String regNumber) {
 
         DerivedUnit specimen = null;
 
         String collectionCode = null;
+        String collectionTitle = null;
         String subCollectionStr = null;
         String instituteStr = null;
         String accessionNumber = null;
@@ -707,21 +739,23 @@ public class IAPTExcelImport<CONFIG extends IAPTImportConfigurator> extends Simp
             for (Pattern p : specimenTypePatterns) {
                 Matcher m = p.matcher(text);
                 if (m.matches()) {
-                    // collection code is mandatory
+                    // collection code or collectionTitle is mandatory
                     try {
                         collectionCode = m.group("colCode");
                     } catch (IllegalArgumentException e){
                         // match group colCode not found
                     }
+
                     try {
-                        subCollectionStr = m.group("subCollection");
+                        instituteStr = m.group("institute");
                     } catch (IllegalArgumentException e){
-                        // match group subCollection not found
+                        // match group institute not found
                     }
+
                     try {
-                        instituteStr = m.group("institute");
+                        subCollectionStr = m.group("subCollection");
                     } catch (IllegalArgumentException e){
-                        // match group col_name not found
+                        // match group subCollection not found
                     }
                     try {
                         accessionNumber = m.group("accNumber");
@@ -854,8 +888,12 @@ public class IAPTExcelImport<CONFIG extends IAPTImportConfigurator> extends Simp
             for(String text : nameAnnotations.keySet()){
                 taxonName.addAnnotation(Annotation.NewInstance(text, nameAnnotations.get(text), Language.DEFAULT()));
             }
-            getNameService().save(taxonName);
         }
+
+        taxonName.addSource(OriginalSourceType.Import, regNumber, null, state.getConfig().getSourceReference(), null);
+
+        getNameService().save(taxonName);
+
         return taxonName;
     }
 
@@ -918,7 +956,9 @@ public class IAPTExcelImport<CONFIG extends IAPTImportConfigurator> extends Simp
                 collection.setSuperCollection(superCollection);
             }
             collectionMap.put(key, collection);
-            getCollectionService().save(collection);
+            if(!_testMode) {
+                getCollectionService().save(collection);
+            }
         }
 
         return collection;
@@ -998,6 +1038,15 @@ public class IAPTExcelImport<CONFIG extends IAPTImportConfigurator> extends Simp
         value = StringUtils.replace(value, "c$k", "č");
         value = StringUtils.replace(value, " U$K", " Š");
 
+        value = StringUtils.replace(value, "O>U>!", "Ø");
+        value = StringUtils.replace(value, "o>!", "ø");
+        value = StringUtils.replace(value, "S$K", "Ŝ");
+        value = StringUtils.replace(value, ">l", "ğ");
+
+        value = StringUtils.replace(value, "§B>i", "ł");
+
+
+
         return value;
     }
 
@@ -1040,6 +1089,7 @@ public class IAPTExcelImport<CONFIG extends IAPTImportConfigurator> extends Simp
         ((IAPTImportState)state).setCurrentTaxon(taxon);
 
 
+        logger.info("#of imported Genera: " + ((IAPTImportState) state).getGenusTaxonMap().size());
                return;
     }
 
@@ -1087,9 +1137,9 @@ public class IAPTExcelImport<CONFIG extends IAPTImportConfigurator> extends Simp
         } else if(name.matches("^Incertae sedis$|^No group assigned$")){
            return rankFamilyIncertisSedis();
         } else if(name.matches(".*phyta$|.*mycota$")){
-           return Rank.SECTION_BOTANY();
+           return Rank.PHYLUM();
         } else if(name.matches(".*phytina$|.*mycotina$")){
-           return Rank.SUBSECTION_BOTANY();
+           return Rank.SUBPHYLUM();
         } else if(name.matches("Gymnospermae$|.*ones$")){ // Monocotyledones, Dicotyledones
             return rankUnrankedSupraGeneric();
         } else if(name.matches(".*opsida$|.*phyceae$|.*mycetes$|.*ones$|^Musci$|^Hepaticae$")){