ref #6026 fixing rank
[cdmlib-apps.git] / app-import / src / main / java / eu / etaxonomy / cdm / io / iapt / IAPTExcelImport.java
index c80551f00b2c4a1dd597cf0905c7874ca5e9d87d..37ca3781d302d1e00c9d930beb72bf44ed3cfbc5 100644 (file)
@@ -13,14 +13,19 @@ import eu.etaxonomy.cdm.api.facade.DerivedUnitFacade;
 import eu.etaxonomy.cdm.common.CdmUtils;
 import eu.etaxonomy.cdm.io.mexico.SimpleExcelTaxonImport;
 import eu.etaxonomy.cdm.io.mexico.SimpleExcelTaxonImportState;
+import eu.etaxonomy.cdm.model.agent.Institution;
+import eu.etaxonomy.cdm.model.agent.Person;
+import eu.etaxonomy.cdm.model.agent.TeamOrPersonBase;
 import eu.etaxonomy.cdm.model.common.*;
 import eu.etaxonomy.cdm.model.name.*;
-import eu.etaxonomy.cdm.model.occurrence.DerivedUnit;
-import eu.etaxonomy.cdm.model.occurrence.FieldUnit;
-import eu.etaxonomy.cdm.model.occurrence.SpecimenOrObservationType;
+import eu.etaxonomy.cdm.model.occurrence.*;
+import eu.etaxonomy.cdm.model.occurrence.Collection;
 import eu.etaxonomy.cdm.model.reference.Reference;
+import eu.etaxonomy.cdm.model.reference.ReferenceFactory;
+import eu.etaxonomy.cdm.model.reference.ReferenceType;
 import eu.etaxonomy.cdm.model.taxon.*;
 import eu.etaxonomy.cdm.strategy.parser.NonViralNameParserImpl;
+import eu.etaxonomy.cdm.strategy.parser.ParserProblem;
 import org.apache.commons.lang.ArrayUtils;
 import org.apache.commons.lang.StringEscapeUtils;
 import org.apache.commons.lang.StringUtils;
@@ -28,6 +33,8 @@ import org.apache.log4j.Level;
 import org.apache.log4j.Logger;
 import org.joda.time.DateTimeFieldType;
 import org.joda.time.Partial;
+import org.joda.time.format.DateTimeFormat;
+import org.joda.time.format.DateTimeFormatter;
 import org.springframework.stereotype.Component;
 
 import java.util.*;
@@ -70,36 +77,79 @@ public class IAPTExcelImport<CONFIG extends IAPTImportConfigurator> extends Simp
     private  static List<String> expectedKeys= Arrays.asList(new String[]{
             REGISTRATIONNO_PK, HIGHERTAXON, FULLNAME, AUTHORSSPELLING, LITSTRING, REGISTRATION, TYPE, CAVEATS, FULLBASIONYM, FULLSYNSUBST, NOTESTXT, REGDATE, NAMESTRING, BASIONYMSTRING, SYNSUBSTSTR, AUTHORSTRING});
 
-    private static final Pattern nomRefTokenizeP = Pattern.compile("^(.*):\\s([^\\.:]+)\\.(.*?)\\.?$");
-    private static final Pattern[] nomRefPubDatePs = new Pattern[]{
+    private static final Pattern nomRefTokenizeP = Pattern.compile("^(?<title>.*):\\s(?<detail>[^\\.:]+)\\.(?<date>.*?)(?:\\s\\((?<issue>[^\\)]*)\\)\\s*)?\\.?$");
+    private static final Pattern[] datePatterns = new Pattern[]{
+            // NOTE:
+            // The order of the patterns is extremely important!!!
+            // Roman numeral months are captured as 'monthName' so that they are resolved through monthFromNameMap
             // all patterns cover the years 1700 - 1999
             Pattern.compile("^(?<year>1[7,8,9][0-9]{2})$"), // only year, like '1969'
-            Pattern.compile("^(?<day>[0-9]{1,2})([\\./])(?<month>[0-1]?[0-9])\\2(?<year>(?:1[7,8,9])?[0-9]{2})$"), // full date like 12.04.1969 or 12/04/1969
-            Pattern.compile("^(?:(?<day>[0-9]{1,2})[\\./]?\\s)?(?<monthName>[\\S\\D]+)\\s(?<year>(?:1[7,8,9])?[0-9]{2})$") // full date like 12. April 1969 or april 1999 or April 99
+            Pattern.compile("^(?<monthName>\\p{L}+\\.?)\\s(?<day>[0-9]{1,2})(?:st|rd|th)?\\.?,?\\s(?<year>(?:1[7,8,9])?[0-9]{2})$"), // full date like April 12, 1969 or april 12th 1999
+            Pattern.compile("^(?<monthName>\\p{L}+\\.?),?\\s?(?<year>(?:1[7,8,9])?[0-9]{2})$"), // April 99 or April, 1999 or Apr. 12
+            Pattern.compile("^(?<day>[0-9]{1,2})([\\.\\-/])(\\s?)(?<month>[0-1]?[0-9])\\2\\3(?<year>(?:1[7,8,9])?[0-9]{2})$"), // full date like 12.04.1969 or 12. 04. 1969 or 12/04/1969 or 12-04-1969
+            Pattern.compile("^(?<day>[0-9]{1,2})([\\.\\-/])(?<monthName>[IVX]{1,4})\\2(?<year>(?:1[7,8,9])?[0-9]{2})$"), // full date like 12-VI-1969 or 12-VIII-1969
+            Pattern.compile("^(?:(?<day>[0-9]{1,2})(?:\\sde)\\s)?(?<monthName>\\p{L}+)\\sde\\s(?<year>(?:1[7,8,9])?[0-9]{2})$"), // full and partial date like 12 de Enero de 1999 or Enero de 1999 (day part is optional)
+            Pattern.compile("^(?<month>[0-1]?[0-9])([\\.\\-/])(?<year>(?:1[7,8,9])?[0-9]{2})$"), // partial date like 04.1969 or 04/1969 or 04-1969
+            Pattern.compile("^(?<year>(?:1[7,8,9])?[0-9]{2})([\\.\\-/])(?<month>[0-1]?[0-9])$"),//  partial date like 1999-04
+            Pattern.compile("^(?<monthName>[IVX]{1,4})([\\.\\-/])(?<year>(?:1[7,8,9])?[0-9]{2})$"), // partial date like VI-1969 or VIII-1969
+            Pattern.compile("^(?<day>[0-9]{1,2})(?:[\\./]|th|rd|st)?\\s(?<monthName>\\p{L}+\\.?),?\\s?(?<year>(?:1[7,8,9])?[0-9]{2})$"), // full date like 12. April 1969 or april 1999 or 22 Dec.1999
         };
-    private static final Pattern typeSplitPattern =  Pattern.compile("^(?:\"*[Tt]ype: (?<type>.*?))(?:[Hh]olotype:(?<holotype>.*?))?(?:[Ii]sotype[^:]*:(?<isotype>.*))?$");
+    private static final Pattern typeSpecimenSplitPattern =  Pattern.compile("^(?:\"*[Tt]ype: (?<fieldUnit>.*?))(?:[Hh]olotype:(?<holotype>.*?)\\.?)?(?:[Ii]sotype[^:]*:(?<isotype>.*)\\.?)?\\.?$");
+
+    private static final Pattern typeNameBasionymPattern =  Pattern.compile("\\([Bb]asionym\\s?\\:\\s?(?<basionymName>[^\\)]*).*$");
+    private static final Pattern typeNameNotePattern =  Pattern.compile("\\[([^\\[]*)"); // matches the inner of '[...]'
+    private static final Pattern typeNameSpecialSplitPattern =  Pattern.compile("(?<note>.*\\;.*?)\\:(?<agent>)\\;(?<name>.*)"); // groups: note, agent, name; NOTE(review): the 'agent' group is empty, so ':' must be directly followed by ';' - confirm intent
+
+    private static final Pattern collectorPattern =  Pattern.compile(".*?(?<fullStr1>\\(leg\\.\\s+(?<data1>[^\\)]*)\\))|.*?(?<fullStr2>\\sleg\\.\\s+(?<data2>.*?)\\.?)$");
+    private static final Pattern collectionDataPattern =  Pattern.compile("^(?<collector>[^,]*),\\s?(?<detail>.*?)\\.?$");
+    private static final Pattern collectorsNumber =  Pattern.compile("^([nN]o\\.\\s.*)$");
+
+    // AccessionNumbers: , #.*, n°:?, 96/3293, No..*, -?\w{1,3}-[0-9\-/]*
+    private static final Pattern accessionNumberOnlyPattern = Pattern.compile("^(?<accNumber>(?:n°\\:?\\s?|#|No\\.?\\s?)?[\\d\\w\\-/]*)$");
+
+    private static final Pattern[] specimenTypePatterns = new Pattern[]{
+            Pattern.compile("^(?<colCode>[A-Z]+|CPC Micropaleontology Lab\\.?)\\s+(?:\\((?<institute>.*[^\\)])\\))(?<accNumber>.*)?$"), // like: GAUF (Gansu Agricultural University) No. 1207-1222
+            Pattern.compile("^(?<colCode>[A-Z]+|CPC Micropaleontology Lab\\.?)\\s+(?:Coll\\.\\s(?<subCollection>[^\\.,;]*)(.))(?<accNumber>.*)?$"), // like KASSEL Coll. Krasske, Praep. DII 78
+            Pattern.compile("^(?:Coll\\.\\s(?<subCollection>[^\\.,;]*)(.))(?<institute>.*?)(?<accNumber>Praep\\..*)?$"), // like Coll. Lange-Bertalot, Bot. Inst., Univ. Frankfurt/Main, Germany Praep. Neukaledonien OTL 62
+            Pattern.compile("^(?<colCode>[A-Z]+)(?:\\s+(?<accNumber>.*))?$"), // identifies the Collection code and takes the rest as accessionNumber if any
+    };
 
     private static Map<String, Integer> monthFromNameMap = new HashMap<>();
+
     static {
         String[] ck = new String[]{"leden", "únor", "březen", "duben", "květen", "červen", "červenec ", "srpen", "září", "říjen", "listopad", "prosinec"};
         String[] fr = new String[]{"janvier", "février", "mars", "avril", "mai", "juin", "juillet", "août", "septembre", "octobre", "novembre", "décembre"};
         String[] de = new String[]{"januar", "februar", "märz", "april", "mai", "juni", "juli", "august", "september", "oktober", "november", "dezember"};
         String[] en = new String[]{"january", "february", "march", "april", "may", "june", "july", "august", "september", "october", "november", "december"};
+        String[] it = new String[]{"gennaio", "febbraio", "marzo", "aprile", "maggio", "giugno", "luglio", "agosto", "settembre", "ottobre", "novembre", "dicembre"};
+        String[] sp = new String[]{"enero", "febrero", "marzo", "abril", "mayo", "junio", "julio", "agosto", "septiembre", "octubre", "noviembre", "diciembre"};
+        String[] de_abbrev = new String[]{"jan.", "feb.", "märz", "apr.", "mai", "jun.", "jul.", "aug.", "sept.", "okt.", "nov.", "dez."};
+        String[] en_abbrev = new String[]{"jan.", "feb.", "mar.", "apr.", "may", "jun.", "jul.", "aug.", "sep.", "oct.", "nov.", "dec."};
+        String[] port = new String[]{"Janeiro", "Fevereiro", "Março", "Abril", "Maio", "Junho", "Julho", "Agosto", "Setembro", "Outubro", "Novembro", "Dezembro"};
+        String[] rom_num = new String[]{"i", "ii", "iii", "iv", "v", "vi", "vii", "viii", "ix", "x", "xi", "xii"};
 
-        String[][] perLang =  new String[][]{ck, de, fr, en};
+        String[][] perLang =  new String[][]{ck, de, fr, en, it, sp, port, de_abbrev, en_abbrev, rom_num};
 
         for (String[] months: perLang) {
             for(int m = 1; m < 13; m++){
-                monthFromNameMap.put(months[m - 1], m);
+                monthFromNameMap.put(months[m - 1].toLowerCase(), m);
             }
         }
 
         // special cases
-        monthFromNameMap.put("Mar", 3);
+        monthFromNameMap.put("mar", 3);
+        monthFromNameMap.put("dec", 12);
+        monthFromNameMap.put("Februari", 2);
     }
 
+
+    DateTimeFormatter formatterYear = DateTimeFormat.forPattern("yyyy");
+
+    private Map<String, Collection> collectionMap = new HashMap<>();
+
+
     enum TypesName {
-        type, holotype, isotype;
+        fieldUnit, holotype, isotype;
 
         public SpecimenTypeDesignationStatus status(){
             switch (this) {
@@ -118,11 +168,13 @@ public class IAPTExcelImport<CONFIG extends IAPTImportConfigurator> extends Simp
     private Rank familyIncertisSedis = null;
     private AnnotationType annotationTypeCaveats = null;
 
+    private Reference bookVariedadesTradicionales = null;
+
     private Taxon makeTaxon(HashMap<String, String> record, SimpleExcelTaxonImportState<CONFIG> state,
                             TaxonNode higherTaxonNode, boolean isFossil) {
 
-        String line = state.getCurrentLine() + ": ";
-
+        String regNumber = getValue(record, REGISTRATIONNO_PK, false);
+        String regStr = getValue(record, REGISTRATION, true);
         String titleCacheStr = getValue(record, FULLNAME, true);
         String nameStr = getValue(record, NAMESTRING, true);
         String authorStr = getValue(record, AUTHORSTRING, true);
@@ -131,89 +183,76 @@ public class IAPTExcelImport<CONFIG extends IAPTImportConfigurator> extends Simp
         String notesTxt = getValue(record, NOTESTXT, true);
         String caveats = getValue(record, CAVEATS, true);
         String fullSynSubstStr = getValue(record, FULLSYNSUBST, true);
+        String fullBasionymStr = getValue(record, FULLBASIONYM, true);
+        String basionymNameStr = getValue(record, FULLBASIONYM, true);
         String synSubstStr = getValue(record, SYNSUBSTSTR, true);
         String typeStr = getValue(record, TYPE, true);
 
+
         String nomRefTitle = null;
-        String nomRefDetail = null;
+        String nomRefDetail;
         String nomRefPupDate = null;
-        String nomRefPupDay = null;
-        String nomRefPupMonth = null;
-        String nomRefPupMonthName = null;
-        String nomRefPupYear = null;
+        String nomRefIssue = null;
+        Partial pupDate = null;
+
+        boolean restoreOriginalReference = false;
 
         // preprocess nomRef: separate citation, reference detail, publishing date
         if(!StringUtils.isEmpty(nomRefStr)){
             nomRefStr = nomRefStr.trim();
-            Matcher m = nomRefTokenizeP.matcher(nomRefStr);
-            if(m.matches()){
-                nomRefTitle = m.group(1);
-                nomRefDetail = m.group(2);
-                nomRefPupDate = m.group(3).trim();
-
-                // nomRefDetail.replaceAll("[\\:\\.\\s]", ""); // TODO integrate into nomRefTokenizeP
-                for(Pattern p : nomRefPubDatePs){
-                    Matcher m2 = p.matcher(nomRefPupDate);
-                    if(m2.matches()){
-                        try {
-                            nomRefPupYear = m2.group("year");
-                        } catch (IllegalArgumentException e){
-                            // named capture group not found
-                        }
-                        try {
-                            nomRefPupMonth = m2.group("month");
-                        } catch (IllegalArgumentException e){
-                            // named capture group not found
-                        }
-                        try {
-                            nomRefPupMonthName = m2.group("monthName");
-                            nomRefPupMonth = monthFromName(nomRefPupMonthName);
-                        } catch (IllegalArgumentException e){
-                            // named capture group not found
-                        }
-                        try {
-                            nomRefPupDay = m2.group("day");
-                        } catch (IllegalArgumentException e){
-                            // named capture group not found
-                        }
 
-                        if(nomRefPupYear == null){
-                            logger.error("nomRefPupYear in " + nomRefStr + " is  NULL" );
-                        }
-                        if(nomRefPupYear.length() == 2 ){
-                            // it is an abbreviated year from the 19** years
-                            nomRefPupYear = "19" + nomRefPupYear;
-                        }
-                        nomRefTitle = nomRefTitle + ": " + nomRefDetail + ". " + nomRefPupYear + ".";
-                        break;
-                    }
-                }
-                if(nomRefPupYear == null){
-                    logger.warn("Pub year not found in " + nomRefPupDate + " from " + nomRefStr );
-                    // FIXME in in J. Eur. Orchideen 30: 128. 30.09.97 (Vorabdr.).
-                }
-                List<DateTimeFieldType> types = new ArrayList<>();
-                List<Integer> values = new ArrayList<>();
-                if(nomRefPupYear != null){
-                    types.add(DateTimeFieldType.year());
-                    values.add(Integer.parseInt(nomRefPupYear));
-                }
-                if(nomRefPupMonth != null){
-                    types.add(DateTimeFieldType.monthOfYear());
-                    values.add(Integer.parseInt(nomRefPupMonth));
-                }
-                if(nomRefPupDay != null){
-                    types.add(DateTimeFieldType.dayOfMonth());
-                    values.add(Integer.parseInt(nomRefPupDay));
+            // handle the special case which is hard to parse:
+            //
+            // Las variedades tradicionales de frutales de la Cuenca del Río Segura. Catálogo Etnobotánico (1): Frutos secos, oleaginosos, frutales de hueso, almendros y frutales de pepita: 154. 1997.
+            if(nomRefStr.startsWith("Las variedades tradicionales de frutales ")){
+
+                if(bookVariedadesTradicionales == null){
+                    bookVariedadesTradicionales = ReferenceFactory.newBook();
+                    bookVariedadesTradicionales.setTitle("Las variedades tradicionales de frutales de la Cuenca del Río Segura. Catálogo Etnobotánico (1): Frutos secos, oleaginosos, frutales de hueso, almendros y frutales de pepita");
+                    bookVariedadesTradicionales.setDatePublished(TimePeriod.NewInstance(1997));
+                    getReferenceService().save(bookVariedadesTradicionales);
                 }
-                Partial pupDate = new Partial(types.toArray(new DateTimeFieldType[types.size()]), ArrayUtils.toPrimitive(values.toArray(new Integer[values.size()])));
+                nomRefStr = nomRefStr.replaceAll("^.*?\\:.*?\\:", "Las variedades tradicionales:");
+                restoreOriginalReference = true;
+            }
 
+            Matcher m = nomRefTokenizeP.matcher(nomRefStr);
+            if(m.matches()){
+                nomRefTitle = m.group("title");
+                nomRefDetail = m.group("detail");
+                nomRefPupDate = m.group("date").trim();
+                nomRefIssue = m.group("issue");
+
+                pupDate = parseDate(regNumber, nomRefPupDate);
+                if (pupDate != null) {
+                    nomRefTitle = nomRefTitle + ": " + nomRefDetail + ". " + pupDate.toString(formatterYear) + ".";
+                } else {
+                    logger.warn(csvReportLine(regNumber, "Pub date", nomRefPupDate, "in", nomRefStr, "not parsable"));
+                }
             } else {
                 nomRefTitle = nomRefStr;
             }
         }
 
-        BotanicalName taxonName = makeBotanicalName(state, titleCacheStr, nameStr, authorStr, nomRefTitle);
+        BotanicalName taxonName = makeBotanicalName(state, regNumber, titleCacheStr, nameStr, authorStr, nomRefTitle);
+
+        // always add the original strings of parsed data as annotation
+        taxonName.addAnnotation(Annotation.NewInstance("imported and parsed data strings:" +
+                        "\n -  '" + LITSTRING + "': "+ nomRefStr +
+                        "\n -  '" + TYPE + "': " + typeStr +
+                        "\n -  '" + REGISTRATION  + "': " + regStr
+                , AnnotationType.TECHNICAL(), Language.DEFAULT()));
+
+        if(restoreOriginalReference){
+            taxonName.setNomenclaturalReference(bookVariedadesTradicionales);
+        }
+        if(pupDate != null) {
+            taxonName.getNomenclaturalReference().setDatePublished(TimePeriod.NewInstance(pupDate));
+        }
+        if(nomRefIssue != null) {
+            ((Reference)taxonName.getNomenclaturalReference()).setVolume(nomRefIssue);
+        }
+
 
         if(!StringUtils.isEmpty(notesTxt)){
             notesTxt = notesTxt.replace("Notes: ", "").trim();
@@ -223,7 +262,8 @@ public class IAPTExcelImport<CONFIG extends IAPTImportConfigurator> extends Simp
             caveats = caveats.replace("Caveats: ", "").trim();
             taxonName.addAnnotation(Annotation.NewInstance(caveats, annotationTypeCaveats(), Language.DEFAULT()));
         }
-        //
+
+        getNameService().save(taxonName);
 
         // Namerelations
         if(!StringUtils.isEmpty(authorsSpelling)){
@@ -254,7 +294,7 @@ public class IAPTExcelImport<CONFIG extends IAPTImportConfigurator> extends Simp
         // Replaced Synonyms
         if(!StringUtils.isEmpty(fullSynSubstStr)){
             fullSynSubstStr = fullSynSubstStr.replace("Syn. subst.: ", "");
-            BotanicalName replacedSynonymName = makeBotanicalName(state, fullSynSubstStr, synSubstStr, null, null);
+            BotanicalName replacedSynonymName = makeBotanicalName(state, regNumber, fullSynSubstStr, synSubstStr, null, null);
             replacedSynonymName.addReplacedSynonym(taxonName, null, null, null);
             getNameService().save(replacedSynonymName);
         }
@@ -262,6 +302,19 @@ public class IAPTExcelImport<CONFIG extends IAPTImportConfigurator> extends Simp
         Reference sec = state.getConfig().getSecReference();
         Taxon taxon = Taxon.NewInstance(taxonName, sec);
 
+        // Basionym
+        if(fullBasionymStr != null){
+            fullBasionymStr = fullBasionymStr.replaceAll("^\\w*:\\s", ""); // Strip off the leading 'Basionym: "
+            basionymNameStr = basionymNameStr.replaceAll("^\\w*:\\s", ""); // Strip off the leading 'Basionym: "
+            BotanicalName basionym = makeBotanicalName(state, regNumber, fullBasionymStr, basionymNameStr, null, null);
+            getNameService().save(basionym);
+            taxonName.addBasionym(basionym);
+
+            Synonym syn = Synonym.NewInstance(basionym, sec);
+            taxon.addSynonym(syn, SynonymRelationshipType.HOMOTYPIC_SYNONYM_OF());
+            getTaxonService().save(syn);
+        }
+
         // Markers
         if(isFossil){
             taxon.addMarker(Marker.NewInstance(markerTypeFossil(), true));
@@ -269,28 +322,12 @@ public class IAPTExcelImport<CONFIG extends IAPTImportConfigurator> extends Simp
 
         // Types
         if(!StringUtils.isEmpty(typeStr)){
-            Matcher m = typeSplitPattern.matcher(typeStr);
 
-            if(m.matches()){
-                String typeString = m.group(TypesName.type.name());
-                boolean isFieldUnit = typeStr.matches(".*([°']|\\d+\\s?m\\s|\\d+\\s?km\\s).*"); // check for location or unit m, km
-
-                if(isFieldUnit) {
-                    // type as fieldUnit
-                    FieldUnit fu = FieldUnit.NewInstance();
-                    fu.setTitleCache(typeString, true);
-                    getOccurrenceService().save(fu);
-
-                    // all others ..
-                    addSpecimenTypes(taxonName, fu, m.group(TypesName.holotype.name()), TypesName.holotype, false);
-                    addSpecimenTypes(taxonName, fu, m.group(TypesName.isotype.name()), TypesName.isotype, true);
-                } else {
-                    TaxonNameBase typeName = nameParser.parseFullName(typeString);
-                    taxonName.addNameTypeDesignation(typeName, null, null, null, NameTypeDesignationStatus.AUTOMATIC(), true, true, true, true);
-                }
+            if(taxonName.getRank().isSpecies() || taxonName.getRank().isLower(Rank.SPECIES())) {
+                makeSpecimenTypeData(typeStr, taxonName, regNumber, state);
+            } else {
+                makeNameTypeData(typeStr, taxonName, regNumber, state);
             }
-            getNameService().save(taxonName);
-
         }
 
         getTaxonService().save(taxon);
@@ -300,14 +337,275 @@ public class IAPTExcelImport<CONFIG extends IAPTImportConfigurator> extends Simp
         }
 
         return taxon;
+    }
+
+    private void makeSpecimenTypeData(String typeStr, BotanicalName taxonName, String regNumber, SimpleExcelTaxonImportState<CONFIG> state) {
+        // Splits typeStr into its field unit, holotype and isotype parts and adds the specimen types to taxonName.
+        Matcher m = typeSpecimenSplitPattern.matcher(typeStr);
+
+        if(m.matches()){
+            String fieldUnitStr = m.group(TypesName.fieldUnit.name());
+            // boolean isFieldUnit = typeStr.matches(".*([°']|\\d+\\s?m\\s|\\d+\\s?km\\s).*"); // check for location or unit m, km // makes no sense!!!!
+            FieldUnit fieldUnit = parseFieldUnit(fieldUnitStr, regNumber, state);
+            if(fieldUnit == null) {
+                // create a field unit with only a titleCache using the fieldUnitStr substring
+                logger.warn(csvReportLine(regNumber, "Type: fieldUnitStr can not be parsed", fieldUnitStr));
+                fieldUnit = FieldUnit.NewInstance();
+                fieldUnit.setTitleCache(fieldUnitStr, true);
+                getOccurrenceService().save(fieldUnit);
+            }
+            getOccurrenceService().save(fieldUnit);
+
+            // add the holotype (single entry) and the isotypes (comma separated entries) for this field unit
+            addSpecimenTypes(taxonName, fieldUnit, m.group(TypesName.holotype.name()), TypesName.holotype, false, regNumber);
+            addSpecimenTypes(taxonName, fieldUnit, m.group(TypesName.isotype.name()), TypesName.isotype, true, regNumber);
+
+        } else {
+            // create a field unit with only a titleCache using the full typeStr; NOTE(review): it is saved but not linked to taxonName here
+            FieldUnit fieldUnit = FieldUnit.NewInstance();
+            fieldUnit.setTitleCache(typeStr, true);
+            getOccurrenceService().save(fieldUnit);
+            logger.warn(csvReportLine(regNumber, "Type: field 'Type' can not be parsed", typeStr));
+        }
+        getNameService().save(taxonName);
+    }
+
+    private void makeNameTypeData(String typeStr, BotanicalName taxonName, String regNumber, SimpleExcelTaxonImportState<CONFIG> state) {
+        // Parses the type name from typeStr (leading 'Type:' removed) and adds it to taxonName as name type designation.
+        String nameStr = typeStr.replaceAll("^Type\\s?\\:\\s?", "");
+        if(nameStr.isEmpty()) {
+            return;
+        }
+        // A '(Basionym: ...)' part becomes the basionym of the type name, the text following a '[' becomes a note.
+        String basionymNameStr = null;
+        String noteStr = null;
+        String agentStr = null;
+
+        Matcher m;
+
+        if(nameStr.startsWith("not to be indicated")){
+            // Special case (tested on nameStr because typeStr still starts with 'Type:'):
+            // Type: not to be indicated (Art. H.9.1. Tokyo Code); stated parent genera: Hechtia Klotzsch; Deuterocohnia Mez
+            // FIXME
+            m = typeNameSpecialSplitPattern.matcher(nameStr);
+            if(m.matches()){
+                nameStr = m.group("name");
+                noteStr = m.group("note");
+                agentStr = m.group("agent");
+                // TODO better import of agent?
+                if(agentStr != null){
+                    noteStr = noteStr + ": " + agentStr;
+                }
+            }
+        } else {
+            // Generic case
+            m = typeNameBasionymPattern.matcher(nameStr);
+            if (m.find()) {
+                basionymNameStr = m.group("basionymName");
+                if (basionymNameStr != null) {
+                    nameStr = nameStr.replace(m.group(0), "");
+                }
+            }
+
+            m = typeNameNotePattern.matcher(nameStr);
+            if (m.find()) {
+                noteStr = m.group(1);
+                if (noteStr != null) {
+                    nameStr = nameStr.replace(m.group(0), "");
+                }
+            }
+        }
+
+        BotanicalName typeName = (BotanicalName) nameParser.parseFullName(nameStr, NomenclaturalCode.ICNAFP, null);
+
+        if(typeName.isProtectedTitleCache() || typeName.getNomenclaturalReference() != null && typeName.getNomenclaturalReference().isProtectedTitleCache()) {
+            logger.warn(csvReportLine(regNumber, "NameType not parsable", typeStr, nameStr));
+        }
+        // the basionym is parsed from its own string; parsing nameStr again would only duplicate the type name
+        if(basionymNameStr != null){
+            BotanicalName basionymName = (BotanicalName) nameParser.parseFullName(basionymNameStr, NomenclaturalCode.ICNAFP, null);
+            getNameService().save(basionymName);
+            typeName.addBasionym(basionymName);
+        }
+
+        // The type name is saved first. The designation returned by addNameTypeDesignation() is the one that is
+        // attached to taxonName, so the note is annotated on it instead of on a detached instance.
+        getNameService().save(typeName);
+        NameTypeDesignation nameTypeDesignation = taxonName.addNameTypeDesignation(typeName, null, null, null, null, false);
+
+        if(noteStr != null){
+            nameTypeDesignation.addAnnotation(Annotation.NewInstance(noteStr, AnnotationType.EDITORIAL(), Language.UNKNOWN_LANGUAGE()));
+        }
+
+    }
+
+    /**
+     * Parses the collector, fieldNumber and the collection date from the 'leg.' clause of the fieldUnitStr.
+     * The text outside of that clause is stored as locality of the gathering event.
+     * @param fieldUnitStr the field unit part of the type information
+     * @param regNumber the registration number, handed over to parseDate()
+     * @param state the import state, used to look up and to cache the collector agent
+     * @return the saved FieldUnit or null if the fieldUnitStr could not be parsed
+     */
+    private FieldUnit parseFieldUnit(String fieldUnitStr, String regNumber, SimpleExcelTaxonImportState<CONFIG> state) {
+
+        FieldUnit fieldUnit = null;
+
+        Matcher m1 = collectorPattern.matcher(fieldUnitStr);
+        if(m1.matches()){
+
+            String collectorData = m1.group(2); // like (leg. Metzeltin, 30. 9. 1996)
+            String removal = m1.group(1);
+            if(collectorData == null){
+                collectorData = m1.group(4); // like leg. Metzeltin, 30. 9. 1996
+                removal = m1.group(3);
+            }
+            if(collectorData == null){
+                return null;
+            }
+
+            // the fieldUnitStr is parsable
+            // remove all collectorData from the fieldUnitStr and use the rest as locality
+            String locality = fieldUnitStr.replace(removal, "");
+
+            String collectorStr = null;
+            String detailStr = null;
+            Partial date = null;
+            String fieldNumber = null;
+
+            Matcher m2 = collectionDataPattern.matcher(collectorData);
+            if(m2.matches()){
+                collectorStr = m2.group("collector");
+                detailStr = m2.group("detail");
+
+                // Try to make sense of the detailStr
+                if(detailStr != null){
+                    detailStr = detailStr.trim();
+                    // 1. try to parse as date
+                    date = parseDate(regNumber, detailStr);
+                    if(date == null){
+                        // 2. try to parse as number
+                        if(collectorsNumber.matcher(detailStr).matches()){
+                            fieldNumber = detailStr;
+                        }
+                    }
+                }
+                if(date == null && fieldNumber == null){
+                    // detailed parsing not possible, so fall back to using the whole collectorData as collector
+                    collectorStr = collectorData;
+                }
+            }
+
+            if(collectorStr == null) {
+                collectorStr = collectorData;
+            }
+
+            fieldUnit = FieldUnit.NewInstance();
+            GatheringEvent ge = GatheringEvent.NewInstance();
+            ge.setLocality(LanguageString.NewInstance(locality, Language.UNKNOWN_LANGUAGE()));
+            // reuse the agent registered in the state for this collector string, otherwise create and register a new Person
+            TeamOrPersonBase agent =  state.getAgentBase(collectorStr);
+            if(agent == null) {
+                agent = Person.NewTitledInstance(collectorStr);
+                getAgentService().save(agent);
+                state.putAgentBase(collectorStr, agent);
+            }
+            ge.setCollector(agent);
+
+            if(date != null){
+                ge.setGatheringDate(date);
+            }
+
+            getEventBaseService().save(ge);
+            fieldUnit.setGatheringEvent(ge);
+
+            if(fieldNumber != null) {
+                fieldUnit.setFieldNumber(fieldNumber);
+            }
+            getOccurrenceService().save(fieldUnit);
+
+        }
+
+        return fieldUnit;
+    }
+
+    private Partial parseDate(String regNumber, String dateStr) {
+        // Tries the datePatterns in order and returns the first match as a Partial (year, optional month
+        // and day). Returns null when nothing matches, the month name is unknown or the values are invalid.
+        boolean parseError = false;
+        String day = null;
+        String month = null;
+        String monthName = null;
+        String year = null;
+
+        for(Pattern p : datePatterns){
+            Matcher m2 = p.matcher(dateStr);
+            if(m2.matches()){
+                try {
+                    year = m2.group("year");
+                } catch (IllegalArgumentException e){
+                    // named capture group not found
+                }
+                try {
+                    month = m2.group("month");
+                } catch (IllegalArgumentException e){
+                    // named capture group not found
+                }
+                try {
+                    monthName = m2.group("monthName");
+                    month = monthFromName(monthName, regNumber);
+                    if(month == null){
+                        parseError = true;
+                    }
+                } catch (IllegalArgumentException e){
+                    // named capture group not found
+                }
+                try {
+                    day = m2.group("day");
+                } catch (IllegalArgumentException e){
+                    // named capture group not found
+                }
 
+                if(year != null){
+                    if (year.length() == 2) {
+                        // it is an abbreviated year from the 19** years
+                        year = "19" + year;
+                    }
+                    break;
+                } else {
+                    parseError = true;
+                }
+            }
+        }
+        if(year == null || parseError){
+            return null;
+        }
+        List<DateTimeFieldType> types = new ArrayList<>();
+        List<Integer> values = new ArrayList<>();
+        try {
+            types.add(DateTimeFieldType.year());
+            values.add(Integer.parseInt(year));
+            if (month != null) {
+                types.add(DateTimeFieldType.monthOfYear());
+                values.add(Integer.parseInt(month));
+            }
+            if (day != null) {
+                types.add(DateTimeFieldType.dayOfMonth());
+                values.add(Integer.parseInt(day));
+            }
+            return new Partial(types.toArray(new DateTimeFieldType[types.size()]), ArrayUtils.toPrimitive(values.toArray(new Integer[values.size()])));
+        } catch (IllegalArgumentException e){
+            // non numeric or out of range values (e.g. month 13 or day 45) are treated as not parsable instead of aborting the import
+            return null;
+        }
     }
 
-    private String monthFromName(String monthName) {
+    private String monthFromName(String monthName, String regNumber) {
 
         Integer month = monthFromNameMap.get(monthName.toLowerCase());
         if(month == null){
-            logger.warn("Unknown month: " + monthName);
+            logger.warn(csvReportLine(regNumber, "Unknown month name", monthName));
             return null;
         } else {
             return month.toString();
@@ -315,46 +613,185 @@ public class IAPTExcelImport<CONFIG extends IAPTImportConfigurator> extends Simp
     }
 
 
-    private void addSpecimenTypes(BotanicalName taxonName, FieldUnit fieldUnit, String typeStr, TypesName typeName, boolean multiple){
+    private void addSpecimenTypes(BotanicalName taxonName, FieldUnit fieldUnit, String typeStr, TypesName typeName, boolean multiple, String regNumber){
+        // Parses typeStr into specimens and adds each of them to taxonName as type designation with typeName.status().
         if(StringUtils.isEmpty(typeStr)){
             return;
         }
         typeStr = typeStr.trim().replaceAll("\\.$", "");
 
-        List<String> typeData = new ArrayList<>();
+        Collection collection = null;
+        DerivedUnit specimen = null;
+
+        List<DerivedUnit> specimens = new ArrayList<>();
         if(multiple){
             String[] tokens = typeStr.split("\\s?,\\s?");
             for (String t : tokens) {
+                // command to list all complex parsable types:
+                // csvcut -t -c RegistrationNo_Pk,Type iapt.csv | csvgrep -c Type -m "Holotype" | egrep -o 'Holotype:\s([A-Z]*\s)[^.]*?'
+                // csvcut -t -c RegistrationNo_Pk,Type iapt.csv | csvgrep -c Type -m "Holotype" | egrep -o 'Isotype[^:]*:\s([A-Z]*\s)[^.]*?'
+
                 if(!t.isEmpty()){
-                    typeData.add(t.trim());
+                    // trying to parse the string
+                    specimen = parseSpecimenType(fieldUnit, typeName, collection, t, regNumber);
+                    if(specimen != null){
+                        specimens.add(specimen);
+                        // remember the collection: a following entry may consist of an accession number only
+                        collection = specimen.getCollection();
+                    } else {
+                        // parsing was not successful: make a simple specimen
+                        specimens.add(makeSpecimenType(fieldUnit, t));
+                    }
                 }
             }
         } else {
-            typeData.add(typeStr.trim());
+            specimen = parseSpecimenType(fieldUnit, typeName, collection, typeStr, regNumber);
+            if(specimen != null) {
+                specimens.add(specimen);
+            } else {
+                // parsing was not successful: make a simple specimen
+                specimens.add(makeSpecimenType(fieldUnit, typeStr));
+            }
         }
 
-        for(String type : typeData){
-            DerivedUnitFacade facade = DerivedUnitFacade.NewInstance(SpecimenOrObservationType.OtherSpecimen, fieldUnit);
-            facade.setTitleCache(type, true);
-            DerivedUnit specimen = facade.innerDerivedUnit();
-            taxonName.addSpecimenTypeDesignation(specimen, typeName.status(), null, null, null, false, true);
+        for(DerivedUnit s : specimens){
+            taxonName.addSpecimenTypeDesignation(s, typeName.status(), null, null, null, false, true);
        }
     }
 
-    private BotanicalName makeBotanicalName(SimpleExcelTaxonImportState<CONFIG> state, String titleCacheStr, String nameStr,
+    private DerivedUnit makeSpecimenType(FieldUnit fieldUnit, String titleCache) {
+        // simple specimen for the field unit: the trimmed text is set as its title cache
+        DerivedUnitFacade unitFacade = DerivedUnitFacade.NewInstance(SpecimenOrObservationType.PreservedSpecimen, fieldUnit);
+        unitFacade.setTitleCache(titleCache.trim(), true);
+        return unitFacade.innerDerivedUnit();
+    }
+
+    /**
+     * Tries to turn a single type citation into a specimen having a collection and, if the
+     * text contains one, an accession number. Two strategies are tried in this order:
+     * <ol>
+     * <li>Isotypes only: if a collection is passed in and the whole text matches the
+     *     accessionNumberOnlyPattern, the text is taken as accession number in that collection.</li>
+     * <li>The specimenTypePatterns are tried one after the other. The first pattern which
+     *     matches and yields a collection code or an institute is used.</li>
+     * </ol>
+     * Unparsable texts and unusual accession numbers are logged as warnings.
+     *
+     * @param fieldUnit the field unit passed on to the created specimen
+     * @param typeName the kind of type; strategy 1 is only applied to isotypes
+     * @param collection the collection to be used for a bare accession number, may be null
+     * @param text the type citation, it is trimmed before matching
+     * @param regNumber the registration number, only used in the log messages
+     * @return the specimen or null if the text could not be parsed
+     */
+    private DerivedUnit parseSpecimenType(FieldUnit fieldUnit, TypesName typeName, Collection collection, String text, String regNumber) {
+        DerivedUnit specimen = null;
+
+        String collectionCode = null;
+        String subCollectionStr = null;
+        String instituteStr = null;
+        String accessionNumber = null;
+
+        boolean unusualAccessionNumber = false;
+
+        text = text.trim();
+
+        // 1.  For Isotypes often the accession number is noted alone if the
+        //     preceding entry has a collection code.
+        if(typeName.equals(TypesName.isotype) && collection != null){
+            Matcher m = accessionNumberOnlyPattern.matcher(text);
+            if(m.matches()){
+                try {
+                    accessionNumber = m.group("accNumber");
+                    specimen = makeSpecimenType(fieldUnit, collection, accessionNumber);
+                } catch (IllegalArgumentException e){
+                    // the pattern has no group accNumber: continue with strategy 2
+                }
+            }
+        }
+
+        //2. try it the 'normal' way
+        if(specimen == null) {
+            for (Pattern p : specimenTypePatterns) {
+                Matcher m = p.matcher(text);
+                if (m.matches()) {
+                    // All values are assigned anew for every matching pattern, so that nothing
+                    // captured by a pattern which is skipped below leaks into a later result.
+                    collectionCode = namedGroupOrNull(m, "colCode");
+                    subCollectionStr = namedGroupOrNull(m, "subCollection");
+                    instituteStr = namedGroupOrNull(m, "institute");
+                    accessionNumber = namedGroupOrNull(m, "accNumber");
+                    unusualAccessionNumber = false;
+
+                    // try to improve the accessionNumber
+                    if(accessionNumber != null) {
+                        accessionNumber = accessionNumber.trim();
+                        Matcher m2 = accessionNumberOnlyPattern.matcher(accessionNumber);
+                        // read the improved number from m2; m would only return the untrimmed original again
+                        String betterAccessionNumber = m2.matches() ? namedGroupOrNull(m2, "accNumber") : null;
+                        if (betterAccessionNumber != null) {
+                            accessionNumber = betterAccessionNumber;
+                        } else {
+                            unusualAccessionNumber = true;
+                        }
+                    }
+
+                    if(collectionCode == null && instituteStr == null){
+                        logger.warn(csvReportLine(regNumber, "Type: neither 'collectionCode' nor 'institute' found in ", text));
+                        continue;
+                    }
+                    collection = getCollection(collectionCode, instituteStr, subCollectionStr);
+                    specimen = makeSpecimenType(fieldUnit, collection, accessionNumber);
+                    break;
+                }
+            }
+        }
+        if(specimen == null) {
+            logger.warn(csvReportLine(regNumber, "Type: Could not parse specimen", typeName.name().toString(), text));
+        }
+        if(unusualAccessionNumber){
+            logger.warn(csvReportLine(regNumber, "Type: Unusual accession number", typeName.name().toString(), text, accessionNumber));
+        }
+        return specimen;
+    }
+
+    /**
+     * Reads a named capturing group of a matcher which has matched.
+     *
+     * @return the captured text; null if the group did not take part in the match or is not defined by the pattern
+     */
+    private String namedGroupOrNull(Matcher matcher, String groupName) {
+        try {
+            return matcher.group(groupName);
+        } catch (IllegalArgumentException e) {
+            return null;
+        }
+    }
+
+    private DerivedUnit makeSpecimenType(FieldUnit fieldUnit, Collection collection, String accessionNumber) {
+        // Creates a preserved specimen for the field unit in the given collection; the accession number is only set if it is not null.
+        DerivedUnitFacade facade = DerivedUnitFacade.NewInstance(SpecimenOrObservationType.PreservedSpecimen, fieldUnit);
+        facade.setCollection(collection);
+        if(accessionNumber != null){
+            facade.setAccessionNumber(accessionNumber);
+        }
+        return facade.innerDerivedUnit();
+    }
+
+    private BotanicalName makeBotanicalName(SimpleExcelTaxonImportState<CONFIG> state, String regNumber, String titleCacheStr, String nameStr,
                                             String authorStr, String nomRefTitle) {
 
         BotanicalName taxonName;// cache field for the taxonName.titleCache
         String taxonNameTitleCache = null;
         Map<String, AnnotationType> nameAnnotations = new HashMap<>();
 
-        String line = state.getCurrentLine() + ": ";
-
         // TitleCache preprocessing
         if(titleCacheStr.endsWith(ANNOTATION_MARKER_STRING) || (authorStr != null && authorStr.endsWith(ANNOTATION_MARKER_STRING))){
             nameAnnotations.put("Author abbreviation not checked.", AnnotationType.EDITORIAL());
             titleCacheStr = titleCacheStr.replace(ANNOTATION_MARKER_STRING, "").trim();
-            authorStr = authorStr.replace(ANNOTATION_MARKER_STRING, "").trim();
+            if(authorStr != null) {
+                authorStr = authorStr.replace(ANNOTATION_MARKER_STRING, "").trim();
+            }
         }
 
         // parse the full taxon name
@@ -369,7 +806,7 @@ public class IAPTExcelImport<CONFIG extends IAPTImportConfigurator> extends Simp
 
         taxonNameTitleCache = taxonName.getTitleCache().trim();
         if (taxonName.isProtectedTitleCache()) {
-            logger.warn(line + "Name could not be parsed: " + titleCacheStr);
+            logger.warn(csvReportLine(regNumber, "Name could not be parsed", titleCacheStr));
         } else {
 
             boolean doRestoreTitleCacheStr = false;
@@ -390,17 +827,17 @@ public class IAPTExcelImport<CONFIG extends IAPTImportConfigurator> extends Simp
                 titleCacheCompareStr = titleCacheCompareStr.replaceAll(" et ", " & ");
             }
             if (!taxonNameTitleCache.equals(titleCacheCompareStr)) {
-                logger.warn(line + "The generated titleCache differs from the imported string : " + taxonNameTitleCache + " <> " + titleCacheStr + " will restore original titleCacheStr");
+                logger.warn(csvReportLine(regNumber, "The generated titleCache differs from the imported string", taxonNameTitleCache, " != ", titleCacheStr, " ==> original titleCacheStr has been restored"));
                 doRestoreTitleCacheStr = true;
             }
             if (!nameCache.trim().equals(nameCompareStr)) {
-                logger.warn(line + "The parsed nameCache differs from " + NAMESTRING + " : " + nameCache + " <> " + nameCompareStr);
+                logger.warn(csvReportLine(regNumber, "The parsed nameCache differs from field '" + NAMESTRING + "'", nameCache, " != ", nameCompareStr));
             }
 
             //  Author
             //nameParser.handleAuthors(taxonName, titleCacheStr, authorStr);
             //if (!titleCacheStr.equals(taxonName.getTitleCache())) {
-            //    logger.warn(line + "titleCache has changed after setting authors, will restore original titleCacheStr");
+            //    logger.warn(regNumber + ": titleCache has changed after setting authors, will restore original titleCacheStr");
             //    doRestoreTitleCacheStr = true;
             //}
 
@@ -458,6 +895,35 @@ public class IAPTExcelImport<CONFIG extends IAPTImportConfigurator> extends Simp
         return rootNode;
     }
 
+    /**
+     * Gets the collection for code and institute from the collectionMap cache or creates, caches and saves it.
+     * A non-null subCollectionStr yields the sub collection (keyed by its own code only) below that collection.
+     */
+    private Collection getCollection(String collectionCode, String instituteStr, String subCollectionStr){
+        Collection parent = null;
+        if(subCollectionStr != null){
+            parent = getCollection(collectionCode, instituteStr, null);
+            collectionCode = subCollectionStr;
+            instituteStr = null;
+        }
+        final String cacheKey = collectionCode + "-#i:" + StringUtils.defaultString(instituteStr);
+        Collection cached = collectionMap.get(cacheKey);
+        if(cached != null) {
+            return cached;
+        }
+        Collection created = Collection.NewInstance();
+        created.setCode(collectionCode);
+        if(instituteStr != null){
+            created.setInstitute(Institution.NewNamedInstance(instituteStr));
+        }
+        if(parent != null){
+            created.setSuperCollection(parent);
+        }
+        collectionMap.put(cacheKey, created);
+        getCollectionService().save(created);
+        return created;
+    }
+
 
     /**
      * @param record
@@ -542,7 +1008,7 @@ public class IAPTExcelImport<CONFIG extends IAPTImportConfigurator> extends Simp
        @Override
     protected void firstPass(SimpleExcelTaxonImportState<CONFIG> state) {
 
-        String lineNumber = state.getCurrentLine() + ": ";
+        String lineNumber = "L#" + state.getCurrentLine() + ": ";
         logger.setLevel(Level.DEBUG);
         HashMap<String, String> record = state.getOriginalRecord();
         logger.debug(lineNumber + record.toString());
@@ -621,9 +1087,9 @@ public class IAPTExcelImport<CONFIG extends IAPTImportConfigurator> extends Simp
         } else if(name.matches("^Incertae sedis$|^No group assigned$")){
            return rankFamilyIncertisSedis();
         } else if(name.matches(".*phyta$|.*mycota$")){
-           return Rank.SECTION_BOTANY();
+           return Rank.PHYLUM();
         } else if(name.matches(".*phytina$|.*mycotina$")){
-           return Rank.SUBSECTION_BOTANY();
+           return Rank.SUBPHYLUM();
         } else if(name.matches("Gymnospermae$|.*ones$")){ // Monocotyledones, Dicotyledones
             return rankUnrankedSupraGeneric();
         } else if(name.matches(".*opsida$|.*phyceae$|.*mycetes$|.*ones$|^Musci$|^Hepaticae$")){
@@ -703,5 +1169,15 @@ public class IAPTExcelImport<CONFIG extends IAPTImportConfigurator> extends Simp
         return markerTypeFossil;
     }
 
+    /** Builds a report line: regID#regId, then message and fields as quoted CSV cells with embedded quotes doubled. */
+    private String csvReportLine(String regId, String message, String ... fields){
+        StringBuilder out = new StringBuilder("regID#").append(regId);
+        out.append(",\"").append(String.valueOf(message).replace("\"", "\"\"")).append('"');
+        for(String f : fields){
+            out.append(",\"").append(String.valueOf(f).replace("\"", "\"\"")).append('"');
+        }
+        return out.toString();
+    }
+
 
 }