ref #6026 improving specimenType parsers: better recognition of 'Coll. something...
[cdmlib-apps.git] / app-import / src / main / java / eu / etaxonomy / cdm / io / iapt / IAPTExcelImport.java
index d80fc25ae8b431beda9b40c9e5e48f69c2f06af7..80d98addc6fc13bb49402e3778e91116ea7659ca 100644 (file)
@@ -10,6 +10,7 @@
 package eu.etaxonomy.cdm.io.iapt;
 
 import eu.etaxonomy.cdm.api.facade.DerivedUnitFacade;
+import eu.etaxonomy.cdm.api.service.pager.Pager;
 import eu.etaxonomy.cdm.common.CdmUtils;
 import eu.etaxonomy.cdm.io.mexico.SimpleExcelTaxonImport;
 import eu.etaxonomy.cdm.io.mexico.SimpleExcelTaxonImportState;
@@ -21,8 +22,11 @@ import eu.etaxonomy.cdm.model.name.*;
 import eu.etaxonomy.cdm.model.occurrence.*;
 import eu.etaxonomy.cdm.model.occurrence.Collection;
 import eu.etaxonomy.cdm.model.reference.Reference;
+import eu.etaxonomy.cdm.model.reference.ReferenceFactory;
+import eu.etaxonomy.cdm.model.reference.ReferenceType;
 import eu.etaxonomy.cdm.model.taxon.*;
 import eu.etaxonomy.cdm.strategy.parser.NonViralNameParserImpl;
+import eu.etaxonomy.cdm.strategy.parser.ParserProblem;
 import org.apache.commons.lang.ArrayUtils;
 import org.apache.commons.lang.StringEscapeUtils;
 import org.apache.commons.lang.StringUtils;
@@ -74,7 +78,7 @@ public class IAPTExcelImport<CONFIG extends IAPTImportConfigurator> extends Simp
     private  static List<String> expectedKeys= Arrays.asList(new String[]{
             REGISTRATIONNO_PK, HIGHERTAXON, FULLNAME, AUTHORSSPELLING, LITSTRING, REGISTRATION, TYPE, CAVEATS, FULLBASIONYM, FULLSYNSUBST, NOTESTXT, REGDATE, NAMESTRING, BASIONYMSTRING, SYNSUBSTSTR, AUTHORSTRING});
 
-    private static final Pattern nomRefTokenizeP = Pattern.compile("^(.*):\\s([^\\.:]+)\\.(.*?)\\.?$");
+    private static final Pattern nomRefTokenizeP = Pattern.compile("^(?<title>.*):\\s(?<detail>[^\\.:]+)\\.(?<date>.*?)(?:\\s\\((?<issue>[^\\)]*)\\)\\s*)?\\.?$");
     private static final Pattern[] datePatterns = new Pattern[]{
             // NOTE:
             // The order of the patterns is extremely important!!!
@@ -84,16 +88,20 @@ public class IAPTExcelImport<CONFIG extends IAPTImportConfigurator> extends Simp
             Pattern.compile("^(?<monthName>\\p{L}+\\.?)\\s(?<day>[0-9]{1,2})(?:st|rd|th)?\\.?,?\\s(?<year>(?:1[7,8,9])?[0-9]{2})$"), // full date like April 12, 1969 or april 12th 1999
             Pattern.compile("^(?<monthName>\\p{L}+\\.?),?\\s?(?<year>(?:1[7,8,9])?[0-9]{2})$"), // April 99 or April, 1999 or Apr. 12
             Pattern.compile("^(?<day>[0-9]{1,2})([\\.\\-/])(\\s?)(?<month>[0-1]?[0-9])\\2\\3(?<year>(?:1[7,8,9])?[0-9]{2})$"), // full date like 12.04.1969 or 12. 04. 1969 or 12/04/1969 or 12-04-1969
-            Pattern.compile("^(?<day>[0-9]{1,2})([\\.\\-/])(?<month>[IVX]{1,2})\\2(?<year>(?:1[7,8,9])?[0-9]{2})$"), // full date like 12-VI-1969
-            Pattern.compile("^(?:(?<day>[0-9]{1,2})(?:\\sde)\\s)(?<monthName>\\p{L}+)\\sde\\s(?<year>(?:1[7,8,9])?[0-9]{2})$"), // full and partial date like 12 de Enero de 1999 or Enero de 1999
+            Pattern.compile("^(?<day>[0-9]{1,2})([\\.\\-/])(?<monthName>[IVX]{1,2})\\2(?<year>(?:1[7,8,9])?[0-9]{2})$"), // full date like 12-VI-1969
+            Pattern.compile("^(?:(?<day>[0-9]{1,2})(?:\\sde)\\s)?(?<monthName>\\p{L}+)\\sde\\s(?<year>(?:1[7,8,9])?[0-9]{2})$"), // full and partial date like 12 de Enero de 1999 or Enero de 1999
             Pattern.compile("^(?<month>[0-1]?[0-9])([\\.\\-/])(?<year>(?:1[7,8,9])?[0-9]{2})$"), // partial date like 04.1969 or 04/1969 or 04-1969
             Pattern.compile("^(?<year>(?:1[7,8,9])?[0-9]{2})([\\.\\-/])(?<month>[0-1]?[0-9])$"),//  partial date like 1999-04
-            Pattern.compile("^(?<month>[IVX]{1,2})([\\.\\-/])(?<year>(?:1[7,8,9])?[0-9]{2})$"), // partial date like VI-1969
+            Pattern.compile("^(?<monthName>[IVX]{1,2})([\\.\\-/])(?<year>(?:1[7,8,9])?[0-9]{2})$"), // partial date like VI-1969
             Pattern.compile("^(?<day>[0-9]{1,2})(?:[\\./]|th|rd|st)?\\s(?<monthName>\\p{L}+\\.?),?\\s?(?<year>(?:1[7,8,9])?[0-9]{2})$"), // full date like 12. April 1969 or april 1999 or 22 Dec.1999
         };
-    private static final Pattern typeSplitPattern =  Pattern.compile("^(?:\"*[Tt]ype: (?<fieldUnit>.*?))(?:[Hh]olotype:(?<holotype>.*?)\\.?)?(?:[Ii]sotype[^:]*:(?<isotype>.*)\\.?)?\\.?$");
+    private static final Pattern typeSpecimenSplitPattern =  Pattern.compile("^(?:\"*[Tt]ype: (?<fieldUnit>.*?))(?:[Hh]olotype:(?<holotype>.*?)\\.?)?(?:[Ii]sotype.*?[:\\(](?<isotype>.*)\\.?)?\\.?$");
 
-    private static final Pattern collectorPattern =  Pattern.compile(".*?\\(leg\\.\\s+([^\\)]*)\\)|.*?\\sleg\\.\\s+(.*?)\\.?$");
+    private static final Pattern typeNameBasionymPattern =  Pattern.compile("\\([Bb]asionym\\s?\\:\\s?(?<basionymName>[^\\)]*).*$");
+    private static final Pattern typeNameNotePattern =  Pattern.compile("\\[([^\\[]*)"); // matches the inner of '[...]'
+    private static final Pattern typeNameSpecialSplitPattern =  Pattern.compile("(?<note>.*\\;.*?)\\:(?<agent>)\\;(<name>.*)");
+
+    private static final Pattern collectorPattern =  Pattern.compile(".*?(?<fullStr1>\\(leg\\.\\s+(?<data1>[^\\)]*)\\))|.*?(?<fullStr2>\\sleg\\.\\s+(?<data2>.*?)\\.?)$");
     private static final Pattern collectionDataPattern =  Pattern.compile("^(?<collector>[^,]*),\\s?(?<detail>.*?)\\.?$");
     private static final Pattern collectorsNumber =  Pattern.compile("^([nN]o\\.\\s.*)$");
 
@@ -103,7 +111,8 @@ public class IAPTExcelImport<CONFIG extends IAPTImportConfigurator> extends Simp
     private static final Pattern[] specimenTypePatterns = new Pattern[]{
             Pattern.compile("^(?<colCode>[A-Z]+|CPC Micropaleontology Lab\\.?)\\s+(?:\\((?<institute>.*[^\\)])\\))(?<accNumber>.*)?$"), // like: GAUF (Gansu Agricultural University) No. 1207-1222
             Pattern.compile("^(?<colCode>[A-Z]+|CPC Micropaleontology Lab\\.?)\\s+(?:Coll\\.\\s(?<subCollection>[^\\.,;]*)(.))(?<accNumber>.*)?$"), // like KASSEL Coll. Krasske, Praep. DII 78
-            Pattern.compile("^(?:Coll\\.\\s(?<subCollection>[^\\.,;]*)(.))(?<institute>.*?)(?<accNumber>Praep\\..*)?$"), // like Coll. Lange-Bertalot, Bot. Inst., Univ. Frankfurt/Main, Germany Praep. Neukaledonien OTL 62
+            Pattern.compile("^(?:in\\s)?(?<institute>[Cc]oll\\.\\s.*?)(?:\\s+(?<accNumber>(Praep\\.|slide|No\\.|Inv\\. Nr\\.|Nr\\.).*))?$"), // like Coll. Lange-Bertalot, Bot. Inst., Univ. Frankfurt/Main, Germany Praep. Neukaledonien OTL 62
+            Pattern.compile("^(?<institute>Inst\\.\\s.*?)\\s+(?<accNumber>N\\s.*)?$"), // like Inst. Geological Sciences, Acad. Sci. Belarus, Minsk N 212 A
             Pattern.compile("^(?<colCode>[A-Z]+)(?:\\s+(?<accNumber>.*))?$"), // identifies the Collection code and takes the rest as accessionNumber if any
     };
 
@@ -161,11 +170,16 @@ public class IAPTExcelImport<CONFIG extends IAPTImportConfigurator> extends Simp
     private Rank familyIncertisSedis = null;
     private AnnotationType annotationTypeCaveats = null;
 
+    private Reference bookVariedadesTradicionales = null;
+
+    /**
+     * HACK for simple unit testing
+     */
+    boolean _testMode = System.getProperty("TEST_MODE") != null;
+
     private Taxon makeTaxon(HashMap<String, String> record, SimpleExcelTaxonImportState<CONFIG> state,
                             TaxonNode higherTaxonNode, boolean isFossil) {
 
-        String line = state.getCurrentLine() + ": ";
-
         String regNumber = getValue(record, REGISTRATIONNO_PK, false);
         String regStr = getValue(record, REGISTRATION, true);
         String titleCacheStr = getValue(record, FULLNAME, true);
@@ -185,16 +199,37 @@ public class IAPTExcelImport<CONFIG extends IAPTImportConfigurator> extends Simp
         String nomRefTitle = null;
         String nomRefDetail;
         String nomRefPupDate = null;
+        String nomRefIssue = null;
         Partial pupDate = null;
 
+        boolean restoreOriginalReference = false;
+        boolean nameIsValid = true;
+
         // preprocess nomRef: separate citation, reference detail, publishing date
         if(!StringUtils.isEmpty(nomRefStr)){
             nomRefStr = nomRefStr.trim();
+
+            // handle the special case which is hard to parse:
+            //
+            // Las variedades tradicionales de frutales de la Cuenca del Río Segura. Catálogo Etnobotánico (1): Frutos secos, oleaginosos, frutales de hueso, almendros y frutales de pepita: 154. 1997.
+            if(nomRefStr.startsWith("Las variedades tradicionales de frutales ")){
+
+                if(bookVariedadesTradicionales == null){
+                    bookVariedadesTradicionales = ReferenceFactory.newBook();
+                    bookVariedadesTradicionales.setTitle("Las variedades tradicionales de frutales de la Cuenca del Río Segura. Catálogo Etnobotánico (1): Frutos secos, oleaginosos, frutales de hueso, almendros y frutales de pepita");
+                    bookVariedadesTradicionales.setDatePublished(TimePeriod.NewInstance(1997));
+                    getReferenceService().save(bookVariedadesTradicionales);
+                }
+                nomRefStr = nomRefStr.replaceAll("^.*?\\:.*?\\:", "Las variedades tradicionales:");
+                restoreOriginalReference = true;
+            }
+
             Matcher m = nomRefTokenizeP.matcher(nomRefStr);
             if(m.matches()){
-                nomRefTitle = m.group(1);
-                nomRefDetail = m.group(2);
-                nomRefPupDate = m.group(3).trim();
+                nomRefTitle = m.group("title");
+                nomRefDetail = m.group("detail");
+                nomRefPupDate = m.group("date").trim();
+                nomRefIssue = m.group("issue");
 
                 pupDate = parseDate(regNumber, nomRefPupDate);
                 if (pupDate != null) {
@@ -216,17 +251,32 @@ public class IAPTExcelImport<CONFIG extends IAPTImportConfigurator> extends Simp
                         "\n -  '" + REGISTRATION  + "': " + regStr
                 , AnnotationType.TECHNICAL(), Language.DEFAULT()));
 
+        if(restoreOriginalReference){
+            taxonName.setNomenclaturalReference(bookVariedadesTradicionales);
+        }
         if(pupDate != null) {
             taxonName.getNomenclaturalReference().setDatePublished(TimePeriod.NewInstance(pupDate));
         }
+        if(nomRefIssue != null) {
+            ((Reference)taxonName.getNomenclaturalReference()).setVolume(nomRefIssue);
+        }
+
 
         if(!StringUtils.isEmpty(notesTxt)){
             notesTxt = notesTxt.replace("Notes: ", "").trim();
             taxonName.addAnnotation(Annotation.NewInstance(notesTxt, AnnotationType.EDITORIAL(), Language.DEFAULT()));
+            nameIsValid = false;
+
         }
         if(!StringUtils.isEmpty(caveats)){
             caveats = caveats.replace("Caveats: ", "").trim();
             taxonName.addAnnotation(Annotation.NewInstance(caveats, annotationTypeCaveats(), Language.DEFAULT()));
+            nameIsValid = false;
+        }
+
+        if(nameIsValid){
+            // Status is always considered valid if no notes and caveats are set
+            taxonName.addStatus(NomenclaturalStatus.NewInstance(NomenclaturalStatusType.VALID()));
         }
 
         getNameService().save(taxonName);
@@ -271,6 +321,7 @@ public class IAPTExcelImport<CONFIG extends IAPTImportConfigurator> extends Simp
         // Basionym
         if(fullBasionymStr != null){
             fullBasionymStr = fullBasionymStr.replaceAll("^\\w*:\\s", ""); // Strip off the leading 'Basionym: "
+            basionymNameStr = basionymNameStr.replaceAll("^\\w*:\\s", ""); // Strip off the leading 'Basionym: "
             BotanicalName basionym = makeBotanicalName(state, regNumber, fullBasionymStr, basionymNameStr, null, null);
             getNameService().save(basionym);
             taxonName.addBasionym(basionym);
@@ -287,22 +338,41 @@ public class IAPTExcelImport<CONFIG extends IAPTImportConfigurator> extends Simp
 
         // Types
         if(!StringUtils.isEmpty(typeStr)){
-            makeTypeData(typeStr, taxonName, regNumber, state);
+
+            if(taxonName.getRank().isSpecies() || taxonName.getRank().isLower(Rank.SPECIES())) {
+                makeSpecimenTypeData(typeStr, taxonName, regNumber, state);
+            } else {
+                makeNameTypeData(typeStr, taxonName, regNumber, state);
+            }
         }
 
         getTaxonService().save(taxon);
+
+        if(taxonName.getRank().equals(Rank.SPECIES()) || taxonName.getRank().isLower(Rank.SPECIES())){
+            // try to find the genus, it should have been imported already, Genera are coming first in the import file
+            Taxon genus = ((IAPTImportState)state).getGenusTaxonMap().get(taxonName.getGenusOrUninomial());
+            if(genus != null){
+                higherTaxonNode = genus.getTaxonNodes().iterator().next();
+            } else {
+                logger.info(csvReportLine(regNumber, "Parent genus not found for", nameStr));
+            }
+        }
+
         if(higherTaxonNode != null){
             higherTaxonNode.addChildTaxon(taxon, null, null);
             getTaxonNodeService().save(higherTaxonNode);
         }
 
-        return taxon;
+        if(taxonName.getRank().isGenus()){
+            ((IAPTImportState)state).getGenusTaxonMap().put(taxonName.getGenusOrUninomial(), taxon);
+        }
 
+        return taxon;
     }
 
-    private void makeTypeData(String typeStr, BotanicalName taxonName, String regNumber, SimpleExcelTaxonImportState<CONFIG> state) {
+    private void makeSpecimenTypeData(String typeStr, BotanicalName taxonName, String regNumber, SimpleExcelTaxonImportState<CONFIG> state) {
 
-        Matcher m = typeSplitPattern.matcher(typeStr);
+        Matcher m = typeSpecimenSplitPattern.matcher(typeStr);
 
         if(m.matches()){
             String fieldUnitStr = m.group(TypesName.fieldUnit.name());
@@ -310,7 +380,7 @@ public class IAPTExcelImport<CONFIG extends IAPTImportConfigurator> extends Simp
             FieldUnit fieldUnit = parseFieldUnit(fieldUnitStr, regNumber, state);
             if(fieldUnit == null) {
                 // create a field unit with only a titleCache using the fieldUnitStr substring
-                logger.warn(csvReportLine(regNumber, "Type: fielUnitStr can not be parsed", fieldUnitStr));
+                logger.warn(csvReportLine(regNumber, "Type: fieldUnitStr can not be parsed", fieldUnitStr));
                 fieldUnit = FieldUnit.NewInstance();
                 fieldUnit.setTitleCache(fieldUnitStr, true);
                 getOccurrenceService().save(fieldUnit);
@@ -331,6 +401,76 @@ public class IAPTExcelImport<CONFIG extends IAPTImportConfigurator> extends Simp
         getNameService().save(taxonName);
     }
 
+    /**
+     * Parses the type statement of a name above species rank and adds the type name
+     * found therein as name type designation to the <code>taxonName</code>.
+     *
+     * @param typeStr the raw type statement, may start with 'Type:'
+     * @param taxonName the name which receives the name type designation
+     * @param regNumber the registration number, used for the log report only
+     * @param state the import state, not used by this method
+     */
+    private void makeNameTypeData(String typeStr, BotanicalName taxonName, String regNumber, SimpleExcelTaxonImportState<CONFIG> state) {
+        String nameStr = typeStr.replaceAll("^Type\\s?\\:\\s?", "");
+        if(nameStr.isEmpty()) {
+            return;
+        }
+        String basionymNameStr = null;
+        String noteStr = null;
+        Matcher m;
+
+        // the leading 'Type:' is only stripped from nameStr, so nameStr must be tested here
+        if(nameStr.startsWith("not to be indicated")){
+            // Special case:
+            // Type: not to be indicated (Art. H.9.1. Tokyo Code); stated parent genera: Hechtia Klotzsch; Deuterocohnia Mez
+            // FIXME
+            m = typeNameSpecialSplitPattern.matcher(nameStr);
+            if(m.matches()){
+                nameStr = m.group("name");
+                noteStr = m.group("note");
+                String agentStr = m.group("agent");
+                // TODO better import of agent?
+                if(agentStr != null){
+                    noteStr = noteStr + ": " + agentStr;
+                }
+            }
+        } else {
+            // Generic case: first cut off the part starting with '(Basionym:'
+            m = typeNameBasionymPattern.matcher(nameStr);
+            if (m.find()) {
+                basionymNameStr = m.group("basionymName");
+                if (basionymNameStr != null) {
+                    nameStr = nameStr.replace(m.group(0), "");
+                }
+            }
+            // then cut off the note which starts with '['
+            m = typeNameNotePattern.matcher(nameStr);
+            if (m.find()) {
+                noteStr = m.group(1);
+                if (noteStr != null) {
+                    nameStr = nameStr.replace(m.group(0), "");
+                }
+            }
+        }
+
+        BotanicalName typeName = (BotanicalName) nameParser.parseFullName(nameStr, NomenclaturalCode.ICNAFP, null);
+        if(typeName.isProtectedTitleCache() || typeName.getNomenclaturalReference() != null && typeName.getNomenclaturalReference().isProtectedTitleCache()) {
+            logger.warn(csvReportLine(regNumber, "NameType not parsable", typeStr, nameStr));
+        }
+        if(basionymNameStr != null){
+            // the basionym has to be parsed from its own string, not from the string of the type name
+            BotanicalName basionymName = (BotanicalName) nameParser.parseFullName(basionymNameStr, NomenclaturalCode.ICNAFP, null);
+            getNameService().save(basionymName);
+            typeName.addBasionym(basionymName);
+        }
+        getNameService().save(typeName);
+        // annotate the designation which is actually attached to the name, otherwise the note gets lost
+        NameTypeDesignation nameTypeDesignation = taxonName.addNameTypeDesignation(typeName, null, null, null, null, false);
+        if(noteStr != null){
+            nameTypeDesignation.addAnnotation(Annotation.NewInstance(noteStr, AnnotationType.EDITORIAL(), Language.UNKNOWN_LANGUAGE()));
+        }
+    }
+
     /**
      * Currently only parses the collector, fieldNumber and the collection date.
      *
@@ -345,20 +485,27 @@ public class IAPTExcelImport<CONFIG extends IAPTImportConfigurator> extends Simp
 
         Matcher m1 = collectorPattern.matcher(fieldUnitStr);
         if(m1.matches()){
-            String collectionData = m1.group(1); // like (leg. Metzeltin, 30. 9. 1996)
-            if(collectionData == null){
-                collectionData = m1.group(2); // like leg. Metzeltin, 30. 9. 1996
+
+            String collectorData = m1.group(2); // like (leg. Metzeltin, 30. 9. 1996)
+            String removal = m1.group(1);
+            if(collectorData == null){
+                collectorData = m1.group(4); // like leg. Metzeltin, 30. 9. 1996
+                removal = m1.group(3);
             }
-            if(collectionData == null){
+            if(collectorData == null){
                 return null;
             }
 
+            // the fieldUnitStr is parsable
+            // remove all collectorData from the fieldUnitStr and use the rest as locality
+            String locality = fieldUnitStr.replace(removal, "");
+
             String collectorStr = null;
             String detailStr = null;
             Partial date = null;
             String fieldNumber = null;
 
-            Matcher m2 = collectionDataPattern.matcher(collectionData);
+            Matcher m2 = collectionDataPattern.matcher(collectorData);
             if(m2.matches()){
                 collectorStr = m2.group("collector");
                 detailStr = m2.group("detail");
@@ -377,40 +524,44 @@ public class IAPTExcelImport<CONFIG extends IAPTImportConfigurator> extends Simp
                 }
                 if(date == null && fieldNumber == null){
                     // detailed parsing not possible, so need fo fallback
-                    collectorStr = collectionData;
+                    collectorStr = collectorData;
                 }
             }
 
-            if(collectorStr != null) {
-                fieldUnit = FieldUnit.NewInstance();
-                GatheringEvent ge = GatheringEvent.NewInstance();
+            if(collectorStr == null) {
+                collectorStr = collectorData;
+            }
 
-                TeamOrPersonBase agent =  state.getAgentBase(collectorStr);
-                if(agent == null) {
-                    agent = Person.NewTitledInstance(collectorStr);
-                    getAgentService().save(agent);
-                    state.putAgentBase(collectorStr, agent);
-                }
-                ge.setCollector(agent);
+            fieldUnit = FieldUnit.NewInstance();
+            GatheringEvent ge = GatheringEvent.NewInstance();
+            ge.setLocality(LanguageString.NewInstance(locality, Language.UNKNOWN_LANGUAGE()));
 
-                if(date != null){
-                    ge.setGatheringDate(date);
-                }
+            TeamOrPersonBase agent =  state.getAgentBase(collectorStr);
+            if(agent == null) {
+                agent = Person.NewTitledInstance(collectorStr);
+                getAgentService().save(agent);
+                state.putAgentBase(collectorStr, agent);
+            }
+            ge.setCollector(agent);
 
-                getEventBaseService().save(ge);
-                fieldUnit.setGatheringEvent(ge);
+            if(date != null){
+                ge.setGatheringDate(date);
+            }
 
-                if(fieldNumber != null) {
-                    fieldUnit.setFieldNumber(fieldNumber);
-                }
-                getOccurrenceService().save(fieldUnit);
+            getEventBaseService().save(ge);
+            fieldUnit.setGatheringEvent(ge);
+
+            if(fieldNumber != null) {
+                fieldUnit.setFieldNumber(fieldNumber);
             }
+            getOccurrenceService().save(fieldUnit);
+
         }
 
         return fieldUnit;
     }
 
-    private Partial parseDate(String regNumber, String dateStr) {
+    protected Partial parseDate(String regNumber, String dateStr) {
 
         Partial pupDate = null;
         boolean parseError = false;
@@ -555,11 +706,12 @@ public class IAPTExcelImport<CONFIG extends IAPTImportConfigurator> extends Simp
      * @param regNumber
      * @return
      */
-    private DerivedUnit parseSpecimenType(FieldUnit fieldUnit, TypesName typeName, Collection collection, String text, String regNumber) {
+    protected DerivedUnit parseSpecimenType(FieldUnit fieldUnit, TypesName typeName, Collection collection, String text, String regNumber) {
 
         DerivedUnit specimen = null;
 
         String collectionCode = null;
+        String collectionTitle = null;
         String subCollectionStr = null;
         String instituteStr = null;
         String accessionNumber = null;
@@ -587,21 +739,23 @@ public class IAPTExcelImport<CONFIG extends IAPTImportConfigurator> extends Simp
             for (Pattern p : specimenTypePatterns) {
                 Matcher m = p.matcher(text);
                 if (m.matches()) {
-                    // collection code is mandatory
+                    // collection code or collectionTitle is mandatory
                     try {
                         collectionCode = m.group("colCode");
                     } catch (IllegalArgumentException e){
                         // match group colCode not found
                     }
+
                     try {
-                        subCollectionStr = m.group("subCollection");
+                        instituteStr = m.group("institute");
                     } catch (IllegalArgumentException e){
-                        // match group subCollection not found
+                        // match group institute not found
                     }
+
                     try {
-                        instituteStr = m.group("institute");
+                        subCollectionStr = m.group("subCollection");
                     } catch (IllegalArgumentException e){
-                        // match group col_name not found
+                        // match group subCollection not found
                     }
                     try {
                         accessionNumber = m.group("accNumber");
@@ -734,8 +888,12 @@ public class IAPTExcelImport<CONFIG extends IAPTImportConfigurator> extends Simp
             for(String text : nameAnnotations.keySet()){
                 taxonName.addAnnotation(Annotation.NewInstance(text, nameAnnotations.get(text), Language.DEFAULT()));
             }
-            getNameService().save(taxonName);
         }
+
+        taxonName.addSource(OriginalSourceType.Import, regNumber, null, state.getConfig().getSourceReference(), null);
+
+        getNameService().save(taxonName);
+
         return taxonName;
     }
 
@@ -798,7 +956,9 @@ public class IAPTExcelImport<CONFIG extends IAPTImportConfigurator> extends Simp
                 collection.setSuperCollection(superCollection);
             }
             collectionMap.put(key, collection);
-            getCollectionService().save(collection);
+            if(!_testMode) {
+                getCollectionService().save(collection);
+            }
         }
 
         return collection;
@@ -878,6 +1038,15 @@ public class IAPTExcelImport<CONFIG extends IAPTImportConfigurator> extends Simp
         value = StringUtils.replace(value, "c$k", "č");
         value = StringUtils.replace(value, " U$K", " Š");
 
+        value = StringUtils.replace(value, "O>U>!", "Ø");
+        value = StringUtils.replace(value, "o>!", "ø");
+        value = StringUtils.replace(value, "S$K", "Ŝ");
+        value = StringUtils.replace(value, ">l", "ğ");
+
+        value = StringUtils.replace(value, "§B>i", "ł");
+
+
+
         return value;
     }
 
@@ -920,6 +1089,7 @@ public class IAPTExcelImport<CONFIG extends IAPTImportConfigurator> extends Simp
         ((IAPTImportState)state).setCurrentTaxon(taxon);
 
 
+        logger.info("#of imported Genera: " + ((IAPTImportState) state).getGenusTaxonMap().size());
                return;
     }
 
@@ -967,9 +1137,9 @@ public class IAPTExcelImport<CONFIG extends IAPTImportConfigurator> extends Simp
         } else if(name.matches("^Incertae sedis$|^No group assigned$")){
            return rankFamilyIncertisSedis();
         } else if(name.matches(".*phyta$|.*mycota$")){
-           return Rank.SECTION_BOTANY();
+           return Rank.PHYLUM();
         } else if(name.matches(".*phytina$|.*mycotina$")){
-           return Rank.SUBSECTION_BOTANY();
+           return Rank.SUBPHYLUM();
         } else if(name.matches("Gymnospermae$|.*ones$")){ // Monocotyledones, Dicotyledones
             return rankUnrankedSupraGeneric();
         } else if(name.matches(".*opsida$|.*phyceae$|.*mycetes$|.*ones$|^Musci$|^Hepaticae$")){