app-import/src/main/java/eu/etaxonomy/cdm/io/iapt/IAPTExcelImport.java

   1 /**
   2  * Copyright (C) 2007 EDIT
   3  * European Distributed Institute of Taxonomy
   4  * http://www.e-taxonomy.eu
   5  *
   6  * The contents of this file are subject to the Mozilla Public License Version 1.1
   7  * See LICENSE.TXT at the top of this package for the full license terms.
   8  */
   9
  10 package eu.etaxonomy.cdm.io.iapt;
  11
  12 import com.fasterxml.jackson.core.JsonProcessingException;
  13 import com.fasterxml.jackson.databind.ObjectMapper;
  14 import eu.etaxonomy.cdm.api.facade.DerivedUnitFacade;
  15 import eu.etaxonomy.cdm.common.CdmUtils;
  16 import eu.etaxonomy.cdm.io.mexico.SimpleExcelTaxonImport;
  17 import eu.etaxonomy.cdm.io.mexico.SimpleExcelTaxonImportState;
  18 import eu.etaxonomy.cdm.model.agent.Institution;
  19 import eu.etaxonomy.cdm.model.agent.Person;
  20 import eu.etaxonomy.cdm.model.agent.TeamOrPersonBase;
  21 import eu.etaxonomy.cdm.model.common.*;
  22 import eu.etaxonomy.cdm.model.name.*;
  23 import eu.etaxonomy.cdm.model.occurrence.*;
  24 import eu.etaxonomy.cdm.model.occurrence.Collection;
  25 import eu.etaxonomy.cdm.model.reference.Reference;
  26 import eu.etaxonomy.cdm.model.reference.ReferenceFactory;
  27 import eu.etaxonomy.cdm.model.taxon.*;
  28 import eu.etaxonomy.cdm.strategy.parser.NonViralNameParserImpl;
  29 import org.apache.commons.lang.ArrayUtils;
  30 import org.apache.commons.lang.StringEscapeUtils;
  31 import org.apache.commons.lang.StringUtils;
  32 import org.apache.log4j.Level;
  33 import org.apache.log4j.Logger;
  34 import org.joda.time.DateTimeFieldType;
  35 import org.joda.time.Partial;
  36 import org.joda.time.format.DateTimeFormat;
  37 import org.joda.time.format.DateTimeFormatter;
  38 import org.springframework.stereotype.Component;
  39
  40 import java.util.*;
  41 import java.util.regex.Matcher;
  42 import java.util.regex.Pattern;
  43
  44 /**
  45  * @author a.mueller
  46  * @created 05.01.2016
  47  */
  48
  49 @Component("iAPTExcelImport")
  50 public class IAPTExcelImport<CONFIG extends IAPTImportConfigurator> extends SimpleExcelTaxonImport<CONFIG> {
  51     private static final long serialVersionUID = -747486709409732371L;
  52     private static final Logger logger = Logger.getLogger(IAPTExcelImport.class);
  53     public static final String ANNOTATION_MARKER_STRING = "[*]";
  54
  55
  56     private static UUID ROOT_UUID = UUID.fromString("4137fd2a-20f6-4e70-80b9-f296daf51d82");
  57
  58     private static NonViralNameParserImpl nameParser = NonViralNameParserImpl.NewInstance();
  59
  60     private final static String REGISTRATIONNO_PK= "RegistrationNo_Pk";
  61     private final static String HIGHERTAXON= "HigherTaxon";
  62     private final static String FULLNAME= "FullName";
  63     private final static String AUTHORSSPELLING= "AuthorsSpelling";
  64     private final static String LITSTRING= "LitString";
  65     private final static String REGISTRATION= "Registration";
  66     private final static String TYPE= "Type";
  67     private final static String CAVEATS= "Caveats";
  68     private final static String FULLBASIONYM= "FullBasionym";
  69     private final static String FULLSYNSUBST= "FullSynSubst";
  70     private final static String NOTESTXT= "NotesTxt";
  71     private final static String REGDATE= "RegDate";
  72     private final static String NAMESTRING= "NameString";
  73     private final static String BASIONYMSTRING= "BasionymString";
  74     private final static String SYNSUBSTSTR= "SynSubstStr";
  75     private final static String AUTHORSTRING= "AuthorString";
  76
  77     private  static List<String> expectedKeys= Arrays.asList(new String[]{
  78             REGISTRATIONNO_PK, HIGHERTAXON, FULLNAME, AUTHORSSPELLING, LITSTRING, REGISTRATION, TYPE, CAVEATS, FULLBASIONYM, FULLSYNSUBST, NOTESTXT, REGDATE, NAMESTRING, BASIONYMSTRING, SYNSUBSTSTR, AUTHORSTRING});
  79
  80     private static final Pattern nomRefTokenizeP = Pattern.compile("^(?<title>.*):\\s(?<detail>[^\\.:]+)\\.(?<date>.*?)(?:\\s\\((?<issue>[^\\)]*)\\)\\s*)?\\.?$");
  81     private static final Pattern[] datePatterns = new Pattern[]{
  82             // NOTE:
  83             // The order of the patterns is extremely important!!!
  84             //
  85             // all patterns cover the years 1700 - 1999
  86             Pattern.compile("^(?<year>1[7,8,9][0-9]{2})$"), // only year, like '1969'
  87             Pattern.compile("^(?<monthName>\\p{L}+\\.?)\\s(?<day>[0-9]{1,2})(?:st|rd|th)?\\.?,?\\s(?<year>(?:1[7,8,9])?[0-9]{2})$"), // full date like April 12, 1969 or april 12th 1999
  88             Pattern.compile("^(?<monthName>\\p{L}+\\.?),?\\s?(?<year>(?:1[7,8,9])?[0-9]{2})$"), // April 99 or April, 1999 or Apr. 12
  89             Pattern.compile("^(?<day>[0-9]{1,2})([\\.\\-/])(\\s?)(?<month>[0-1]?[0-9])\\2\\3(?<year>(?:1[7,8,9])?[0-9]{2})$"), // full date like 12.04.1969 or 12. 04. 1969 or 12/04/1969 or 12-04-1969
  90             Pattern.compile("^(?<day>[0-9]{1,2})([\\.\\-/])(?<monthName>[IVX]{1,2})\\2(?<year>(?:1[7,8,9])?[0-9]{2})$"), // full date like 12-VI-1969
  91             Pattern.compile("^(?:(?<day>[0-9]{1,2})(?:\\sde)?\\s)?(?<monthName>\\p{L}+)(?:\\sde)?\\s(?<year>(?:1[7,8,9])?[0-9]{2})$"), // full and partial date like 12 de Enero de 1999 or Enero de 1999
  92             Pattern.compile("^(?<month>[0-1]?[0-9])([\\.\\-/])(?<year>(?:1[7,8,9])?[0-9]{2})$"), // partial date like 04.1969 or 04/1969 or 04-1969
  93             Pattern.compile("^(?<year>(?:1[7,8,9])?[0-9]{2})([\\.\\-/])(?<month>[0-1]?[0-9])$"),//  partial date like 1999-04
  94             Pattern.compile("^(?<monthName>[IVX]{1,2})([\\.\\-/])(?<year>(?:1[7,8,9])?[0-9]{2})$"), // partial date like VI-1969
  95             Pattern.compile("^(?<day>[0-9]{1,2})(?:[\\./]|th|rd|st)?\\s(?<monthName>\\p{L}+\\.?),?\\s?(?<year>(?:1[7,8,9])?[0-9]{2})$"), // full date like 12. April 1969 or april 1999 or 22 Dec.1999
  96         };
  97     protected static final Pattern typeSpecimenSplitPattern =  Pattern.compile("^(?:\"*[Tt]ype: (?<fieldUnit>.*?))(?:[Hh]olotype:(?<holotype>.*?)\\.?)?(?:[Ii]sotype[^:]*:(?<isotype>.*)\\.?)?\\.?$");
  98
  99     private static final Pattern typeNameBasionymPattern =  Pattern.compile("\\([Bb]asionym\\s?\\:\\s?(?<basionymName>[^\\)]*).*$");
 100     private static final Pattern typeNameNotePattern =  Pattern.compile("\\[([^\\[]*)"); // matches the inner of '[...]'
 101     private static final Pattern typeNameSpecialSplitPattern =  Pattern.compile("(?<note>.*\\;.*?)\\:(?<agent>)\\;(<name>.*)");
 102
 103     protected static final Pattern collectorPattern =  Pattern.compile(".*?(?<fullStr1>\\([Ll]eg\\.\\s+(?<data1>[^\\)]*)\\)).*$|.*?(?<fullStr2>\\s[Ll]eg\\.\\:?\\s+(?<data2>.*?)\\.?)$|^(?<fullStr3>[Ll]eg\\.\\:?\\s+(?<data3>.*?)\\.?)");
 104     private static final Pattern collectionDataPattern =  Pattern.compile("^(?<collector>[^,]*),\\s?(?<detail>.*?)\\.?$");
 105     private static final Pattern collectorsNumber =  Pattern.compile("^([nN]o\\.\\s.*)$");
 106
 107     // AccessionNumbers: , #.*, n°:?, 96/3293, No..*, -?\w{1,3}-[0-9\-/]*
 108     private static final Pattern accessionNumberOnlyPattern = Pattern.compile("^(?<accNumber>(?:n°\\:?\\s?|#|No\\.?\\s?)?[\\d\\w\\-/]*)$");
 109
 110     private static final Pattern[] specimenTypePatterns = new Pattern[]{
 111             Pattern.compile("^(?<colCode>[A-Z]+|CPC Micropaleontology Lab\\.?)\\s+(?:\\((?<institute>.*[^\\)])\\))(?<accNumber>.*)?$"), // like: GAUF (Gansu Agricultural University) No. 1207-1222
 112             Pattern.compile("^(?<colCode>[A-Z]+|CPC Micropaleontology Lab\\.?)\\s+(?:Coll\\.\\s(?<subCollection>[^\\.,;]*)(.))(?<accNumber>.*)?$"), // like KASSEL Coll. Krasske, Praep. DII 78
 113             Pattern.compile("^(?:in\\s)?(?<institute>[Cc]oll\\.\\s.*?)(?:\\s+(?<accNumber>(Praep\\.|slide|No\\.|Inv\\. Nr\\.|Nr\\.).*))?$"), // like Coll. Lange-Bertalot, Bot. Inst., Univ. Frankfurt/Main, Germany Praep. Neukaledonien OTL 62
 114             Pattern.compile("^(?<institute>Inst\\.\\s.*?)\\s+(?<accNumber>N\\s.*)?$"), // like Inst. Geological Sciences, Acad. Sci. Belarus, Minsk N 212 A
 115             Pattern.compile("^(?<colCode>[A-Z]+)(?:\\s+(?<accNumber>.*))?$"), // identifies the Collection code and takes the rest as accessionNumber if any
 116     };
 117
 118
 119     private static final Pattern registrationPattern = Pattern.compile("^Registration date\\:\\s(?<regdate>\\d\\d\\.\\d\\d\\.\\d\\d); no\\.\\:\\s(?<regid>\\d+);\\soffice\\:\\s(?<office>.*?)\\.(?:\\s\\[Form no\\.\\:\\s(?<formNo>d+)\\])?$"); // Registration date: 29.06.98; no.: 2922; office: Berlin.
 120
 121     private static Map<String, Integer> monthFromNameMap = new HashMap<>();
 122
 123     static {
 124         String[] ck = new String[]{"leden", "únor", "březen", "duben", "květen", "červen", "červenec ", "srpen", "září", "říjen", "listopad", "prosinec"};
 125         String[] fr = new String[]{"janvier", "février", "mars", "avril", "mai", "juin", "juillet", "août", "septembre", "octobre", "novembre", "décembre"};
 126         String[] de = new String[]{"januar", "februar", "märz", "april", "mai", "juni", "juli", "august", "september", "oktober", "november", "dezember"};
 127         String[] en = new String[]{"january", "february", "march", "april", "may", "june", "july", "august", "september", "october", "november", "december"};
 128         String[] it = new String[]{"gennaio", "febbraio", "marzo", "aprile", "maggio", "giugno", "luglio", "agosto", "settembre", "ottobre", "novembre", "dicembre"};
 129         String[] sp = new String[]{"enero", "febrero", "marzo", "abril", "mayo", "junio", "julio", "agosto", "septiembre", "octubre", "noviembre", "diciembre"};
 130         String[] de_abbrev = new String[]{"jan.", "feb.", "märz", "apr.", "mai", "jun.", "jul.", "aug.", "sept.", "okt.", "nov.", "dez."};
 131         String[] en_abbrev = new String[]{"jan.", "feb.", "mar.", "apr.", "may", "jun.", "jul.", "aug.", "sep.", "oct.", "nov.", "dec."};
 132         String[] port = new String[]{"Janeiro", "Fevereiro", "Março", "Abril", "Maio", "Junho", "Julho", "Agosto", "Setembro", "Outubro", "Novembro", "Dezembro"};
 133         String[] rom_num = new String[]{"i", "ii", "iii", "iv", "v", "vi", "vii", "viii", "ix", "x", "xi", "xii"};
 134
 135         String[][] perLang =  new String[][]{ck, de, fr, en, it, sp, port, de_abbrev, en_abbrev, rom_num};
 136
 137         for (String[] months: perLang) {
 138             for(int m = 1; m < 13; m++){
 139                 monthFromNameMap.put(months[m - 1].toLowerCase(), m);
 140             }
 141         }
 142
 143         // special cases
 144         monthFromNameMap.put("mar", 3);
 145         monthFromNameMap.put("dec", 12);
 146         monthFromNameMap.put("februari", 2);
 147         monthFromNameMap.put("març", 3);
 148     }
 149
 150
 151     DateTimeFormatter formatterYear = DateTimeFormat.forPattern("yyyy");
 152
 153     private Map<String, Collection> collectionMap = new HashMap<>();
 154     private ExtensionType extensionTypeIAPTRegData = null;
 155
 156
 157     enum TypesName {
 158         fieldUnit, holotype, isotype;
 159
 160         public SpecimenTypeDesignationStatus status(){
 161             switch (this) {
 162                 case holotype:
 163                     return SpecimenTypeDesignationStatus.HOLOTYPE();
 164                 case isotype:
 165                     return SpecimenTypeDesignationStatus.ISOTYPE();
 166                 default:
 167                     return null;
 168             }
 169         }
 170     }
 171
 172     private MarkerType markerTypeFossil = null;
 173     private Rank rankUnrankedSupraGeneric = null;
 174     private Rank familyIncertisSedis = null;
 175     private AnnotationType annotationTypeCaveats = null;
 176
 177     private Reference bookVariedadesTradicionales = null;
 178
 179     /**
 180      * HACK for unit simple testing
 181      */
 182     boolean _testMode = System.getProperty("TEST_MODE") != null;
 183
 184     private Taxon makeTaxon(HashMap<String, String> record, SimpleExcelTaxonImportState<CONFIG> state,
 185                             TaxonNode higherTaxonNode, boolean isFossil) {
 186
 187         String regNumber = getValue(record, REGISTRATIONNO_PK, false);
 188         String regStr = getValue(record, REGISTRATION, true);
 189         String titleCacheStr = getValue(record, FULLNAME, true);
 190         String nameStr = getValue(record, NAMESTRING, true);
 191         String authorStr = getValue(record, AUTHORSTRING, true);
 192         String nomRefStr = getValue(record, LITSTRING, true);
 193         String authorsSpelling = getValue(record, AUTHORSSPELLING, true);
 194         String notesTxt = getValue(record, NOTESTXT, true);
 195         String caveats = getValue(record, CAVEATS, true);
 196         String fullSynSubstStr = getValue(record, FULLSYNSUBST, true);
 197         String fullBasionymStr = getValue(record, FULLBASIONYM, true);
 198         String basionymNameStr = getValue(record, FULLBASIONYM, true);
 199         String synSubstStr = getValue(record, SYNSUBSTSTR, true);
 200         String typeStr = getValue(record, TYPE, true);
 201
 202
 203         String nomRefTitle = null;
 204         String nomRefDetail;
 205         String nomRefPupDate = null;
 206         String nomRefIssue = null;
 207         Partial pupDate = null;
 208
 209         boolean restoreOriginalReference = false;
 210         boolean nameIsValid = true;
 211
 212         // preprocess nomRef: separate citation, reference detail, publishing date
 213         if(!StringUtils.isEmpty(nomRefStr)){
 214             nomRefStr = nomRefStr.trim();
 215
 216             // handle the special case which is hard to parse:
 217             //
 218             // Las variedades tradicionales de frutales de la Cuenca del Río Segura. Catálogo Etnobotánico (1): Frutos secos, oleaginosos, frutales de hueso, almendros y frutales de pepita: 154. 1997.
 219             if(nomRefStr.startsWith("Las variedades tradicionales de frutales ")){
 220
 221                 if(bookVariedadesTradicionales == null){
 222                     bookVariedadesTradicionales = ReferenceFactory.newBook();
 223                     bookVariedadesTradicionales.setTitle("Las variedades tradicionales de frutales de la Cuenca del Río Segura. Catálogo Etnobotánico (1): Frutos secos, oleaginosos, frutales de hueso, almendros y frutales de pepita");
 224                     bookVariedadesTradicionales.setDatePublished(TimePeriod.NewInstance(1997));
 225                     getReferenceService().save(bookVariedadesTradicionales);
 226                 }
 227                 nomRefStr = nomRefStr.replaceAll("^.*?\\:.*?\\:", "Las variedades tradicionales:");
 228                 restoreOriginalReference = true;
 229             }
 230
 231             Matcher m = nomRefTokenizeP.matcher(nomRefStr);
 232             if(m.matches()){
 233                 nomRefTitle = m.group("title");
 234                 nomRefDetail = m.group("detail");
 235                 nomRefPupDate = m.group("date").trim();
 236                 nomRefIssue = m.group("issue");
 237
 238                 pupDate = parseDate(regNumber, nomRefPupDate);
 239                 if (pupDate != null) {
 240                     nomRefTitle = nomRefTitle + ": " + nomRefDetail + ". " + pupDate.toString(formatterYear) + ".";
 241                 } else {
 242                     logger.warn(csvReportLine(regNumber, "Pub date", nomRefPupDate, "in", nomRefStr, "not parsable"));
 243                 }
 244             } else {
 245                 nomRefTitle = nomRefStr;
 246             }
 247         }
 248
 249         BotanicalName taxonName = makeBotanicalName(state, regNumber, titleCacheStr, nameStr, authorStr, nomRefTitle);
 250
 251         // always add the original strings of parsed data as annotation
 252         taxonName.addAnnotation(Annotation.NewInstance("imported and parsed data strings:" +
 253                         "\n -  '" + LITSTRING + "': "+ nomRefStr +
 254                         "\n -  '" + TYPE + "': " + typeStr +
 255                         "\n -  '" + REGISTRATION  + "': " + regStr
 256                 , AnnotationType.TECHNICAL(), Language.DEFAULT()));
 257
 258         if(restoreOriginalReference){
 259             taxonName.setNomenclaturalReference(bookVariedadesTradicionales);
 260         }
 261         if(pupDate != null) {
 262             taxonName.getNomenclaturalReference().setDatePublished(TimePeriod.NewInstance(pupDate));
 263         }
 264         if(nomRefIssue != null) {
 265             ((Reference)taxonName.getNomenclaturalReference()).setVolume(nomRefIssue);
 266         }
 267
 268
 269         if(!StringUtils.isEmpty(notesTxt)){
 270             notesTxt = notesTxt.replace("Notes: ", "").trim();
 271             taxonName.addAnnotation(Annotation.NewInstance(notesTxt, AnnotationType.EDITORIAL(), Language.DEFAULT()));
 272             nameIsValid = false;
 273
 274         }
 275         if(!StringUtils.isEmpty(caveats)){
 276             caveats = caveats.replace("Caveats: ", "").trim();
 277             taxonName.addAnnotation(Annotation.NewInstance(caveats, annotationTypeCaveats(), Language.DEFAULT()));
 278             nameIsValid = false;
 279         }
 280
 281         if(nameIsValid){
 282             // Status is always considered valid if no notes and cavets are set
 283             taxonName.addStatus(NomenclaturalStatus.NewInstance(NomenclaturalStatusType.VALID()));
 284         }
 285
 286         getNameService().save(taxonName);
 287
 288         // Namerelations
 289         if(!StringUtils.isEmpty(authorsSpelling)){
 290             authorsSpelling = authorsSpelling.replaceFirst("Author's spelling:", "").replaceAll("\"", "").trim();
 291
 292             String[] authorSpellingTokens = StringUtils.split(authorsSpelling, " ");
 293             String[] nameStrTokens = StringUtils.split(nameStr, " ");
 294
 295             ArrayUtils.reverse(authorSpellingTokens);
 296             ArrayUtils.reverse(nameStrTokens);
 297
 298             for (int i = 0; i < nameStrTokens.length; i++){
 299                 if(i < authorSpellingTokens.length){
 300                     nameStrTokens[i] = authorSpellingTokens[i];
 301                 }
 302             }
 303             ArrayUtils.reverse(nameStrTokens);
 304
 305             String misspelledNameStr = StringUtils.join (nameStrTokens, ' ');
 306             // build the fullnameString of the misspelled name
 307             misspelledNameStr = taxonName.getTitleCache().replace(nameStr, misspelledNameStr);
 308
 309             TaxonNameBase misspelledName = (BotanicalName) nameParser.parseReferencedName(misspelledNameStr, NomenclaturalCode.ICNAFP, null);
 310             misspelledName.addRelationshipToName(taxonName, NameRelationshipType.MISSPELLING(), null);
 311             getNameService().save(misspelledName);
 312         }
 313
 314         // Replaced Synonyms
 315         if(!StringUtils.isEmpty(fullSynSubstStr)){
 316             fullSynSubstStr = fullSynSubstStr.replace("Syn. subst.: ", "");
 317             BotanicalName replacedSynonymName = makeBotanicalName(state, regNumber, fullSynSubstStr, synSubstStr, null, null);
 318             replacedSynonymName.addReplacedSynonym(taxonName, null, null, null);
 319             getNameService().save(replacedSynonymName);
 320         }
 321
 322         Reference sec = state.getConfig().getSecReference();
 323         Taxon taxon = Taxon.NewInstance(taxonName, sec);
 324
 325         // Basionym
 326         if(fullBasionymStr != null){
 327             fullBasionymStr = fullBasionymStr.replaceAll("^\\w*:\\s", ""); // Strip off the leading 'Basionym: "
 328             basionymNameStr = basionymNameStr.replaceAll("^\\w*:\\s", ""); // Strip off the leading 'Basionym: "
 329             BotanicalName basionym = makeBotanicalName(state, regNumber, fullBasionymStr, basionymNameStr, null, null);
 330             getNameService().save(basionym);
 331             taxonName.addBasionym(basionym);
 332
 333             Synonym syn = Synonym.NewInstance(basionym, sec);
 334             taxon.addSynonym(syn, SynonymRelationshipType.HOMOTYPIC_SYNONYM_OF());
 335             getTaxonService().save(syn);
 336         }
 337
 338         // Markers
 339         if(isFossil){
 340             taxon.addMarker(Marker.NewInstance(markerTypeFossil(), true));
 341         }
 342
 343         // Types
 344         if(!StringUtils.isEmpty(typeStr)){
 345
 346             if(taxonName.getRank().isSpecies() || taxonName.getRank().isLower(Rank.SPECIES())) {
 347                 makeSpecimenTypeData(typeStr, taxonName, regNumber, state, false);
 348             } else {
 349                 makeNameTypeData(typeStr, taxonName, regNumber, state);
 350             }
 351         }
 352
 353         getTaxonService().save(taxon);
 354
 355         if(taxonName.getRank().equals(Rank.SPECIES()) || taxonName.getRank().isLower(Rank.SPECIES())){
 356             // try to find the genus, it should have been imported already, Genera are coming first in the import file
 357             Taxon genus = ((IAPTImportState)state).getGenusTaxonMap().get(taxonName.getGenusOrUninomial());
 358             if(genus != null){
 359                 higherTaxonNode = genus.getTaxonNodes().iterator().next();
 360             } else {
 361                 logger.info(csvReportLine(regNumber, "Parent genus not found for", nameStr));
 362             }
 363         }
 364
 365         if(higherTaxonNode != null){
 366             higherTaxonNode.addChildTaxon(taxon, null, null);
 367             getTaxonNodeService().save(higherTaxonNode);
 368         }
 369
 370         if(taxonName.getRank().isGenus()){
 371             ((IAPTImportState)state).getGenusTaxonMap().put(taxonName.getGenusOrUninomial(), taxon);
 372         }
 373
 374         return taxon;
 375     }
 376
 377     private void makeSpecimenTypeData(String typeStr, BotanicalName taxonName, String regNumber, SimpleExcelTaxonImportState<CONFIG> state, boolean isFossil) {
 378
 379         Matcher m = typeSpecimenSplitPattern.matcher(typeStr);
 380
 381         if(m.matches()){
 382             String fieldUnitStr = m.group(TypesName.fieldUnit.name());
 383             // boolean isFieldUnit = typeStr.matches(".*([°']|\\d+\\s?m\\s|\\d+\\s?km\\s).*"); // check for location or unit m, km // makes no sense!!!!
 384             FieldUnit fieldUnit = parseFieldUnit(fieldUnitStr, regNumber, state);
 385             if(fieldUnit == null) {
 386                 // create a field unit with only a titleCache using the fieldUnitStr substring
 387                 logger.warn(csvReportLine(regNumber, "Type: fieldUnitStr can not be parsed", fieldUnitStr));
 388                 fieldUnit = FieldUnit.NewInstance();
 389                 fieldUnit.setTitleCache(fieldUnitStr, true);
 390                 getOccurrenceService().save(fieldUnit);
 391             }
 392             getOccurrenceService().save(fieldUnit);
 393
 394             SpecimenOrObservationType specimenType;
 395             if(isFossil){
 396                 specimenType = SpecimenOrObservationType.Fossil;
 397             } else {
 398                 specimenType = SpecimenOrObservationType.PreservedSpecimen;
 399             }
 400
 401             // all others ..
 402             addSpecimenTypes(taxonName, fieldUnit, m.group(TypesName.holotype.name()), TypesName.holotype, false, regNumber, specimenType);
 403             addSpecimenTypes(taxonName, fieldUnit, m.group(TypesName.isotype.name()), TypesName.isotype, true, regNumber, specimenType);
 404
 405         } else {
 406             // create a field unit with only a titleCache using the full typeStr
 407             FieldUnit fieldUnit = FieldUnit.NewInstance();
 408             fieldUnit.setTitleCache(typeStr, true);
 409             getOccurrenceService().save(fieldUnit);
 410             logger.warn(csvReportLine(regNumber, "Type: field 'Type' can not be parsed", typeStr));
 411         }
 412         getNameService().save(taxonName);
 413     }
 414
 415     private void makeNameTypeData(String typeStr, BotanicalName taxonName, String regNumber, SimpleExcelTaxonImportState<CONFIG> state) {
 416
 417         String nameStr = typeStr.replaceAll("^Type\\s?\\:\\s?", "");
 418         if(nameStr.isEmpty()) {
 419             return;
 420         }
 421
 422         String basionymNameStr = null;
 423         String noteStr = null;
 424         String agentStr = null;
 425
 426         Matcher m;
 427
 428         if(typeStr.startsWith("not to be indicated")){
 429             // Special case:
 430             // Type: not to be indicated (Art. H.9.1. Tokyo Code); stated parent genera: Hechtia Klotzsch; Deuterocohnia Mez
 431             // FIXME
 432             m = typeNameSpecialSplitPattern.matcher(nameStr);
 433             if(m.matches()){
 434                 nameStr = m.group("name");
 435                 noteStr = m.group("note");
 436                 agentStr = m.group("agent");
 437                 // TODO better import of agent?
 438                 if(agentStr != null){
 439                     noteStr = noteStr + ": " + agentStr;
 440                 }
 441             }
 442         } else {
 443             // Generic case
 444             m = typeNameBasionymPattern.matcher(nameStr);
 445             if (m.find()) {
 446                 basionymNameStr = m.group("basionymName");
 447                 if (basionymNameStr != null) {
 448                     nameStr = nameStr.replace(m.group(0), "");
 449                 }
 450             }
 451
 452             m = typeNameNotePattern.matcher(nameStr);
 453             if (m.find()) {
 454                 noteStr = m.group(1);
 455                 if (noteStr != null) {
 456                     nameStr = nameStr.replace(m.group(0), "");
 457                 }
 458             }
 459         }
 460
 461         BotanicalName typeName = (BotanicalName) nameParser.parseFullName(nameStr, NomenclaturalCode.ICNAFP, null);
 462
 463         if(typeName.isProtectedTitleCache() || typeName.getNomenclaturalReference() != null && typeName.getNomenclaturalReference().isProtectedTitleCache()) {
 464             logger.warn(csvReportLine(regNumber, "NameType not parsable", typeStr, nameStr));
 465         }
 466
 467         if(basionymNameStr != null){
 468             BotanicalName basionymName = (BotanicalName) nameParser.parseFullName(nameStr, NomenclaturalCode.ICNAFP, null);
 469             getNameService().save(basionymName);
 470             typeName.addBasionym(basionymName);
 471         }
 472
 473
 474         NameTypeDesignation nameTypeDesignation = NameTypeDesignation.NewInstance();
 475         nameTypeDesignation.setTypeName(typeName);
 476         getNameService().save(typeName);
 477
 478         if(noteStr != null){
 479             nameTypeDesignation.addAnnotation(Annotation.NewInstance(noteStr, AnnotationType.EDITORIAL(), Language.UNKNOWN_LANGUAGE()));
 480         }
 481         taxonName.addNameTypeDesignation(typeName, null, null, null, null, false);
 482
 483     }
 484
 485     /**
 486      * Currently only parses the collector, fieldNumber and the collection date.
 487      *
 488      * @param fieldUnitStr
 489      * @param regNumber
 490      * @param state
 491      * @return null if the fieldUnitStr could not be parsed
 492      */
 493     protected FieldUnit parseFieldUnit(String fieldUnitStr, String regNumber, SimpleExcelTaxonImportState<CONFIG> state) {
 494
 495         FieldUnit fieldUnit = null;
 496
 497         Matcher m1 = collectorPattern.matcher(fieldUnitStr);
 498         if(m1.matches()){
 499
 500             String collectorData = m1.group(2); // like ... (leg. Metzeltin, 30. 9. 1996)
 501             String removal = m1.group(1);
 502             if(collectorData == null){
 503                 collectorData = m1.group(4); // like ... leg. Metzeltin, 30. 9. 1996
 504                 removal = m1.group(3);
 505             }
 506             if(collectorData == null){
 507                 collectorData = m1.group(6); // like ^leg. J. J. Halda 18.3.1997$
 508                 removal = null;
 509             }
 510             if(collectorData == null){
 511                 return null;
 512             }
 513
 514             // the fieldUnitStr is parsable
 515             // remove all collectorData from the fieldUnitStr and use the rest as locality
 516             String locality = null;
 517             if(removal != null){
 518                 locality = fieldUnitStr.replace(removal, "");
 519             }
 520
 521             String collectorStr = null;
 522             String detailStr = null;
 523             Partial date = null;
 524             String fieldNumber = null;
 525
 526             Matcher m2 = collectionDataPattern.matcher(collectorData);
 527             if(m2.matches()){
 528                 collectorStr = m2.group("collector");
 529                 detailStr = m2.group("detail");
 530
 531                 // Try to make sense of the detailStr
 532                 if(detailStr != null){
 533                     detailStr = detailStr.trim();
 534                     // 1. try to parse as date
 535                     date = parseDate(regNumber, detailStr);
 536                     if(date == null){
 537                         // 2. try to parse as number
 538                         if(collectorsNumber.matcher(detailStr).matches()){
 539                             fieldNumber = detailStr;
 540                         }
 541                     }
 542                 }
 543                 if(date == null && fieldNumber == null){
 544                     // detailed parsing not possible, so need fo fallback
 545                     collectorStr = collectorData;
 546                 }
 547             }
 548
 549             if(collectorStr == null) {
 550                 collectorStr = collectorData;
 551             }
 552
 553             fieldUnit = FieldUnit.NewInstance();
 554             GatheringEvent ge = GatheringEvent.NewInstance();
 555             if(locality != null){
 556                 ge.setLocality(LanguageString.NewInstance(locality, Language.UNKNOWN_LANGUAGE()));
 557             }
 558
 559             TeamOrPersonBase agent =  state.getAgentBase(collectorStr);
 560             if(agent == null) {
 561                 agent = Person.NewTitledInstance(collectorStr);
 562                 getAgentService().save(agent);
 563                 state.putAgentBase(collectorStr, agent);
 564             }
 565             ge.setCollector(agent);
 566
 567             if(date != null){
 568                 ge.setGatheringDate(date);
 569             }
 570
 571             getEventBaseService().save(ge);
 572             fieldUnit.setGatheringEvent(ge);
 573
 574             if(fieldNumber != null) {
 575                 fieldUnit.setFieldNumber(fieldNumber);
 576             }
 577             getOccurrenceService().save(fieldUnit);
 578
 579         }
 580
 581         return fieldUnit;
 582     }
 583
 584     protected Partial parseDate(String regNumber, String dateStr) {
 585
 586         Partial pupDate = null;
 587         boolean parseError = false;
 588
 589         String day = null;
 590         String month = null;
 591         String monthName = null;
 592         String year = null;
 593
 594         for(Pattern p : datePatterns){
 595             Matcher m2 = p.matcher(dateStr);
 596             if(m2.matches()){
 597                 try {
 598                     year = m2.group("year");
 599                 } catch (IllegalArgumentException e){
 600                     // named capture group not found
 601                 }
 602                 try {
 603                     month = m2.group("month");
 604                 } catch (IllegalArgumentException e){
 605                     // named capture group not found
 606                 }
 607
 608                 try {
 609                     monthName = m2.group("monthName");
 610                     month = monthFromName(monthName, regNumber);
 611                     if(month == null){
 612                         parseError = true;
 613                     }
 614                 } catch (IllegalArgumentException e){
 615                     // named capture group not found
 616                 }
 617                 try {
 618                     day = m2.group("day");
 619                 } catch (IllegalArgumentException e){
 620                     // named capture group not found
 621                 }
 622
 623                 if(year != null){
 624                     if (year.length() == 2) {
 625                         // it is an abbreviated year from the 19** years
 626                         year = "19" + year;
 627                     }
 628                     break;
 629                 } else {
 630                     parseError = true;
 631                 }
 632             }
 633         }
 634         if(year == null){
 635             parseError = true;
 636         }
 637         List<DateTimeFieldType> types = new ArrayList<>();
 638         List<Integer> values = new ArrayList<>();
 639         if(!parseError) {
 640             types.add(DateTimeFieldType.year());
 641             values.add(Integer.parseInt(year));
 642             if (month != null) {
 643                 types.add(DateTimeFieldType.monthOfYear());
 644                 values.add(Integer.parseInt(month));
 645             }
 646             if (day != null) {
 647                 types.add(DateTimeFieldType.dayOfMonth());
 648                 values.add(Integer.parseInt(day));
 649             }
 650             pupDate = new Partial(types.toArray(new DateTimeFieldType[types.size()]), ArrayUtils.toPrimitive(values.toArray(new Integer[values.size()])));
 651         }
 652         return pupDate;
 653     }
 654
 655     private String monthFromName(String monthName, String regNumber) {
 656
 657         Integer month = monthFromNameMap.get(monthName.toLowerCase());
 658         if(month == null){
 659             logger.warn(csvReportLine(regNumber, "Unknown month name", monthName));
 660             return null;
 661         } else {
 662             return month.toString();
 663         }
 664     }
 665
 666
 667     private void addSpecimenTypes(BotanicalName taxonName, FieldUnit fieldUnit, String typeStr, TypesName typeName, boolean multiple, String regNumber, SpecimenOrObservationType specimenType){
 668
 669         if(StringUtils.isEmpty(typeStr)){
 670             return;
 671         }
 672         typeStr = typeStr.trim().replaceAll("\\.$", "");
 673
 674         Collection collection = null;
 675         DerivedUnit specimen = null;
 676
 677         List<DerivedUnit> specimens = new ArrayList<>();
 678         if(multiple){
 679             String[] tokens = typeStr.split("\\s?,\\s?");
 680             for (String t : tokens) {
 681                 // command to  list all complex parsabel types:
 682                 // csvcut -t -c RegistrationNo_Pk,Type iapt.csv | csvgrep -c Type -m "Holotype" | egrep -o 'Holotype:\s([A-Z]*\s)[^.]*?'
 683                 // csvcut -t -c RegistrationNo_Pk,Type iapt.csv | csvgrep -c Type -m "Holotype" | egrep -o 'Isotype[^:]*:\s([A-Z]*\s)[^.]*?'
 684
 685                 if(!t.isEmpty()){
 686                     // trying to parse the string
 687                     specimen = parseSpecimenType(fieldUnit, typeName, collection, t, regNumber);
 688                     if(specimen != null){
 689                         specimens.add(specimen);
 690                     } else {
 691                         // parsing was not successful make simple specimen
 692                         specimens.add(makeSpecimenType(fieldUnit, t, specimenType));
 693                     }
 694                 }
 695             }
 696         } else {
 697             specimen = parseSpecimenType(fieldUnit, typeName, collection, typeStr, regNumber);
 698             if(specimen != null) {
 699                 specimens.add(specimen);
 700                 // remember current collection
 701                 collection = specimen.getCollection();
 702             } else {
 703                 // parsing was not successful make simple specimen
 704                 specimens.add(makeSpecimenType(fieldUnit, typeStr, SpecimenOrObservationType.PreservedSpecimen));
 705             }
 706         }
 707
 708         for(DerivedUnit s : specimens){
 709             taxonName.addSpecimenTypeDesignation(s, typeName.status(), null, null, null, false, true);
 710        }
 711     }
 712
 713     private DerivedUnit makeSpecimenType(FieldUnit fieldUnit, String titleCache, SpecimenOrObservationType specimenType) {
 714         DerivedUnit specimen;DerivedUnitFacade facade = DerivedUnitFacade.NewInstance(specimenType, fieldUnit);
 715         facade.setTitleCache(titleCache.trim(), true);
 716         specimen = facade.innerDerivedUnit();
 717         return specimen;
 718     }
 719
 720     /**
 721      *
 722      * @param fieldUnit
 723      * @param typeName
 724      * @param collection
 725      * @param text
 726      * @param regNumber
 727      * @return
 728      */
 729     protected DerivedUnit parseSpecimenType(FieldUnit fieldUnit, TypesName typeName, Collection collection, String text, String regNumber) {
 730
 731         DerivedUnit specimen = null;
 732
 733         String collectionCode = null;
 734         String collectionTitle = null;
 735         String subCollectionStr = null;
 736         String instituteStr = null;
 737         String accessionNumber = null;
 738
 739         boolean unusualAccessionNumber = false;
 740
 741         text = text.trim();
 742
 743         // 1.  For Isotypes often the accession number is noted alone if the
 744         //     preceeding entry has a collection code.
 745         if(typeName .equals(TypesName.isotype) && collection != null){
 746             Matcher m = accessionNumberOnlyPattern.matcher(text);
 747             if(m.matches()){
 748                 try {
 749                     accessionNumber = m.group("accNumber");
 750                     specimen = makeSpecimenType(fieldUnit, collection, accessionNumber);
 751                 } catch (IllegalArgumentException e){
 752                     // match group acc_number not found
 753                 }
 754             }
 755         }
 756
 757         //2. try it the 'normal' way
 758         if(specimen == null) {
 759             for (Pattern p : specimenTypePatterns) {
 760                 Matcher m = p.matcher(text);
 761                 if (m.matches()) {
 762                     // collection code or collectionTitle is mandatory
 763                     try {
 764                         collectionCode = m.group("colCode");
 765                     } catch (IllegalArgumentException e){
 766                         // match group colCode not found
 767                     }
 768
 769                     try {
 770                         instituteStr = m.group("institute");
 771                     } catch (IllegalArgumentException e){
 772                         // match group col_name not found
 773                     }
 774
 775                     try {
 776                         subCollectionStr = m.group("subCollection");
 777                     } catch (IllegalArgumentException e){
 778                         // match group subCollection not found
 779                     }
 780                     try {
 781                         accessionNumber = m.group("accNumber");
 782
 783                         // try to improve the accessionNumber
 784                         if(accessionNumber!= null) {
 785                             accessionNumber = accessionNumber.trim();
 786                             Matcher m2 = accessionNumberOnlyPattern.matcher(accessionNumber);
 787                             String betterAccessionNumber = null;
 788                             if (m2.matches()) {
 789                                 try {
 790                                     betterAccessionNumber = m.group("accNumber");
 791                                 } catch (IllegalArgumentException e) {
 792                                     // match group acc_number not found
 793                                 }
 794                             }
 795                             if (betterAccessionNumber != null) {
 796                                 accessionNumber = betterAccessionNumber;
 797                             } else {
 798                                 unusualAccessionNumber = true;
 799                             }
 800                         }
 801
 802                     } catch (IllegalArgumentException e){
 803                         // match group acc_number not found
 804                     }
 805
 806                     if(collectionCode == null && instituteStr == null){
 807                         logger.warn(csvReportLine(regNumber, "Type: neither 'collectionCode' nor 'institute' found in ", text));
 808                         continue;
 809                     }
 810                     collection = getCollection(collectionCode, instituteStr, subCollectionStr);
 811                     specimen = makeSpecimenType(fieldUnit, collection, accessionNumber);
 812                     break;
 813                 }
 814             }
 815         }
 816         if(specimen == null) {
 817             logger.warn(csvReportLine(regNumber, "Type: Could not parse specimen", typeName.name().toString(), text));
 818         }
 819         if(unusualAccessionNumber){
 820             logger.warn(csvReportLine(regNumber, "Type: Unusual accession number", typeName.name().toString(), text, accessionNumber));
 821         }
 822         return specimen;
 823     }
 824
 825     private DerivedUnit makeSpecimenType(FieldUnit fieldUnit, Collection collection, String accessionNumber) {
 826
 827         DerivedUnitFacade facade = DerivedUnitFacade.NewInstance(SpecimenOrObservationType.PreservedSpecimen, fieldUnit);
 828         facade.setCollection(collection);
 829         if(accessionNumber != null){
 830             facade.setAccessionNumber(accessionNumber);
 831         }
 832         return facade.innerDerivedUnit();
 833     }
 834
 835     private BotanicalName makeBotanicalName(SimpleExcelTaxonImportState<CONFIG> state, String regNumber, String titleCacheStr, String nameStr,
 836                                             String authorStr, String nomRefTitle) {
 837
 838         BotanicalName taxonName;// cache field for the taxonName.titleCache
 839         String taxonNameTitleCache = null;
 840         Map<String, AnnotationType> nameAnnotations = new HashMap<>();
 841
 842         // TitleCache preprocessing
 843         if(titleCacheStr.endsWith(ANNOTATION_MARKER_STRING) || (authorStr != null && authorStr.endsWith(ANNOTATION_MARKER_STRING))){
 844             nameAnnotations.put("Author abbreviation not checked.", AnnotationType.EDITORIAL());
 845             titleCacheStr = titleCacheStr.replace(ANNOTATION_MARKER_STRING, "").trim();
 846             if(authorStr != null) {
 847                 authorStr = authorStr.replace(ANNOTATION_MARKER_STRING, "").trim();
 848             }
 849         }
 850
 851         // parse the full taxon name
 852         if(!StringUtils.isEmpty(nomRefTitle)){
 853             String referenceSeparator = nomRefTitle.startsWith("in ") ? " " : ", ";
 854             String taxonFullNameStr = titleCacheStr + referenceSeparator + nomRefTitle;
 855             logger.debug(":::::" + taxonFullNameStr);
 856             taxonName = (BotanicalName) nameParser.parseReferencedName(taxonFullNameStr, NomenclaturalCode.ICNAFP, null);
 857         } else {
 858             taxonName = (BotanicalName) nameParser.parseFullName(titleCacheStr, NomenclaturalCode.ICNAFP, null);
 859         }
 860
 861         taxonNameTitleCache = taxonName.getTitleCache().trim();
 862         if (taxonName.isProtectedTitleCache()) {
 863             logger.warn(csvReportLine(regNumber, "Name could not be parsed", titleCacheStr));
 864         } else {
 865
 866             boolean doRestoreTitleCacheStr = false;
 867
 868             // Check if titleCache and nameCache are plausible
 869             String titleCacheCompareStr = titleCacheStr;
 870             String nameCache = taxonName.getNameCache();
 871             String nameCompareStr = nameStr;
 872             if(taxonName.isBinomHybrid()){
 873                 titleCacheCompareStr = titleCacheCompareStr.replace(" x ", " ×");
 874                 nameCompareStr = nameCompareStr.replace(" x ", " ×");
 875             }
 876             if(taxonName.isMonomHybrid()){
 877                 titleCacheCompareStr = titleCacheCompareStr.replaceAll("^X ", "× ");
 878                 nameCompareStr = nameCompareStr.replace("^X ", "× ");
 879             }
 880             if(authorStr != null && authorStr.contains(" et ")){
 881                 titleCacheCompareStr = titleCacheCompareStr.replaceAll(" et ", " & ");
 882             }
 883             if (!taxonNameTitleCache.equals(titleCacheCompareStr)) {
 884                 logger.warn(csvReportLine(regNumber, "The generated titleCache differs from the imported string", taxonNameTitleCache, " != ", titleCacheStr, " ==> original titleCacheStr has been restored"));
 885                 doRestoreTitleCacheStr = true;
 886             }
 887             if (!nameCache.trim().equals(nameCompareStr)) {
 888                 logger.warn(csvReportLine(regNumber, "The parsed nameCache differs from field '" + NAMESTRING + "'", nameCache, " != ", nameCompareStr));
 889             }
 890
 891             //  Author
 892             //nameParser.handleAuthors(taxonName, titleCacheStr, authorStr);
 893             //if (!titleCacheStr.equals(taxonName.getTitleCache())) {
 894             //    logger.warn(regNumber + ": titleCache has changed after setting authors, will restore original titleCacheStr");
 895             //    doRestoreTitleCacheStr = true;
 896             //}
 897
 898             if(doRestoreTitleCacheStr){
 899                 taxonName.setTitleCache(titleCacheStr, true);
 900             }
 901
 902             // deduplicate
 903             replaceAuthorNamesAndNomRef(state, taxonName);
 904         }
 905
 906         // Annotations
 907         if(!nameAnnotations.isEmpty()){
 908             for(String text : nameAnnotations.keySet()){
 909                 taxonName.addAnnotation(Annotation.NewInstance(text, nameAnnotations.get(text), Language.DEFAULT()));
 910             }
 911         }
 912
 913         taxonName.addSource(OriginalSourceType.Import, regNumber, null, state.getConfig().getSourceReference(), null);
 914
 915         getNameService().save(taxonName);
 916
 917         return taxonName;
 918     }
 919
 920     /**
 921      * @param state
 922      * @return
 923      */
 924     private TaxonNode getClassificationRootNode(IAPTImportState state) {
 925
 926      //   Classification classification = state.getClassification();
 927      //   if (classification == null){
 928      //       IAPTImportConfigurator config = state.getConfig();
 929      //       classification = Classification.NewInstance(state.getConfig().getClassificationName());
 930      //       classification.setUuid(config.getClassificationUuid());
 931      //       classification.setReference(config.getSecReference());
 932      //       classification = getClassificationService().find(state.getConfig().getClassificationUuid());
 933      //   }
 934         TaxonNode rootNode = state.getRootNode();
 935         if (rootNode == null){
 936             rootNode = getTaxonNodeService().find(ROOT_UUID);
 937         }
 938         if (rootNode == null){
 939             Classification classification = state.getClassification();
 940             if (classification == null){
 941                 Reference sec = state.getSecReference();
 942                 String classificationName = state.getConfig().getClassificationName();
 943                 Language language = Language.DEFAULT();
 944                 classification = Classification.NewInstance(classificationName, sec, language);
 945                 state.setClassification(classification);
 946                 classification.setUuid(state.getConfig().getClassificationUuid());
 947                 classification.getRootNode().setUuid(ROOT_UUID);
 948                 getClassificationService().save(classification);
 949             }
 950             rootNode = classification.getRootNode();
 951             state.setRootNode(rootNode);
 952         }
 953         return rootNode;
 954     }
 955
 956     private Collection getCollection(String collectionCode, String instituteStr, String subCollectionStr){
 957
 958         Collection superCollection = null;
 959         if(subCollectionStr != null){
 960             superCollection = getCollection(collectionCode, instituteStr, null);
 961             collectionCode = subCollectionStr;
 962             instituteStr = null;
 963         }
 964
 965         final String key = collectionCode + "-#i:" + StringUtils.defaultString(instituteStr);
 966
 967         Collection collection = collectionMap.get(key);
 968
 969         if(collection == null) {
 970             collection = Collection.NewInstance();
 971             collection.setCode(collectionCode);
 972             if(instituteStr != null){
 973                 collection.setInstitute(Institution.NewNamedInstance(instituteStr));
 974             }
 975             if(superCollection != null){
 976                 collection.setSuperCollection(superCollection);
 977             }
 978             collectionMap.put(key, collection);
 979             if(!_testMode) {
 980                 getCollectionService().save(collection);
 981             }
 982         }
 983
 984         return collection;
 985     }
 986
 987
 988     /**
 989      * @param record
 990      * @param originalKey
 991      * @param doUnescapeHtmlEntities
 992      * @return
 993      */
 994     private String getValue(HashMap<String, String> record, String originalKey, boolean doUnescapeHtmlEntities) {
 995         String value = record.get(originalKey);
 996
 997         value = fixCharacters(value);
 998
 999         if (! StringUtils.isBlank(value)) {
1000                 if (logger.isDebugEnabled()) {
1001                     logger.debug(originalKey + ": " + value);
1002                 }
1003                 value = CdmUtils.removeDuplicateWhitespace(value.trim()).toString();
1004             if(doUnescapeHtmlEntities){
1005                 value = StringEscapeUtils.unescapeHtml(value);
1006             }
1007                 return value.trim();
1008         }else{
1009                 return null;
1010         }
1011     }
1012
1013     /**
1014      * Fixes broken characters.
1015      * For details see
1016      * http://dev.e-taxonomy.eu/redmine/issues/6035
1017      *
1018      * @param value
1019      * @return
1020      */
1021     private String fixCharacters(String value) {
1022
1023         value = StringUtils.replace(value, "s$K", "š");
1024         value = StringUtils.replace(value, "n$K", "ň");
1025         value = StringUtils.replace(value, "e$K", "ě");
1026         value = StringUtils.replace(value, "r$K", "ř");
1027         value = StringUtils.replace(value, "c$K", "č");
1028         value = StringUtils.replace(value, "z$K", "ž");
1029         value = StringUtils.replace(value, "S>U$K", "Š");
1030         value = StringUtils.replace(value, "C>U$K", "Č");
1031         value = StringUtils.replace(value, "R>U$K", "Ř");
1032         value = StringUtils.replace(value, "Z>U$K", "Ž");
1033         value = StringUtils.replace(value, "g$K", "ǧ");
1034         value = StringUtils.replace(value, "s$A", "ś");
1035         value = StringUtils.replace(value, "n$A", "ń");
1036         value = StringUtils.replace(value, "c$A", "ć");
1037         value = StringUtils.replace(value, "e$E", "ę");
1038         value = StringUtils.replace(value, "o$H", "õ");
1039         value = StringUtils.replace(value, "s$C", "ş");
1040         value = StringUtils.replace(value, "t$C", "ț");
1041         value = StringUtils.replace(value, "S>U$C", "Ş");
1042         value = StringUtils.replace(value, "a$O", "å");
1043         value = StringUtils.replace(value, "A>U$O", "Å");
1044         value = StringUtils.replace(value, "u$O", "ů");
1045         value = StringUtils.replace(value, "g$B", "ğ");
1046         value = StringUtils.replace(value, "g$B", "ĕ");
1047         value = StringUtils.replace(value, "a$B", "ă");
1048         value = StringUtils.replace(value, "l$/", "ł");
1049         value = StringUtils.replace(value, ">i", "ı");
1050         value = StringUtils.replace(value, "i$U", "ï");
1051         // Special-cases
1052         value = StringUtils.replace(value, "&yacute", "ý");
1053         value = StringUtils.replace(value, ">L", "Ł"); // corrected rule
1054         value = StringUtils.replace(value, "E>U$D", "З");
1055         value = StringUtils.replace(value, "S>U$E", "Ş");
1056         value = StringUtils.replace(value, "s$E", "ş");
1057
1058         value = StringUtils.replace(value, "c$k", "č");
1059         value = StringUtils.replace(value, " U$K", " Š");
1060
1061         value = StringUtils.replace(value, "O>U>!", "Ø");
1062         value = StringUtils.replace(value, "o>!", "ø");
1063         value = StringUtils.replace(value, "S$K", "Ŝ");
1064         value = StringUtils.replace(value, ">l", "ğ");
1065
1066         value = StringUtils.replace(value, "§B>i", "ł");
1067
1068
1069
1070         return value;
1071     }
1072
1073
1074     /**
1075          *  Stores taxa records in DB
1076          */
1077         @Override
1078     protected void firstPass(SimpleExcelTaxonImportState<CONFIG> state) {
1079
1080         String lineNumber = "L#" + state.getCurrentLine() + ": ";
1081         logger.setLevel(Level.DEBUG);
1082         HashMap<String, String> record = state.getOriginalRecord();
1083         logger.debug(lineNumber + record.toString());
1084
1085         Set<String> keys = record.keySet();
1086         for (String key: keys) {
1087             if (! expectedKeys.contains(key)){
1088                 logger.warn(lineNumber + "Unexpected Key: " + key);
1089             }
1090         }
1091
1092         String reg_id = record.get(REGISTRATIONNO_PK);
1093
1094         //higherTaxon
1095         String higherTaxaString = record.get(HIGHERTAXON);
1096         boolean isFossil = false;
1097         if(higherTaxaString.startsWith("FOSSIL ")){
1098             higherTaxaString = higherTaxaString.replace("FOSSIL ", "");
1099             isFossil = true;
1100         }
1101         TaxonNode higherTaxon = getHigherTaxon(higherTaxaString, (IAPTImportState)state);
1102
1103        //Taxon
1104         Taxon taxon = makeTaxon(record, state, higherTaxon, isFossil);
1105         if (taxon == null){
1106             logger.warn(lineNumber + "taxon could not be created and is null");
1107             return;
1108         }
1109         ((IAPTImportState)state).setCurrentTaxon(taxon);
1110
1111         // Registration
1112         IAPTRegData regData = makeIAPTRegData(state);
1113         ObjectMapper mapper = new ObjectMapper();
1114         try {
1115             String regdataJson = mapper.writeValueAsString(regData);
1116             Extension.NewInstance(taxon.getName(), regdataJson, getExtensionTypeIAPTRegData());
1117             getNameService().save(taxon.getName());
1118         } catch (JsonProcessingException e) {
1119             logger.error("Error on converting IAPTRegData", e);
1120         }
1121
1122         logger.info("#of imported Genera: " + ((IAPTImportState) state).getGenusTaxonMap().size());
1123                 return;
1124     }
1125
1126     private ExtensionType getExtensionTypeIAPTRegData() {
1127         if(extensionTypeIAPTRegData == null){
1128             extensionTypeIAPTRegData = ExtensionType.NewInstance("IAPTRegData.json", "IAPTRegData.json", "");
1129             getTermService().save(extensionTypeIAPTRegData);
1130         }
1131         return extensionTypeIAPTRegData;
1132     }
1133
1134     private IAPTRegData makeIAPTRegData(SimpleExcelTaxonImportState<CONFIG> state) {
1135
1136         HashMap<String, String> record = state.getOriginalRecord();
1137         String registrationStr = getValue(record, REGISTRATION);
1138         String regDateStr = getValue(record, REGDATE);
1139         String regStr = getValue(record, REGISTRATION, true);
1140
1141         String dateStr = null;
1142         String office = null;
1143         Integer regID = null;
1144         Integer formNo = null;
1145
1146         Matcher m = registrationPattern.matcher(registrationStr);
1147         if(m.matches()){
1148             dateStr = m.group("regdate");
1149             if(parseDate( regStr, dateStr) == null){
1150                 // check for valid dates
1151                 logger.warn(csvReportLine(regStr, REGISTRATION + ": could not parse date", dateStr, " in ", registrationStr));
1152             };
1153             office = m.group("office");
1154             regID = Integer.valueOf(m.group("regid"));
1155             try {
1156                 formNo = Integer.valueOf(m.group("formNo"));
1157             } catch(IllegalArgumentException e){
1158                 // ignore
1159             }
1160         } else {
1161             logger.warn(csvReportLine(regStr, REGISTRATION + ": could not be parsed", registrationStr));
1162         }
1163         IAPTRegData regData = new IAPTRegData(dateStr, office, regID, formNo);
1164         return regData;
1165     }
1166
1167     private TaxonNode getHigherTaxon(String higherTaxaString, IAPTImportState state) {
1168         String[] higherTaxaNames = higherTaxaString.toLowerCase().replaceAll("[\\[\\]]", "").split(":");
1169         TaxonNode higherTaxonNode = null;
1170
1171         ITaxonTreeNode rootNode = getClassificationRootNode(state);
1172         for (String htn :  higherTaxaNames) {
1173             htn = StringUtils.capitalize(htn.trim());
1174             Taxon higherTaxon = state.getHigherTaxon(htn);
1175             if (higherTaxon != null){
1176                 higherTaxonNode = higherTaxon.getTaxonNodes().iterator().next();
1177             }else{
1178                 BotanicalName name = makeHigherTaxonName(state, htn);
1179                 Reference sec = state.getSecReference();
1180                 higherTaxon = Taxon.NewInstance(name, sec);
1181                 getTaxonService().save(higherTaxon);
1182                 higherTaxonNode = rootNode.addChildTaxon(higherTaxon, sec, null);
1183                 state.putHigherTaxon(htn, higherTaxon);
1184                 getClassificationService().saveTreeNode(higherTaxonNode);
1185             }
1186             rootNode = higherTaxonNode;
1187         }
1188         return higherTaxonNode;
1189     }
1190
1191     private BotanicalName makeHigherTaxonName(IAPTImportState state, String name) {
1192
1193         Rank rank = guessRank(name);
1194
1195         BotanicalName taxonName = BotanicalName.NewInstance(rank);
1196         taxonName.addSource(makeOriginalSource(state));
1197         taxonName.setGenusOrUninomial(StringUtils.capitalize(name));
1198         return taxonName;
1199     }
1200
1201     private Rank guessRank(String name) {
1202
1203         // normalize
1204         name = name.replaceAll("\\(.*\\)", "").trim();
1205
1206         if(name.matches("^Plantae$|^Fungi$")){
1207            return Rank.KINGDOM();
1208         } else if(name.matches("^Incertae sedis$|^No group assigned$")){
1209            return rankFamilyIncertisSedis();
1210         } else if(name.matches(".*phyta$|.*mycota$")){
1211            return Rank.PHYLUM();
1212         } else if(name.matches(".*phytina$|.*mycotina$")){
1213            return Rank.SUBPHYLUM();
1214         } else if(name.matches("Gymnospermae$|.*ones$")){ // Monocotyledones, Dicotyledones
1215             return rankUnrankedSupraGeneric();
1216         } else if(name.matches(".*opsida$|.*phyceae$|.*mycetes$|.*ones$|^Musci$|^Hepaticae$")){
1217            return Rank.CLASS();
1218         } else if(name.matches(".*idae$|.*phycidae$|.*mycetidae$")){
1219            return Rank.SUBCLASS();
1220         } else if(name.matches(".*ales$")){
1221            return Rank.ORDER();
1222         } else if(name.matches(".*ineae$")){
1223            return Rank.SUBORDER();
1224         } else if(name.matches(".*aceae$")){
1225             return Rank.FAMILY();
1226         } else if(name.matches(".*oideae$")){
1227            return Rank.SUBFAMILY();
1228         } else
1229         //    if(name.matches(".*eae$")){
1230         //    return Rank.TRIBE();
1231         // } else
1232             if(name.matches(".*inae$")){
1233            return Rank.SUBTRIBE();
1234         } else if(name.matches(".*ae$")){
1235            return Rank.FAMILY();
1236         }
1237         return Rank.UNKNOWN_RANK();
1238     }
1239
1240     private Rank rankUnrankedSupraGeneric() {
1241
1242         if(rankUnrankedSupraGeneric == null){
1243             rankUnrankedSupraGeneric = Rank.NewInstance(RankClass.Suprageneric, "Unranked supra generic", " ", " ");
1244             getTermService().save(rankUnrankedSupraGeneric);
1245         }
1246         return rankUnrankedSupraGeneric;
1247     }
1248
1249     private Rank rankFamilyIncertisSedis() {
1250
1251         if(familyIncertisSedis == null){
1252             familyIncertisSedis = Rank.NewInstance(RankClass.Suprageneric, "Family incertis sedis", " ", " ");
1253             getTermService().save(familyIncertisSedis);
1254         }
1255         return familyIncertisSedis;
1256     }
1257
1258     private AnnotationType annotationTypeCaveats(){
1259         if(annotationTypeCaveats == null){
1260             annotationTypeCaveats = AnnotationType.NewInstance("Caveats", "Caveats", "");
1261             getTermService().save(annotationTypeCaveats);
1262         }
1263         return annotationTypeCaveats;
1264     }
1265
1266
1267     /**
1268      * @param state
1269      * @return
1270      */
1271     private IdentifiableSource makeOriginalSource(IAPTImportState state) {
1272         return IdentifiableSource.NewDataImportInstance("line: " + state.getCurrentLine(), null, state.getConfig().getSourceReference());
1273     }
1274
1275
1276     private Reference makeReference(IAPTImportState state, UUID uuidRef) {
1277         Reference ref = state.getReference(uuidRef);
1278         if (ref == null){
1279             ref = getReferenceService().find(uuidRef);
1280             state.putReference(uuidRef, ref);
1281         }
1282         return ref;
1283     }
1284
1285     private MarkerType markerTypeFossil(){
1286         if(this.markerTypeFossil == null){
1287             markerTypeFossil = MarkerType.NewInstance("isFossilTaxon", "isFossil", null);
1288             getTermService().save(this.markerTypeFossil);
1289         }
1290         return markerTypeFossil;
1291     }
1292
1293     private String csvReportLine(String regId, String message, String ... fields){
1294         StringBuilder out = new StringBuilder("regID#");
1295         out.append(regId).append(",\"").append(message).append('"');
1296
1297         for(String f : fields){
1298             out.append(",\"").append(f).append('"');
1299         }
1300         return out.toString();
1301     }
1302
1303
1304 }