app-import/src/main/java/eu/etaxonomy/cdm/io/iapt/IAPTExcelImport.java

   1 /**
   2  * Copyright (C) 2007 EDIT
   3  * European Distributed Institute of Taxonomy
   4  * http://www.e-taxonomy.eu
   5  *
   6  * The contents of this file are subject to the Mozilla Public License Version 1.1
   7  * See LICENSE.TXT at the top of this package for the full license terms.
   8  */
   9
  10 package eu.etaxonomy.cdm.io.iapt;
  11
  12 import eu.etaxonomy.cdm.api.facade.DerivedUnitFacade;
  13 import eu.etaxonomy.cdm.api.service.pager.Pager;
  14 import eu.etaxonomy.cdm.common.CdmUtils;
  15 import eu.etaxonomy.cdm.io.mexico.SimpleExcelTaxonImport;
  16 import eu.etaxonomy.cdm.io.mexico.SimpleExcelTaxonImportState;
  17 import eu.etaxonomy.cdm.model.agent.Institution;
  18 import eu.etaxonomy.cdm.model.agent.Person;
  19 import eu.etaxonomy.cdm.model.agent.TeamOrPersonBase;
  20 import eu.etaxonomy.cdm.model.common.*;
  21 import eu.etaxonomy.cdm.model.name.*;
  22 import eu.etaxonomy.cdm.model.occurrence.*;
  23 import eu.etaxonomy.cdm.model.occurrence.Collection;
  24 import eu.etaxonomy.cdm.model.reference.Reference;
  25 import eu.etaxonomy.cdm.model.reference.ReferenceFactory;
  26 import eu.etaxonomy.cdm.model.reference.ReferenceType;
  27 import eu.etaxonomy.cdm.model.taxon.*;
  28 import eu.etaxonomy.cdm.strategy.parser.NonViralNameParserImpl;
  29 import eu.etaxonomy.cdm.strategy.parser.ParserProblem;
  30 import org.apache.commons.lang.ArrayUtils;
  31 import org.apache.commons.lang.StringEscapeUtils;
  32 import org.apache.commons.lang.StringUtils;
  33 import org.apache.log4j.Level;
  34 import org.apache.log4j.Logger;
  35 import org.joda.time.DateTimeFieldType;
  36 import org.joda.time.Partial;
  37 import org.joda.time.format.DateTimeFormat;
  38 import org.joda.time.format.DateTimeFormatter;
  39 import org.springframework.stereotype.Component;
  40
  41 import java.util.*;
  42 import java.util.regex.Matcher;
  43 import java.util.regex.Pattern;
  44
  45 /**
  46  * @author a.mueller
  47  * @created 05.01.2016
  48  */
  49
  50 @Component("iAPTExcelImport")
  51 public class IAPTExcelImport<CONFIG extends IAPTImportConfigurator> extends SimpleExcelTaxonImport<CONFIG> {
  52     private static final long serialVersionUID = -747486709409732371L;
  53     private static final Logger logger = Logger.getLogger(IAPTExcelImport.class);
  54     public static final String ANNOTATION_MARKER_STRING = "[*]";
  55
  56
  57     private static UUID ROOT_UUID = UUID.fromString("4137fd2a-20f6-4e70-80b9-f296daf51d82");
  58
  59     private static NonViralNameParserImpl nameParser = NonViralNameParserImpl.NewInstance();
  60
  61     private final static String REGISTRATIONNO_PK= "RegistrationNo_Pk";
  62     private final static String HIGHERTAXON= "HigherTaxon";
  63     private final static String FULLNAME= "FullName";
  64     private final static String AUTHORSSPELLING= "AuthorsSpelling";
  65     private final static String LITSTRING= "LitString";
  66     private final static String REGISTRATION= "Registration";
  67     private final static String TYPE= "Type";
  68     private final static String CAVEATS= "Caveats";
  69     private final static String FULLBASIONYM= "FullBasionym";
  70     private final static String FULLSYNSUBST= "FullSynSubst";
  71     private final static String NOTESTXT= "NotesTxt";
  72     private final static String REGDATE= "RegDate";
  73     private final static String NAMESTRING= "NameString";
  74     private final static String BASIONYMSTRING= "BasionymString";
  75     private final static String SYNSUBSTSTR= "SynSubstStr";
  76     private final static String AUTHORSTRING= "AuthorString";
  77
  78     private  static List<String> expectedKeys= Arrays.asList(new String[]{
  79             REGISTRATIONNO_PK, HIGHERTAXON, FULLNAME, AUTHORSSPELLING, LITSTRING, REGISTRATION, TYPE, CAVEATS, FULLBASIONYM, FULLSYNSUBST, NOTESTXT, REGDATE, NAMESTRING, BASIONYMSTRING, SYNSUBSTSTR, AUTHORSTRING});
  80
  81     private static final Pattern nomRefTokenizeP = Pattern.compile("^(?<title>.*):\\s(?<detail>[^\\.:]+)\\.(?<date>.*?)(?:\\s\\((?<issue>[^\\)]*)\\)\\s*)?\\.?$");
  82     private static final Pattern[] datePatterns = new Pattern[]{
  83             // NOTE:
  84             // The order of the patterns is extremely important!!!
  85             //
  86             // all patterns cover the years 1700 - 1999
  87             Pattern.compile("^(?<year>1[7,8,9][0-9]{2})$"), // only year, like '1969'
  88             Pattern.compile("^(?<monthName>\\p{L}+\\.?)\\s(?<day>[0-9]{1,2})(?:st|rd|th)?\\.?,?\\s(?<year>(?:1[7,8,9])?[0-9]{2})$"), // full date like April 12, 1969 or april 12th 1999
  89             Pattern.compile("^(?<monthName>\\p{L}+\\.?),?\\s?(?<year>(?:1[7,8,9])?[0-9]{2})$"), // April 99 or April, 1999 or Apr. 12
  90             Pattern.compile("^(?<day>[0-9]{1,2})([\\.\\-/])(\\s?)(?<month>[0-1]?[0-9])\\2\\3(?<year>(?:1[7,8,9])?[0-9]{2})$"), // full date like 12.04.1969 or 12. 04. 1969 or 12/04/1969 or 12-04-1969
  91             Pattern.compile("^(?<day>[0-9]{1,2})([\\.\\-/])(?<month>[IVX]{1,2})\\2(?<year>(?:1[7,8,9])?[0-9]{2})$"), // full date like 12-VI-1969
  92             Pattern.compile("^(?:(?<day>[0-9]{1,2})(?:\\sde)\\s)(?<monthName>\\p{L}+)\\sde\\s(?<year>(?:1[7,8,9])?[0-9]{2})$"), // full and partial date like 12 de Enero de 1999 or Enero de 1999
  93             Pattern.compile("^(?<month>[0-1]?[0-9])([\\.\\-/])(?<year>(?:1[7,8,9])?[0-9]{2})$"), // partial date like 04.1969 or 04/1969 or 04-1969
  94             Pattern.compile("^(?<year>(?:1[7,8,9])?[0-9]{2})([\\.\\-/])(?<month>[0-1]?[0-9])$"),//  partial date like 1999-04
  95             Pattern.compile("^(?<month>[IVX]{1,2})([\\.\\-/])(?<year>(?:1[7,8,9])?[0-9]{2})$"), // partial date like VI-1969
  96             Pattern.compile("^(?<day>[0-9]{1,2})(?:[\\./]|th|rd|st)?\\s(?<monthName>\\p{L}+\\.?),?\\s?(?<year>(?:1[7,8,9])?[0-9]{2})$"), // full date like 12. April 1969 or april 1999 or 22 Dec.1999
  97         };
  98     private static final Pattern typeSpecimenSplitPattern =  Pattern.compile("^(?:\"*[Tt]ype: (?<fieldUnit>.*?))(?:[Hh]olotype:(?<holotype>.*?)\\.?)?(?:[Ii]sotype[^:]*:(?<isotype>.*)\\.?)?\\.?$");
  99
 100     private static final Pattern typeNameBasionymPattern =  Pattern.compile("\\([Bb]asionym\\s?\\:\\s?(?<basionymName>[^\\)]*).*$");
 101     private static final Pattern typeNameNotePattern =  Pattern.compile("\\[([^\\[]*)"); // matches the inner of '[...]'
 102     private static final Pattern typeNameSpecialSplitPattern =  Pattern.compile("(?<note>.*\\;.*?)\\:(?<agent>)\\;(<name>.*)");
 103
 104     private static final Pattern collectorPattern =  Pattern.compile(".*?(?<fullStr1>\\(leg\\.\\s+(?<data1>[^\\)]*)\\))|.*?(?<fullStr2>\\sleg\\.\\s+(?<data2>.*?)\\.?)$");
 105     private static final Pattern collectionDataPattern =  Pattern.compile("^(?<collector>[^,]*),\\s?(?<detail>.*?)\\.?$");
 106     private static final Pattern collectorsNumber =  Pattern.compile("^([nN]o\\.\\s.*)$");
 107
 108     // AccessionNumbers: , #.*, n°:?, 96/3293, No..*, -?\w{1,3}-[0-9\-/]*
 109     private static final Pattern accessionNumberOnlyPattern = Pattern.compile("^(?<accNumber>(?:n°\\:?\\s?|#|No\\.?\\s?)?[\\d\\w\\-/]*)$");
 110
 111     private static final Pattern[] specimenTypePatterns = new Pattern[]{
 112             Pattern.compile("^(?<colCode>[A-Z]+|CPC Micropaleontology Lab\\.?)\\s+(?:\\((?<institute>.*[^\\)])\\))(?<accNumber>.*)?$"), // like: GAUF (Gansu Agricultural University) No. 1207-1222
 113             Pattern.compile("^(?<colCode>[A-Z]+|CPC Micropaleontology Lab\\.?)\\s+(?:Coll\\.\\s(?<subCollection>[^\\.,;]*)(.))(?<accNumber>.*)?$"), // like KASSEL Coll. Krasske, Praep. DII 78
 114             Pattern.compile("^(?<Collection>:Coll\\.\\s.*?)\\s(?<accNumber>Praep\\..*)?$"), // like Coll. Lange-Bertalot, Bot. Inst., Univ. Frankfurt/Main, Germany Praep. Neukaledonien OTL 62
 115             Pattern.compile("^(?<colCode>[A-Z]+)(?:\\s+(?<accNumber>.*))?$"), // identifies the Collection code and takes the rest as accessionNumber if any
 116     };
 117
 118     private static Map<String, Integer> monthFromNameMap = new HashMap<>();
 119
 120     static {
 121         String[] ck = new String[]{"leden", "únor", "březen", "duben", "květen", "červen", "červenec ", "srpen", "září", "říjen", "listopad", "prosinec"};
 122         String[] fr = new String[]{"janvier", "février", "mars", "avril", "mai", "juin", "juillet", "août", "septembre", "octobre", "novembre", "décembre"};
 123         String[] de = new String[]{"januar", "februar", "märz", "april", "mai", "juni", "juli", "august", "september", "oktober", "november", "dezember"};
 124         String[] en = new String[]{"january", "february", "march", "april", "may", "june", "july", "august", "september", "october", "november", "december"};
 125         String[] it = new String[]{"gennaio", "febbraio", "marzo", "aprile", "maggio", "giugno", "luglio", "agosto", "settembre", "ottobre", "novembre", "dicembre"};
 126         String[] sp = new String[]{"enero", "febrero", "marzo", "abril", "mayo", "junio", "julio", "agosto", "septiembre", "octubre", "noviembre", "diciembre"};
 127         String[] de_abbrev = new String[]{"jan.", "feb.", "märz", "apr.", "mai", "jun.", "jul.", "aug.", "sept.", "okt.", "nov.", "dez."};
 128         String[] en_abbrev = new String[]{"jan.", "feb.", "mar.", "apr.", "may", "jun.", "jul.", "aug.", "sep.", "oct.", "nov.", "dec."};
 129         String[] port = new String[]{"Janeiro", "Fevereiro", "Março", "Abril", "Maio", "Junho", "Julho", "Agosto", "Setembro", "Outubro", "Novembro", "Dezembro"};
 130         String[] rom_num = new String[]{"i", "ii", "iii", "iv", "v", "vi", "vii", "viii", "ix", "x", "xi", "xii"};
 131
 132         String[][] perLang =  new String[][]{ck, de, fr, en, it, sp, port, de_abbrev, en_abbrev, rom_num};
 133
 134         for (String[] months: perLang) {
 135             for(int m = 1; m < 13; m++){
 136                 monthFromNameMap.put(months[m - 1].toLowerCase(), m);
 137             }
 138         }
 139
 140         // special cases
 141         monthFromNameMap.put("mar", 3);
 142         monthFromNameMap.put("dec", 12);
 143         monthFromNameMap.put("Februari", 2);
 144     }
 145
 146
    // Formats a date as four digit year only; makeTaxon uses it to rebuild the nomenclatural reference string.
    DateTimeFormatter formatterYear = DateTimeFormat.forPattern("yyyy");

    // Collections keyed by a string; the code filling and reading this map is outside this section
    // (presumably a cache by collection code - TODO confirm).
    private Map<String, Collection> collectionMap = new HashMap<>();
 150
 151
 152     enum TypesName {
 153         fieldUnit, holotype, isotype;
 154
 155         public SpecimenTypeDesignationStatus status(){
 156             switch (this) {
 157                 case holotype:
 158                     return SpecimenTypeDesignationStatus.HOLOTYPE();
 159                 case isotype:
 160                     return SpecimenTypeDesignationStatus.ISOTYPE();
 161                 default:
 162                     return null;
 163             }
 164         }
 165     }
 166
    // Term instances held per import instance, initially unset.
    // NOTE(review): presumably filled lazily by the equally named accessor methods
    // (e.g. markerTypeFossil(), annotationTypeCaveats()) which are outside this section - confirm.
    private MarkerType markerTypeFossil = null;
    private Rank rankUnrankedSupraGeneric = null;
    private Rank familyIncertisSedis = null;
    private AnnotationType annotationTypeCaveats = null;

    // The book "Las variedades tradicionales de frutales ..."; created and saved by makeTaxon
    // the first time a record cites it and reused for all further records.
    private Reference bookVariedadesTradicionales = null;
 173
 174     private Taxon makeTaxon(HashMap<String, String> record, SimpleExcelTaxonImportState<CONFIG> state,
 175                             TaxonNode higherTaxonNode, boolean isFossil) {
 176
 177         String regNumber = getValue(record, REGISTRATIONNO_PK, false);
 178         String regStr = getValue(record, REGISTRATION, true);
 179         String titleCacheStr = getValue(record, FULLNAME, true);
 180         String nameStr = getValue(record, NAMESTRING, true);
 181         String authorStr = getValue(record, AUTHORSTRING, true);
 182         String nomRefStr = getValue(record, LITSTRING, true);
 183         String authorsSpelling = getValue(record, AUTHORSSPELLING, true);
 184         String notesTxt = getValue(record, NOTESTXT, true);
 185         String caveats = getValue(record, CAVEATS, true);
 186         String fullSynSubstStr = getValue(record, FULLSYNSUBST, true);
 187         String fullBasionymStr = getValue(record, FULLBASIONYM, true);
 188         String basionymNameStr = getValue(record, FULLBASIONYM, true);
 189         String synSubstStr = getValue(record, SYNSUBSTSTR, true);
 190         String typeStr = getValue(record, TYPE, true);
 191
 192
 193         String nomRefTitle = null;
 194         String nomRefDetail;
 195         String nomRefPupDate = null;
 196         String nomRefIssue = null;
 197         Partial pupDate = null;
 198
 199         boolean restoreOriginalReference = false;
 200         boolean nameIsValid = true;
 201
 202         // preprocess nomRef: separate citation, reference detail, publishing date
 203         if(!StringUtils.isEmpty(nomRefStr)){
 204             nomRefStr = nomRefStr.trim();
 205
 206             // handle the special case which is hard to parse:
 207             //
 208             // Las variedades tradicionales de frutales de la Cuenca del Río Segura. Catálogo Etnobotánico (1): Frutos secos, oleaginosos, frutales de hueso, almendros y frutales de pepita: 154. 1997.
 209             if(nomRefStr.startsWith("Las variedades tradicionales de frutales ")){
 210
 211                 if(bookVariedadesTradicionales == null){
 212                     bookVariedadesTradicionales = ReferenceFactory.newBook();
 213                     bookVariedadesTradicionales.setTitle("Las variedades tradicionales de frutales de la Cuenca del Río Segura. Catálogo Etnobotánico (1): Frutos secos, oleaginosos, frutales de hueso, almendros y frutales de pepita");
 214                     bookVariedadesTradicionales.setDatePublished(TimePeriod.NewInstance(1997));
 215                     getReferenceService().save(bookVariedadesTradicionales);
 216                 }
 217                 nomRefStr = nomRefStr.replaceAll("^.*?\\:.*?\\:", "Las variedades tradicionales:");
 218                 restoreOriginalReference = true;
 219             }
 220
 221             Matcher m = nomRefTokenizeP.matcher(nomRefStr);
 222             if(m.matches()){
 223                 nomRefTitle = m.group("title");
 224                 nomRefDetail = m.group("detail");
 225                 nomRefPupDate = m.group("date").trim();
 226                 nomRefIssue = m.group("issue");
 227
 228                 pupDate = parseDate(regNumber, nomRefPupDate);
 229                 if (pupDate != null) {
 230                     nomRefTitle = nomRefTitle + ": " + nomRefDetail + ". " + pupDate.toString(formatterYear) + ".";
 231                 } else {
 232                     logger.warn(csvReportLine(regNumber, "Pub date", nomRefPupDate, "in", nomRefStr, "not parsable"));
 233                 }
 234             } else {
 235                 nomRefTitle = nomRefStr;
 236             }
 237         }
 238
 239         BotanicalName taxonName = makeBotanicalName(state, regNumber, titleCacheStr, nameStr, authorStr, nomRefTitle);
 240
 241         // always add the original strings of parsed data as annotation
 242         taxonName.addAnnotation(Annotation.NewInstance("imported and parsed data strings:" +
 243                         "\n -  '" + LITSTRING + "': "+ nomRefStr +
 244                         "\n -  '" + TYPE + "': " + typeStr +
 245                         "\n -  '" + REGISTRATION  + "': " + regStr
 246                 , AnnotationType.TECHNICAL(), Language.DEFAULT()));
 247
 248         if(restoreOriginalReference){
 249             taxonName.setNomenclaturalReference(bookVariedadesTradicionales);
 250         }
 251         if(pupDate != null) {
 252             taxonName.getNomenclaturalReference().setDatePublished(TimePeriod.NewInstance(pupDate));
 253         }
 254         if(nomRefIssue != null) {
 255             ((Reference)taxonName.getNomenclaturalReference()).setVolume(nomRefIssue);
 256         }
 257
 258
 259         if(!StringUtils.isEmpty(notesTxt)){
 260             notesTxt = notesTxt.replace("Notes: ", "").trim();
 261             taxonName.addAnnotation(Annotation.NewInstance(notesTxt, AnnotationType.EDITORIAL(), Language.DEFAULT()));
 262             nameIsValid = false;
 263
 264         }
 265         if(!StringUtils.isEmpty(caveats)){
 266             caveats = caveats.replace("Caveats: ", "").trim();
 267             taxonName.addAnnotation(Annotation.NewInstance(caveats, annotationTypeCaveats(), Language.DEFAULT()));
 268             nameIsValid = false;
 269         }
 270
 271         if(nameIsValid){
 272             // Status is always considered valid if no notes and cavets are set
 273             taxonName.addStatus(NomenclaturalStatus.NewInstance(NomenclaturalStatusType.VALID()));
 274         }
 275
 276         getNameService().save(taxonName);
 277
 278         // Namerelations
 279         if(!StringUtils.isEmpty(authorsSpelling)){
 280             authorsSpelling = authorsSpelling.replaceFirst("Author's spelling:", "").replaceAll("\"", "").trim();
 281
 282             String[] authorSpellingTokens = StringUtils.split(authorsSpelling, " ");
 283             String[] nameStrTokens = StringUtils.split(nameStr, " ");
 284
 285             ArrayUtils.reverse(authorSpellingTokens);
 286             ArrayUtils.reverse(nameStrTokens);
 287
 288             for (int i = 0; i < nameStrTokens.length; i++){
 289                 if(i < authorSpellingTokens.length){
 290                     nameStrTokens[i] = authorSpellingTokens[i];
 291                 }
 292             }
 293             ArrayUtils.reverse(nameStrTokens);
 294
 295             String misspelledNameStr = StringUtils.join (nameStrTokens, ' ');
 296             // build the fullnameString of the misspelled name
 297             misspelledNameStr = taxonName.getTitleCache().replace(nameStr, misspelledNameStr);
 298
 299             TaxonNameBase misspelledName = (BotanicalName) nameParser.parseReferencedName(misspelledNameStr, NomenclaturalCode.ICNAFP, null);
 300             misspelledName.addRelationshipToName(taxonName, NameRelationshipType.MISSPELLING(), null);
 301             getNameService().save(misspelledName);
 302         }
 303
 304         // Replaced Synonyms
 305         if(!StringUtils.isEmpty(fullSynSubstStr)){
 306             fullSynSubstStr = fullSynSubstStr.replace("Syn. subst.: ", "");
 307             BotanicalName replacedSynonymName = makeBotanicalName(state, regNumber, fullSynSubstStr, synSubstStr, null, null);
 308             replacedSynonymName.addReplacedSynonym(taxonName, null, null, null);
 309             getNameService().save(replacedSynonymName);
 310         }
 311
 312         Reference sec = state.getConfig().getSecReference();
 313         Taxon taxon = Taxon.NewInstance(taxonName, sec);
 314
 315         // Basionym
 316         if(fullBasionymStr != null){
 317             fullBasionymStr = fullBasionymStr.replaceAll("^\\w*:\\s", ""); // Strip off the leading 'Basionym: "
 318             basionymNameStr = basionymNameStr.replaceAll("^\\w*:\\s", ""); // Strip off the leading 'Basionym: "
 319             BotanicalName basionym = makeBotanicalName(state, regNumber, fullBasionymStr, basionymNameStr, null, null);
 320             getNameService().save(basionym);
 321             taxonName.addBasionym(basionym);
 322
 323             Synonym syn = Synonym.NewInstance(basionym, sec);
 324             taxon.addSynonym(syn, SynonymRelationshipType.HOMOTYPIC_SYNONYM_OF());
 325             getTaxonService().save(syn);
 326         }
 327
 328         // Markers
 329         if(isFossil){
 330             taxon.addMarker(Marker.NewInstance(markerTypeFossil(), true));
 331         }
 332
 333         // Types
 334         if(!StringUtils.isEmpty(typeStr)){
 335
 336             if(taxonName.getRank().isSpecies() || taxonName.getRank().isLower(Rank.SPECIES())) {
 337                 makeSpecimenTypeData(typeStr, taxonName, regNumber, state);
 338             } else {
 339                 makeNameTypeData(typeStr, taxonName, regNumber, state);
 340             }
 341         }
 342
 343         getTaxonService().save(taxon);
 344
 345         if(taxonName.getRank().equals(Rank.SPECIES()) || taxonName.getRank().isLower(Rank.SPECIES())){
 346             // try to find the genus, it should have been imported already, Genera are coming first in the import file
 347             Taxon genus = ((IAPTImportState)state).getGenusTaxonMap().get(taxonName.getGenusOrUninomial());
 348             if(genus != null){
 349                 higherTaxonNode = genus.getTaxonNodes().iterator().next();
 350             } else {
 351                 logger.info(csvReportLine(regNumber, "Parent genus not found for", nameStr));
 352             }
 353         }
 354
 355         if(higherTaxonNode != null){
 356             higherTaxonNode.addChildTaxon(taxon, null, null);
 357             getTaxonNodeService().save(higherTaxonNode);
 358         }
 359
 360         if(taxonName.getRank().isGenus()){
 361             ((IAPTImportState)state).getGenusTaxonMap().put(taxonName.getGenusOrUninomial(), taxon);
 362         }
 363
 364         return taxon;
 365     }
 366
 367     private void makeSpecimenTypeData(String typeStr, BotanicalName taxonName, String regNumber, SimpleExcelTaxonImportState<CONFIG> state) {
 368
 369         Matcher m = typeSpecimenSplitPattern.matcher(typeStr);
 370
 371         if(m.matches()){
 372             String fieldUnitStr = m.group(TypesName.fieldUnit.name());
 373             // boolean isFieldUnit = typeStr.matches(".*([°']|\\d+\\s?m\\s|\\d+\\s?km\\s).*"); // check for location or unit m, km // makes no sense!!!!
 374             FieldUnit fieldUnit = parseFieldUnit(fieldUnitStr, regNumber, state);
 375             if(fieldUnit == null) {
 376                 // create a field unit with only a titleCache using the fieldUnitStr substring
 377                 logger.warn(csvReportLine(regNumber, "Type: fieldUnitStr can not be parsed", fieldUnitStr));
 378                 fieldUnit = FieldUnit.NewInstance();
 379                 fieldUnit.setTitleCache(fieldUnitStr, true);
 380                 getOccurrenceService().save(fieldUnit);
 381             }
 382             getOccurrenceService().save(fieldUnit);
 383
 384             // all others ..
 385             addSpecimenTypes(taxonName, fieldUnit, m.group(TypesName.holotype.name()), TypesName.holotype, false, regNumber);
 386             addSpecimenTypes(taxonName, fieldUnit, m.group(TypesName.isotype.name()), TypesName.isotype, true, regNumber);
 387
 388         } else {
 389             // create a field unit with only a titleCache using the full typeStr
 390             FieldUnit fieldUnit = FieldUnit.NewInstance();
 391             fieldUnit.setTitleCache(typeStr, true);
 392             getOccurrenceService().save(fieldUnit);
 393             logger.warn(csvReportLine(regNumber, "Type: field 'Type' can not be parsed", typeStr));
 394         }
 395         getNameService().save(taxonName);
 396     }
 397
 398     private void makeNameTypeData(String typeStr, BotanicalName taxonName, String regNumber, SimpleExcelTaxonImportState<CONFIG> state) {
 399
 400         String nameStr = typeStr.replaceAll("^Type\\s?\\:\\s?", "");
 401         if(nameStr.isEmpty()) {
 402             return;
 403         }
 404
 405         String basionymNameStr = null;
 406         String noteStr = null;
 407         String agentStr = null;
 408
 409         Matcher m;
 410
 411         if(typeStr.startsWith("not to be indicated")){
 412             // Special case:
 413             // Type: not to be indicated (Art. H.9.1. Tokyo Code); stated parent genera: Hechtia Klotzsch; Deuterocohnia Mez
 414             // FIXME
 415             m = typeNameSpecialSplitPattern.matcher(nameStr);
 416             if(m.matches()){
 417                 nameStr = m.group("name");
 418                 noteStr = m.group("note");
 419                 agentStr = m.group("agent");
 420                 // TODO better import of agent?
 421                 if(agentStr != null){
 422                     noteStr = noteStr + ": " + agentStr;
 423                 }
 424             }
 425         } else {
 426             // Generic case
 427             m = typeNameBasionymPattern.matcher(nameStr);
 428             if (m.find()) {
 429                 basionymNameStr = m.group("basionymName");
 430                 if (basionymNameStr != null) {
 431                     nameStr = nameStr.replace(m.group(0), "");
 432                 }
 433             }
 434
 435             m = typeNameNotePattern.matcher(nameStr);
 436             if (m.find()) {
 437                 noteStr = m.group(1);
 438                 if (noteStr != null) {
 439                     nameStr = nameStr.replace(m.group(0), "");
 440                 }
 441             }
 442         }
 443
 444         BotanicalName typeName = (BotanicalName) nameParser.parseFullName(nameStr, NomenclaturalCode.ICNAFP, null);
 445
 446         if(typeName.isProtectedTitleCache() || typeName.getNomenclaturalReference() != null && typeName.getNomenclaturalReference().isProtectedTitleCache()) {
 447             logger.warn(csvReportLine(regNumber, "NameType not parsable", typeStr, nameStr));
 448         }
 449
 450         if(basionymNameStr != null){
 451             BotanicalName basionymName = (BotanicalName) nameParser.parseFullName(nameStr, NomenclaturalCode.ICNAFP, null);
 452             getNameService().save(basionymName);
 453             typeName.addBasionym(basionymName);
 454         }
 455
 456
 457         NameTypeDesignation nameTypeDesignation = NameTypeDesignation.NewInstance();
 458         nameTypeDesignation.setTypeName(typeName);
 459         getNameService().save(typeName);
 460
 461         if(noteStr != null){
 462             nameTypeDesignation.addAnnotation(Annotation.NewInstance(noteStr, AnnotationType.EDITORIAL(), Language.UNKNOWN_LANGUAGE()));
 463         }
 464         taxonName.addNameTypeDesignation(typeName, null, null, null, null, false);
 465
 466     }
 467
 468     /**
 469      * Currently only parses the collector, fieldNumber and the collection date.
 470      *
 471      * @param fieldUnitStr
 472      * @param regNumber
 473      * @param state
 474      * @return null if the fieldUnitStr could not be parsed
 475      */
 476     private FieldUnit parseFieldUnit(String fieldUnitStr, String regNumber, SimpleExcelTaxonImportState<CONFIG> state) {
 477
 478         FieldUnit fieldUnit = null;
 479
 480         Matcher m1 = collectorPattern.matcher(fieldUnitStr);
 481         if(m1.matches()){
 482
 483             String collectorData = m1.group(2); // like (leg. Metzeltin, 30. 9. 1996)
 484             String removal = m1.group(1);
 485             if(collectorData == null){
 486                 collectorData = m1.group(4); // like leg. Metzeltin, 30. 9. 1996
 487                 removal = m1.group(3);
 488             }
 489             if(collectorData == null){
 490                 return null;
 491             }
 492
 493             // the fieldUnitStr is parsable
 494             // remove all collectorData from the fieldUnitStr and use the rest as locality
 495             String locality = fieldUnitStr.replace(removal, "");
 496
 497             String collectorStr = null;
 498             String detailStr = null;
 499             Partial date = null;
 500             String fieldNumber = null;
 501
 502             Matcher m2 = collectionDataPattern.matcher(collectorData);
 503             if(m2.matches()){
 504                 collectorStr = m2.group("collector");
 505                 detailStr = m2.group("detail");
 506
 507                 // Try to make sense of the detailStr
 508                 if(detailStr != null){
 509                     detailStr = detailStr.trim();
 510                     // 1. try to parse as date
 511                     date = parseDate(regNumber, detailStr);
 512                     if(date == null){
 513                         // 2. try to parse as number
 514                         if(collectorsNumber.matcher(detailStr).matches()){
 515                             fieldNumber = detailStr;
 516                         }
 517                     }
 518                 }
 519                 if(date == null && fieldNumber == null){
 520                     // detailed parsing not possible, so need fo fallback
 521                     collectorStr = collectorData;
 522                 }
 523             }
 524
 525             if(collectorStr == null) {
 526                 collectorStr = collectorData;
 527             }
 528
 529             fieldUnit = FieldUnit.NewInstance();
 530             GatheringEvent ge = GatheringEvent.NewInstance();
 531             ge.setLocality(LanguageString.NewInstance(locality, Language.UNKNOWN_LANGUAGE()));
 532
 533             TeamOrPersonBase agent =  state.getAgentBase(collectorStr);
 534             if(agent == null) {
 535                 agent = Person.NewTitledInstance(collectorStr);
 536                 getAgentService().save(agent);
 537                 state.putAgentBase(collectorStr, agent);
 538             }
 539             ge.setCollector(agent);
 540
 541             if(date != null){
 542                 ge.setGatheringDate(date);
 543             }
 544
 545             getEventBaseService().save(ge);
 546             fieldUnit.setGatheringEvent(ge);
 547
 548             if(fieldNumber != null) {
 549                 fieldUnit.setFieldNumber(fieldNumber);
 550             }
 551             getOccurrenceService().save(fieldUnit);
 552
 553         }
 554
 555         return fieldUnit;
 556     }
 557
 558     private Partial parseDate(String regNumber, String dateStr) {
 559
 560         Partial pupDate = null;
 561         boolean parseError = false;
 562
 563         String day = null;
 564         String month = null;
 565         String monthName = null;
 566         String year = null;
 567
 568         for(Pattern p : datePatterns){
 569             Matcher m2 = p.matcher(dateStr);
 570             if(m2.matches()){
 571                 try {
 572                     year = m2.group("year");
 573                 } catch (IllegalArgumentException e){
 574                     // named capture group not found
 575                 }
 576                 try {
 577                     month = m2.group("month");
 578                 } catch (IllegalArgumentException e){
 579                     // named capture group not found
 580                 }
 581
 582                 try {
 583                     monthName = m2.group("monthName");
 584                     month = monthFromName(monthName, regNumber);
 585                     if(month == null){
 586                         parseError = true;
 587                     }
 588                 } catch (IllegalArgumentException e){
 589                     // named capture group not found
 590                 }
 591                 try {
 592                     day = m2.group("day");
 593                 } catch (IllegalArgumentException e){
 594                     // named capture group not found
 595                 }
 596
 597                 if(year != null){
 598                     if (year.length() == 2) {
 599                         // it is an abbreviated year from the 19** years
 600                         year = "19" + year;
 601                     }
 602                     break;
 603                 } else {
 604                     parseError = true;
 605                 }
 606             }
 607         }
 608         if(year == null){
 609             parseError = true;
 610         }
 611         List<DateTimeFieldType> types = new ArrayList<>();
 612         List<Integer> values = new ArrayList<>();
 613         if(!parseError) {
 614             types.add(DateTimeFieldType.year());
 615             values.add(Integer.parseInt(year));
 616             if (month != null) {
 617                 types.add(DateTimeFieldType.monthOfYear());
 618                 values.add(Integer.parseInt(month));
 619             }
 620             if (day != null) {
 621                 types.add(DateTimeFieldType.dayOfMonth());
 622                 values.add(Integer.parseInt(day));
 623             }
 624             pupDate = new Partial(types.toArray(new DateTimeFieldType[types.size()]), ArrayUtils.toPrimitive(values.toArray(new Integer[values.size()])));
 625         }
 626         return pupDate;
 627     }
 628
 629     private String monthFromName(String monthName, String regNumber) {
 630
 631         Integer month = monthFromNameMap.get(monthName.toLowerCase());
 632         if(month == null){
 633             logger.warn(csvReportLine(regNumber, "Unknown month name", monthName));
 634             return null;
 635         } else {
 636             return month.toString();
 637         }
 638     }
 639
 640
 641     private void addSpecimenTypes(BotanicalName taxonName, FieldUnit fieldUnit, String typeStr, TypesName typeName, boolean multiple, String regNumber){
 642
 643         if(StringUtils.isEmpty(typeStr)){
 644             return;
 645         }
 646         typeStr = typeStr.trim().replaceAll("\\.$", "");
 647
 648         Collection collection = null;
 649         DerivedUnit specimen = null;
 650
 651         List<DerivedUnit> specimens = new ArrayList<>();
 652         if(multiple){
 653             String[] tokens = typeStr.split("\\s?,\\s?");
 654             for (String t : tokens) {
 655                 // command to  list all complex parsabel types:
 656                 // csvcut -t -c RegistrationNo_Pk,Type iapt.csv | csvgrep -c Type -m "Holotype" | egrep -o 'Holotype:\s([A-Z]*\s)[^.]*?'
 657                 // csvcut -t -c RegistrationNo_Pk,Type iapt.csv | csvgrep -c Type -m "Holotype" | egrep -o 'Isotype[^:]*:\s([A-Z]*\s)[^.]*?'
 658
 659                 if(!t.isEmpty()){
 660                     // trying to parse the string
 661                     specimen = parseSpecimenType(fieldUnit, typeName, collection, t, regNumber);
 662                     if(specimen != null){
 663                         specimens.add(specimen);
 664                     } else {
 665                         // parsing was not successful make simple specimen
 666                         specimens.add(makeSpecimenType(fieldUnit, t));
 667                     }
 668                 }
 669             }
 670         } else {
 671             specimen = parseSpecimenType(fieldUnit, typeName, collection, typeStr, regNumber);
 672             if(specimen != null) {
 673                 specimens.add(specimen);
 674                 // remember current collection
 675                 collection = specimen.getCollection();
 676             } else {
 677                 // parsing was not successful make simple specimen
 678                 specimens.add(makeSpecimenType(fieldUnit, typeStr));
 679             }
 680         }
 681
 682         for(DerivedUnit s : specimens){
 683             taxonName.addSpecimenTypeDesignation(s, typeName.status(), null, null, null, false, true);
 684        }
 685     }
 686
 687     private DerivedUnit makeSpecimenType(FieldUnit fieldUnit, String titleCache) {
 688         DerivedUnit specimen;DerivedUnitFacade facade = DerivedUnitFacade.NewInstance(SpecimenOrObservationType.PreservedSpecimen, fieldUnit);
 689         facade.setTitleCache(titleCache.trim(), true);
 690         specimen = facade.innerDerivedUnit();
 691         return specimen;
 692     }
 693
 694     /**
 695      *
 696      * @param fieldUnit
 697      * @param typeName
 698      * @param collection
 699      * @param text
 700      * @param regNumber
 701      * @return
 702      */
 703     private DerivedUnit parseSpecimenType(FieldUnit fieldUnit, TypesName typeName, Collection collection, String text, String regNumber) {
 704
 705         DerivedUnit specimen = null;
 706
 707         String collectionCode = null;
 708         String subCollectionStr = null;
 709         String instituteStr = null;
 710         String accessionNumber = null;
 711
 712         boolean unusualAccessionNumber = false;
 713
 714         text = text.trim();
 715
 716         // 1.  For Isotypes often the accession number is noted alone if the
 717         //     preceeding entry has a collection code.
 718         if(typeName .equals(TypesName.isotype) && collection != null){
 719             Matcher m = accessionNumberOnlyPattern.matcher(text);
 720             if(m.matches()){
 721                 try {
 722                     accessionNumber = m.group("accNumber");
 723                     specimen = makeSpecimenType(fieldUnit, collection, accessionNumber);
 724                 } catch (IllegalArgumentException e){
 725                     // match group acc_number not found
 726                 }
 727             }
 728         }
 729
 730         //2. try it the 'normal' way
 731         if(specimen == null) {
 732             for (Pattern p : specimenTypePatterns) {
 733                 Matcher m = p.matcher(text);
 734                 if (m.matches()) {
 735                     // collection code is mandatory
 736                     try {
 737                         collectionCode = m.group("colCode");
 738                     } catch (IllegalArgumentException e){
 739                         // match group colCode not found
 740                     }
 741                     try {
 742                         subCollectionStr = m.group("subCollection");
 743                     } catch (IllegalArgumentException e){
 744                         // match group subCollection not found
 745                     }
 746                     try {
 747                         instituteStr = m.group("institute");
 748                     } catch (IllegalArgumentException e){
 749                         // match group col_name not found
 750                     }
 751                     try {
 752                         accessionNumber = m.group("accNumber");
 753
 754                         // try to improve the accessionNumber
 755                         if(accessionNumber!= null) {
 756                             accessionNumber = accessionNumber.trim();
 757                             Matcher m2 = accessionNumberOnlyPattern.matcher(accessionNumber);
 758                             String betterAccessionNumber = null;
 759                             if (m2.matches()) {
 760                                 try {
 761                                     betterAccessionNumber = m.group("accNumber");
 762                                 } catch (IllegalArgumentException e) {
 763                                     // match group acc_number not found
 764                                 }
 765                             }
 766                             if (betterAccessionNumber != null) {
 767                                 accessionNumber = betterAccessionNumber;
 768                             } else {
 769                                 unusualAccessionNumber = true;
 770                             }
 771                         }
 772
 773                     } catch (IllegalArgumentException e){
 774                         // match group acc_number not found
 775                     }
 776
 777                     if(collectionCode == null && instituteStr == null){
 778                         logger.warn(csvReportLine(regNumber, "Type: neither 'collectionCode' nor 'institute' found in ", text));
 779                         continue;
 780                     }
 781                     collection = getCollection(collectionCode, instituteStr, subCollectionStr);
 782                     specimen = makeSpecimenType(fieldUnit, collection, accessionNumber);
 783                     break;
 784                 }
 785             }
 786         }
 787         if(specimen == null) {
 788             logger.warn(csvReportLine(regNumber, "Type: Could not parse specimen", typeName.name().toString(), text));
 789         }
 790         if(unusualAccessionNumber){
 791             logger.warn(csvReportLine(regNumber, "Type: Unusual accession number", typeName.name().toString(), text, accessionNumber));
 792         }
 793         return specimen;
 794     }
 795
 796     private DerivedUnit makeSpecimenType(FieldUnit fieldUnit, Collection collection, String accessionNumber) {
 797
 798         DerivedUnitFacade facade = DerivedUnitFacade.NewInstance(SpecimenOrObservationType.PreservedSpecimen, fieldUnit);
 799         facade.setCollection(collection);
 800         if(accessionNumber != null){
 801             facade.setAccessionNumber(accessionNumber);
 802         }
 803         return facade.innerDerivedUnit();
 804     }
 805
 806     private BotanicalName makeBotanicalName(SimpleExcelTaxonImportState<CONFIG> state, String regNumber, String titleCacheStr, String nameStr,
 807                                             String authorStr, String nomRefTitle) {
 808
 809         BotanicalName taxonName;// cache field for the taxonName.titleCache
 810         String taxonNameTitleCache = null;
 811         Map<String, AnnotationType> nameAnnotations = new HashMap<>();
 812
 813         // TitleCache preprocessing
 814         if(titleCacheStr.endsWith(ANNOTATION_MARKER_STRING) || (authorStr != null && authorStr.endsWith(ANNOTATION_MARKER_STRING))){
 815             nameAnnotations.put("Author abbreviation not checked.", AnnotationType.EDITORIAL());
 816             titleCacheStr = titleCacheStr.replace(ANNOTATION_MARKER_STRING, "").trim();
 817             if(authorStr != null) {
 818                 authorStr = authorStr.replace(ANNOTATION_MARKER_STRING, "").trim();
 819             }
 820         }
 821
 822         // parse the full taxon name
 823         if(!StringUtils.isEmpty(nomRefTitle)){
 824             String referenceSeparator = nomRefTitle.startsWith("in ") ? " " : ", ";
 825             String taxonFullNameStr = titleCacheStr + referenceSeparator + nomRefTitle;
 826             logger.debug(":::::" + taxonFullNameStr);
 827             taxonName = (BotanicalName) nameParser.parseReferencedName(taxonFullNameStr, NomenclaturalCode.ICNAFP, null);
 828         } else {
 829             taxonName = (BotanicalName) nameParser.parseFullName(titleCacheStr, NomenclaturalCode.ICNAFP, null);
 830         }
 831
 832         taxonNameTitleCache = taxonName.getTitleCache().trim();
 833         if (taxonName.isProtectedTitleCache()) {
 834             logger.warn(csvReportLine(regNumber, "Name could not be parsed", titleCacheStr));
 835         } else {
 836
 837             boolean doRestoreTitleCacheStr = false;
 838
 839             // Check if titleCache and nameCache are plausible
 840             String titleCacheCompareStr = titleCacheStr;
 841             String nameCache = taxonName.getNameCache();
 842             String nameCompareStr = nameStr;
 843             if(taxonName.isBinomHybrid()){
 844                 titleCacheCompareStr = titleCacheCompareStr.replace(" x ", " ×");
 845                 nameCompareStr = nameCompareStr.replace(" x ", " ×");
 846             }
 847             if(taxonName.isMonomHybrid()){
 848                 titleCacheCompareStr = titleCacheCompareStr.replaceAll("^X ", "× ");
 849                 nameCompareStr = nameCompareStr.replace("^X ", "× ");
 850             }
 851             if(authorStr != null && authorStr.contains(" et ")){
 852                 titleCacheCompareStr = titleCacheCompareStr.replaceAll(" et ", " & ");
 853             }
 854             if (!taxonNameTitleCache.equals(titleCacheCompareStr)) {
 855                 logger.warn(csvReportLine(regNumber, "The generated titleCache differs from the imported string", taxonNameTitleCache, " != ", titleCacheStr, " ==> original titleCacheStr has been restored"));
 856                 doRestoreTitleCacheStr = true;
 857             }
 858             if (!nameCache.trim().equals(nameCompareStr)) {
 859                 logger.warn(csvReportLine(regNumber, "The parsed nameCache differs from field '" + NAMESTRING + "'", nameCache, " != ", nameCompareStr));
 860             }
 861
 862             //  Author
 863             //nameParser.handleAuthors(taxonName, titleCacheStr, authorStr);
 864             //if (!titleCacheStr.equals(taxonName.getTitleCache())) {
 865             //    logger.warn(regNumber + ": titleCache has changed after setting authors, will restore original titleCacheStr");
 866             //    doRestoreTitleCacheStr = true;
 867             //}
 868
 869             if(doRestoreTitleCacheStr){
 870                 taxonName.setTitleCache(titleCacheStr, true);
 871             }
 872
 873             // deduplicate
 874             replaceAuthorNamesAndNomRef(state, taxonName);
 875         }
 876
 877         // Annotations
 878         if(!nameAnnotations.isEmpty()){
 879             for(String text : nameAnnotations.keySet()){
 880                 taxonName.addAnnotation(Annotation.NewInstance(text, nameAnnotations.get(text), Language.DEFAULT()));
 881             }
 882             getNameService().save(taxonName);
 883         }
 884         return taxonName;
 885     }
 886
 887     /**
 888      * @param state
 889      * @return
 890      */
 891     private TaxonNode getClassificationRootNode(IAPTImportState state) {
 892
 893      //   Classification classification = state.getClassification();
 894      //   if (classification == null){
 895      //       IAPTImportConfigurator config = state.getConfig();
 896      //       classification = Classification.NewInstance(state.getConfig().getClassificationName());
 897      //       classification.setUuid(config.getClassificationUuid());
 898      //       classification.setReference(config.getSecReference());
 899      //       classification = getClassificationService().find(state.getConfig().getClassificationUuid());
 900      //   }
 901         TaxonNode rootNode = state.getRootNode();
 902         if (rootNode == null){
 903             rootNode = getTaxonNodeService().find(ROOT_UUID);
 904         }
 905         if (rootNode == null){
 906             Classification classification = state.getClassification();
 907             if (classification == null){
 908                 Reference sec = state.getSecReference();
 909                 String classificationName = state.getConfig().getClassificationName();
 910                 Language language = Language.DEFAULT();
 911                 classification = Classification.NewInstance(classificationName, sec, language);
 912                 state.setClassification(classification);
 913                 classification.setUuid(state.getConfig().getClassificationUuid());
 914                 classification.getRootNode().setUuid(ROOT_UUID);
 915                 getClassificationService().save(classification);
 916             }
 917             rootNode = classification.getRootNode();
 918             state.setRootNode(rootNode);
 919         }
 920         return rootNode;
 921     }
 922
 923     private Collection getCollection(String collectionCode, String instituteStr, String subCollectionStr){
 924
 925         Collection superCollection = null;
 926         if(subCollectionStr != null){
 927             superCollection = getCollection(collectionCode, instituteStr, null);
 928             collectionCode = subCollectionStr;
 929             instituteStr = null;
 930         }
 931
 932         final String key = collectionCode + "-#i:" + StringUtils.defaultString(instituteStr);
 933
 934         Collection collection = collectionMap.get(key);
 935
 936         if(collection == null) {
 937             collection = Collection.NewInstance();
 938             collection.setCode(collectionCode);
 939             if(instituteStr != null){
 940                 collection.setInstitute(Institution.NewNamedInstance(instituteStr));
 941             }
 942             if(superCollection != null){
 943                 collection.setSuperCollection(superCollection);
 944             }
 945             collectionMap.put(key, collection);
 946             getCollectionService().save(collection);
 947         }
 948
 949         return collection;
 950     }
 951
 952
 953     /**
 954      * @param record
 955      * @param originalKey
 956      * @param doUnescapeHtmlEntities
 957      * @return
 958      */
 959     private String getValue(HashMap<String, String> record, String originalKey, boolean doUnescapeHtmlEntities) {
 960         String value = record.get(originalKey);
 961
 962         value = fixCharacters(value);
 963
 964         if (! StringUtils.isBlank(value)) {
 965                 if (logger.isDebugEnabled()) {
 966                     logger.debug(originalKey + ": " + value);
 967                 }
 968                 value = CdmUtils.removeDuplicateWhitespace(value.trim()).toString();
 969             if(doUnescapeHtmlEntities){
 970                 value = StringEscapeUtils.unescapeHtml(value);
 971             }
 972                 return value.trim();
 973         }else{
 974                 return null;
 975         }
 976     }
 977
 978     /**
 979      * Fixes broken characters.
 980      * For details see
 981      * http://dev.e-taxonomy.eu/redmine/issues/6035
 982      *
 983      * @param value
 984      * @return
 985      */
 986     private String fixCharacters(String value) {
 987
 988         value = StringUtils.replace(value, "s$K", "š");
 989         value = StringUtils.replace(value, "n$K", "ň");
 990         value = StringUtils.replace(value, "e$K", "ě");
 991         value = StringUtils.replace(value, "r$K", "ř");
 992         value = StringUtils.replace(value, "c$K", "č");
 993         value = StringUtils.replace(value, "z$K", "ž");
 994         value = StringUtils.replace(value, "S>U$K", "Š");
 995         value = StringUtils.replace(value, "C>U$K", "Č");
 996         value = StringUtils.replace(value, "R>U$K", "Ř");
 997         value = StringUtils.replace(value, "Z>U$K", "Ž");
 998         value = StringUtils.replace(value, "g$K", "ǧ");
 999         value = StringUtils.replace(value, "s$A", "ś");
1000         value = StringUtils.replace(value, "n$A", "ń");
1001         value = StringUtils.replace(value, "c$A", "ć");
1002         value = StringUtils.replace(value, "e$E", "ę");
1003         value = StringUtils.replace(value, "o$H", "õ");
1004         value = StringUtils.replace(value, "s$C", "ş");
1005         value = StringUtils.replace(value, "t$C", "ț");
1006         value = StringUtils.replace(value, "S>U$C", "Ş");
1007         value = StringUtils.replace(value, "a$O", "å");
1008         value = StringUtils.replace(value, "A>U$O", "Å");
1009         value = StringUtils.replace(value, "u$O", "ů");
1010         value = StringUtils.replace(value, "g$B", "ğ");
1011         value = StringUtils.replace(value, "g$B", "ĕ");
1012         value = StringUtils.replace(value, "a$B", "ă");
1013         value = StringUtils.replace(value, "l$/", "ł");
1014         value = StringUtils.replace(value, ">i", "ı");
1015         value = StringUtils.replace(value, "i$U", "ï");
1016         // Special-cases
1017         value = StringUtils.replace(value, "&yacute", "ý");
1018         value = StringUtils.replace(value, ">L", "Ł"); // corrected rule
1019         value = StringUtils.replace(value, "E>U$D", "З");
1020         value = StringUtils.replace(value, "S>U$E", "Ş");
1021         value = StringUtils.replace(value, "s$E", "ş");
1022
1023         value = StringUtils.replace(value, "c$k", "č");
1024         value = StringUtils.replace(value, " U$K", " Š");
1025
1026         value = StringUtils.replace(value, "B.O>U>!", "Ø");
1027         value = StringUtils.replace(value, "S$K", "Ŝ");
1028         value = StringUtils.replace(value, "§B>i", "ğ");
1029
1030
1031         return value;
1032     }
1033
1034
1035     /**
1036          *  Stores taxa records in DB
1037          */
1038         @Override
1039     protected void firstPass(SimpleExcelTaxonImportState<CONFIG> state) {
1040
1041         String lineNumber = "L#" + state.getCurrentLine() + ": ";
1042         logger.setLevel(Level.DEBUG);
1043         HashMap<String, String> record = state.getOriginalRecord();
1044         logger.debug(lineNumber + record.toString());
1045
1046         Set<String> keys = record.keySet();
1047         for (String key: keys) {
1048             if (! expectedKeys.contains(key)){
1049                 logger.warn(lineNumber + "Unexpected Key: " + key);
1050             }
1051         }
1052
1053         String reg_id = record.get(REGISTRATIONNO_PK);
1054
1055         //higherTaxon
1056         String higherTaxaString = record.get(HIGHERTAXON);
1057         boolean isFossil = false;
1058         if(higherTaxaString.startsWith("FOSSIL ")){
1059             higherTaxaString = higherTaxaString.replace("FOSSIL ", "");
1060             isFossil = true;
1061         }
1062         TaxonNode higherTaxon = getHigherTaxon(higherTaxaString, (IAPTImportState)state);
1063
1064        //Taxon
1065         Taxon taxon = makeTaxon(record, state, higherTaxon, isFossil);
1066         if (taxon == null){
1067             logger.warn(lineNumber + "taxon could not be created and is null");
1068             return;
1069         }
1070         ((IAPTImportState)state).setCurrentTaxon(taxon);
1071
1072
1073         logger.info("#of imported Genera: " + ((IAPTImportState) state).getGenusTaxonMap().size());
1074                 return;
1075     }
1076
1077     private TaxonNode getHigherTaxon(String higherTaxaString, IAPTImportState state) {
1078         String[] higherTaxaNames = higherTaxaString.toLowerCase().replaceAll("[\\[\\]]", "").split(":");
1079         TaxonNode higherTaxonNode = null;
1080
1081         ITaxonTreeNode rootNode = getClassificationRootNode(state);
1082         for (String htn :  higherTaxaNames) {
1083             htn = StringUtils.capitalize(htn.trim());
1084             Taxon higherTaxon = state.getHigherTaxon(htn);
1085             if (higherTaxon != null){
1086                 higherTaxonNode = higherTaxon.getTaxonNodes().iterator().next();
1087             }else{
1088                 BotanicalName name = makeHigherTaxonName(state, htn);
1089                 Reference sec = state.getSecReference();
1090                 higherTaxon = Taxon.NewInstance(name, sec);
1091                 getTaxonService().save(higherTaxon);
1092                 higherTaxonNode = rootNode.addChildTaxon(higherTaxon, sec, null);
1093                 state.putHigherTaxon(htn, higherTaxon);
1094                 getClassificationService().saveTreeNode(higherTaxonNode);
1095             }
1096             rootNode = higherTaxonNode;
1097         }
1098         return higherTaxonNode;
1099     }
1100
1101     private BotanicalName makeHigherTaxonName(IAPTImportState state, String name) {
1102
1103         Rank rank = guessRank(name);
1104
1105         BotanicalName taxonName = BotanicalName.NewInstance(rank);
1106         taxonName.addSource(makeOriginalSource(state));
1107         taxonName.setGenusOrUninomial(StringUtils.capitalize(name));
1108         return taxonName;
1109     }
1110
1111     private Rank guessRank(String name) {
1112
1113         // normalize
1114         name = name.replaceAll("\\(.*\\)", "").trim();
1115
1116         if(name.matches("^Plantae$|^Fungi$")){
1117            return Rank.KINGDOM();
1118         } else if(name.matches("^Incertae sedis$|^No group assigned$")){
1119            return rankFamilyIncertisSedis();
1120         } else if(name.matches(".*phyta$|.*mycota$")){
1121            return Rank.PHYLUM();
1122         } else if(name.matches(".*phytina$|.*mycotina$")){
1123            return Rank.SUBPHYLUM();
1124         } else if(name.matches("Gymnospermae$|.*ones$")){ // Monocotyledones, Dicotyledones
1125             return rankUnrankedSupraGeneric();
1126         } else if(name.matches(".*opsida$|.*phyceae$|.*mycetes$|.*ones$|^Musci$|^Hepaticae$")){
1127            return Rank.CLASS();
1128         } else if(name.matches(".*idae$|.*phycidae$|.*mycetidae$")){
1129            return Rank.SUBCLASS();
1130         } else if(name.matches(".*ales$")){
1131            return Rank.ORDER();
1132         } else if(name.matches(".*ineae$")){
1133            return Rank.SUBORDER();
1134         } else if(name.matches(".*aceae$")){
1135             return Rank.FAMILY();
1136         } else if(name.matches(".*oideae$")){
1137            return Rank.SUBFAMILY();
1138         } else
1139         //    if(name.matches(".*eae$")){
1140         //    return Rank.TRIBE();
1141         // } else
1142             if(name.matches(".*inae$")){
1143            return Rank.SUBTRIBE();
1144         } else if(name.matches(".*ae$")){
1145            return Rank.FAMILY();
1146         }
1147         return Rank.UNKNOWN_RANK();
1148     }
1149
1150     private Rank rankUnrankedSupraGeneric() {
1151
1152         if(rankUnrankedSupraGeneric == null){
1153             rankUnrankedSupraGeneric = Rank.NewInstance(RankClass.Suprageneric, "Unranked supra generic", " ", " ");
1154             getTermService().save(rankUnrankedSupraGeneric);
1155         }
1156         return rankUnrankedSupraGeneric;
1157     }
1158
1159     private Rank rankFamilyIncertisSedis() {
1160
1161         if(familyIncertisSedis == null){
1162             familyIncertisSedis = Rank.NewInstance(RankClass.Suprageneric, "Family incertis sedis", " ", " ");
1163             getTermService().save(familyIncertisSedis);
1164         }
1165         return familyIncertisSedis;
1166     }
1167
1168     private AnnotationType annotationTypeCaveats(){
1169         if(annotationTypeCaveats == null){
1170             annotationTypeCaveats = AnnotationType.NewInstance("Caveats", "Caveats", "");
1171             getTermService().save(annotationTypeCaveats);
1172         }
1173         return annotationTypeCaveats;
1174     }
1175
1176
1177     /**
1178      * @param state
1179      * @return
1180      */
1181     private IdentifiableSource makeOriginalSource(IAPTImportState state) {
1182         return IdentifiableSource.NewDataImportInstance("line: " + state.getCurrentLine(), null, state.getConfig().getSourceReference());
1183     }
1184
1185
1186     private Reference makeReference(IAPTImportState state, UUID uuidRef) {
1187         Reference ref = state.getReference(uuidRef);
1188         if (ref == null){
1189             ref = getReferenceService().find(uuidRef);
1190             state.putReference(uuidRef, ref);
1191         }
1192         return ref;
1193     }
1194
1195     private MarkerType markerTypeFossil(){
1196         if(this.markerTypeFossil == null){
1197             markerTypeFossil = MarkerType.NewInstance("isFossilTaxon", "isFossil", null);
1198             getTermService().save(this.markerTypeFossil);
1199         }
1200         return markerTypeFossil;
1201     }
1202
1203     private String csvReportLine(String regId, String message, String ... fields){
1204         StringBuilder out = new StringBuilder("regID#");
1205         out.append(regId).append(",\"").append(message).append('"');
1206
1207         for(String f : fields){
1208             out.append(",\"").append(f).append('"');
1209         }
1210         return out.toString();
1211     }
1212
1213
1214 }