app-import/src/main/java/eu/etaxonomy/cdm/io/phycobank/IAPTExcelImport.java

   1 /**
   2  * Copyright (C) 2007 EDIT
   3  * European Distributed Institute of Taxonomy
   4  * http://www.e-taxonomy.eu
   5  *
   6  * The contents of this file are subject to the Mozilla Public License Version 1.1
   7  * See LICENSE.TXT at the top of this package for the full license terms.
   8  */
   9
  10 package eu.etaxonomy.cdm.io.phycobank;
  11
  12 import java.util.ArrayList;
  13 import java.util.Arrays;
  14 import java.util.HashMap;
  15 import java.util.HashSet;
  16 import java.util.List;
  17 import java.util.Map;
  18 import java.util.Set;
  19 import java.util.UUID;
  20 import java.util.regex.Matcher;
  21 import java.util.regex.Pattern;
  22
  23 import org.apache.commons.lang.ArrayUtils;
  24 import org.apache.commons.lang.StringEscapeUtils;
  25 import org.apache.commons.lang.StringUtils;
  26 import org.apache.logging.log4j.Level;
  27 import org.apache.logging.log4j.LogManager;
  28 import org.apache.logging.log4j.Logger;
  29 import org.joda.time.DateTimeFieldType;
  30 import org.joda.time.Partial;
  31 import org.joda.time.format.DateTimeFormat;
  32 import org.joda.time.format.DateTimeFormatter;
  33 import org.springframework.stereotype.Component;
  34
  35 import com.fasterxml.jackson.core.JsonProcessingException;
  36 import com.fasterxml.jackson.databind.ObjectMapper;
  37
  38 import eu.etaxonomy.cdm.api.facade.DerivedUnitFacade;
  39 import eu.etaxonomy.cdm.common.CdmUtils;
  40 import eu.etaxonomy.cdm.common.LogUtils;
  41 import eu.etaxonomy.cdm.io.mexico.SimpleExcelTaxonImport;
  42 import eu.etaxonomy.cdm.io.mexico.SimpleExcelTaxonImportState;
  43 import eu.etaxonomy.cdm.model.agent.Institution;
  44 import eu.etaxonomy.cdm.model.agent.Person;
  45 import eu.etaxonomy.cdm.model.agent.TeamOrPersonBase;
  46 import eu.etaxonomy.cdm.model.common.Annotation;
  47 import eu.etaxonomy.cdm.model.common.AnnotationType;
  48 import eu.etaxonomy.cdm.model.common.Extension;
  49 import eu.etaxonomy.cdm.model.common.ExtensionType;
  50 import eu.etaxonomy.cdm.model.common.IdentifiableSource;
  51 import eu.etaxonomy.cdm.model.common.Language;
  52 import eu.etaxonomy.cdm.model.common.LanguageString;
  53 import eu.etaxonomy.cdm.model.common.Marker;
  54 import eu.etaxonomy.cdm.model.common.MarkerType;
  55 import eu.etaxonomy.cdm.model.common.VerbatimTimePeriod;
  56 import eu.etaxonomy.cdm.model.name.IBotanicalName;
  57 import eu.etaxonomy.cdm.model.name.NameRelationshipType;
  58 import eu.etaxonomy.cdm.model.name.NameTypeDesignation;
  59 import eu.etaxonomy.cdm.model.name.NomenclaturalCode;
  60 import eu.etaxonomy.cdm.model.name.NomenclaturalStatus;
  61 import eu.etaxonomy.cdm.model.name.NomenclaturalStatusType;
  62 import eu.etaxonomy.cdm.model.name.Rank;
  63 import eu.etaxonomy.cdm.model.name.RankClass;
  64 import eu.etaxonomy.cdm.model.name.SpecimenTypeDesignationStatus;
  65 import eu.etaxonomy.cdm.model.name.TaxonName;
  66 import eu.etaxonomy.cdm.model.name.TaxonNameFactory;
  67 import eu.etaxonomy.cdm.model.occurrence.Collection;
  68 import eu.etaxonomy.cdm.model.occurrence.DerivedUnit;
  69 import eu.etaxonomy.cdm.model.occurrence.FieldUnit;
  70 import eu.etaxonomy.cdm.model.occurrence.GatheringEvent;
  71 import eu.etaxonomy.cdm.model.occurrence.SpecimenOrObservationType;
  72 import eu.etaxonomy.cdm.model.reference.OriginalSourceType;
  73 import eu.etaxonomy.cdm.model.reference.Reference;
  74 import eu.etaxonomy.cdm.model.reference.ReferenceFactory;
  75 import eu.etaxonomy.cdm.model.taxon.Classification;
  76 import eu.etaxonomy.cdm.model.taxon.ITaxonTreeNode;
  77 import eu.etaxonomy.cdm.model.taxon.Synonym;
  78 import eu.etaxonomy.cdm.model.taxon.SynonymType;
  79 import eu.etaxonomy.cdm.model.taxon.Taxon;
  80 import eu.etaxonomy.cdm.model.taxon.TaxonNode;
  81 import eu.etaxonomy.cdm.model.term.DefinedTermBase;
  82 import eu.etaxonomy.cdm.strategy.parser.NonViralNameParserImpl;
  83
  84 /**
  85  * @author a.mueller
  86  * @since 05.01.2016
  87  */
  88
  89 @Component("iAPTExcelImport")
  90 public class IAPTExcelImport<CONFIG extends IAPTImportConfigurator> extends SimpleExcelTaxonImport<CONFIG> {
  91     private static final long serialVersionUID = -747486709409732371L;
  92     private static final Logger logger = LogManager.getLogger();
  93
  94     public static final String ANNOTATION_MARKER_STRING = "[*]";
  95
  96     private static UUID ROOT_UUID = UUID.fromString("4137fd2a-20f6-4e70-80b9-f296daf51d82");
  97
  98     private static NonViralNameParserImpl nameParser = NonViralNameParserImpl.NewInstance();
  99
 100     private final static String REGISTRATIONNO_PK= "RegistrationNo_Pk";
 101     private final static String HIGHERTAXON= "HigherTaxon";
 102     private final static String FULLNAME= "FullName";
 103     private final static String AUTHORSSPELLING= "AuthorsSpelling";
 104     private final static String LITSTRING= "LitString";
 105     private final static String REGISTRATION= "Registration";
 106     private final static String TYPE= "Type";
 107     private final static String CAVEATS= "Caveats";
 108     private final static String FULLBASIONYM= "FullBasionym";
 109     private final static String FULLSYNSUBST= "FullSynSubst";
 110     private final static String NOTESTXT= "NotesTxt";
 111     private final static String REGDATE= "RegDate";
 112     private final static String NAMESTRING= "NameString";
 113     private final static String BASIONYMSTRING= "BasionymString";
 114     private final static String SYNSUBSTSTR= "SynSubstStr";
 115     private final static String AUTHORSTRING= "AuthorString";
 116
 117     private  static List<String> expectedKeys= Arrays.asList(new String[]{
 118             REGISTRATIONNO_PK, HIGHERTAXON, FULLNAME, AUTHORSSPELLING, LITSTRING, REGISTRATION, TYPE, CAVEATS, FULLBASIONYM, FULLSYNSUBST, NOTESTXT, REGDATE, NAMESTRING, BASIONYMSTRING, SYNSUBSTSTR, AUTHORSTRING});
 119
 120     private static final Pattern nomRefTokenizeP = Pattern.compile("^(?<title>.*):\\s(?<detail>[^\\.:]+)\\.(?<date>.*?)(?:\\s\\((?<issue>[^\\)]*)\\)\\s*)?\\.?$");
 121     private static final Pattern[] datePatterns = new Pattern[]{
 122             // NOTE:
 123             // The order of the patterns is extremely important!!!
 124             //
 125             // all patterns cover the years 1700 - 1999
 126             Pattern.compile("^(?<year>1[7,8,9][0-9]{2})$"), // only year, like '1969'
 127             Pattern.compile("^(?<monthName>\\p{L}+\\.?)\\s(?<day>[0-9]{1,2})(?:st|rd|th)?\\.?,?\\s(?<year>(?:1[7,8,9])?[0-9]{2})$"), // full date like April 12, 1969 or april 12th 1999
 128             Pattern.compile("^(?<monthName>\\p{L}+\\.?),?\\s?(?<year>(?:1[7,8,9])?[0-9]{2})$"), // April 99 or April, 1999 or Apr. 12
 129             Pattern.compile("^(?<day>[0-9]{1,2})([\\.\\-/])(\\s?)(?<month>[0-1]?[0-9])\\2\\3(?<year>(?:1[7,8,9])?[0-9]{2})$"), // full date like 12.04.1969 or 12. 04. 1969 or 12/04/1969 or 12-04-1969
 130             Pattern.compile("^(?<day>[0-9]{1,2})([\\.\\-/])(?<monthName>[IVX]{1,2})\\2(?<year>(?:1[7,8,9])?[0-9]{2})$"), // full date like 12-VI-1969
 131             Pattern.compile("^(?:(?<day>[0-9]{1,2})(?:\\sde)?\\s)?(?<monthName>\\p{L}+)(?:\\sde)?\\s(?<year>(?:1[7,8,9])?[0-9]{2})$"), // full and partial date like 12 de Enero de 1999 or Enero de 1999
 132             Pattern.compile("^(?<month>[0-1]?[0-9])([\\.\\-/])(?<year>(?:1[7,8,9])?[0-9]{2})$"), // partial date like 04.1969 or 04/1969 or 04-1969
 133             Pattern.compile("^(?<year>(?:1[7,8,9])?[0-9]{2})([\\.\\-/])(?<month>[0-1]?[0-9])$"),//  partial date like 1999-04
 134             Pattern.compile("^(?<monthName>[IVX]{1,2})([\\.\\-/])(?<year>(?:1[7,8,9])?[0-9]{2})$"), // partial date like VI-1969
 135             Pattern.compile("^(?<day>[0-9]{1,2})(?:[\\./]|th|rd|st)?\\s(?<monthName>\\p{L}+\\.?),?\\s?(?<year>(?:1[7,8,9])?[0-9]{2})$"), // full date like 12. April 1969 or april 1999 or 22 Dec.1999
 136         };
 137     protected static final Pattern typeSpecimenSplitPattern =  Pattern.compile("^(?:\"*[Tt]ype: (?<fieldUnit>.*?))(?:[Hh]olotype:(?<holotype>.*?)\\.?)?(?:[Ii]sotype[^:]*:(?<isotype>.*)\\.?)?\\.?$");
 138
 139     private static final Pattern typeNameBasionymPattern =  Pattern.compile("\\([Bb]asionym\\s?\\:\\s?(?<basionymName>[^\\)]*).*$");
 140     private static final Pattern typeNameNotePattern =  Pattern.compile("\\[([^\\[]*)"); // matches the inner of '[...]'
 141     private static final Pattern typeNameSpecialSplitPattern =  Pattern.compile("(?<note>.*\\;.*?)\\:(?<agent>)\\;(<name>.*)");
 142
 143     protected static final Pattern collectorPattern =  Pattern.compile(".*?(?<fullStr1>\\([Ll]eg\\.\\s+(?<data1>[^\\)]*)\\)).*$|.*?(?<fullStr2>\\s[Ll]eg\\.\\:?\\s+(?<data2>.*?)\\.?)$|^(?<fullStr3>[Ll]eg\\.\\:?\\s+(?<data3>.*?)\\.?)");
 144     private static final Pattern collectionDataPattern =  Pattern.compile("^(?<collector>[^,]*),\\s?(?<detail>.*?)\\.?$");
 145     private static final Pattern collectorsNumber =  Pattern.compile("^([nN]o\\.\\s.*)$");
 146
 147     // AccessionNumbers: , #.*, n°:?, 96/3293, No..*, -?\w{1,3}-[0-9\-/]*
 148     private static final Pattern accessionNumberOnlyPattern = Pattern.compile("^(?<accNumber>(?:n°\\:?\\s?|#|No\\.?\\s?)?[\\d\\w\\-/]*)$");
 149
 150     private static final Pattern[] specimenTypePatterns = new Pattern[]{
 151             Pattern.compile("^(?<colCode>[A-Z]+|CPC Micropaleontology Lab\\.?)\\s+(?:\\((?<institute>.*[^\\)])\\))(?<accNumber>.*)?$"), // like: GAUF (Gansu Agricultural University) No. 1207-1222
 152             Pattern.compile("^(?<colCode>[A-Z]+|CPC Micropaleontology Lab\\.?)\\s+(?:Coll\\.\\s(?<subCollection>[^\\.,;]*)(.))(?<accNumber>.*)?$"), // like KASSEL Coll. Krasske, Praep. DII 78
 153             Pattern.compile("^(?:in\\s)?(?<institute>[Cc]oll\\.\\s.*?)(?:\\s+(?<accNumber>(Praep\\.|slide|No\\.|Inv\\. Nr\\.|Nr\\.).*))?$"), // like Coll. Lange-Bertalot, Bot. Inst., Univ. Frankfurt/Main, Germany Praep. Neukaledonien OTL 62
 154             Pattern.compile("^(?<institute>Inst\\.\\s.*?)\\s+(?<accNumber>N\\s.*)?$"), // like Inst. Geological Sciences, Acad. Sci. Belarus, Minsk N 212 A
 155             Pattern.compile("^(?<colCode>[A-Z]+)(?:\\s+(?<accNumber>.*))?$"), // identifies the Collection code and takes the rest as accessionNumber if any
 156     };
 157
 158
 159     private static final Pattern registrationPattern = Pattern.compile("^Registration date\\:\\s(?<regdate>\\d\\d\\.\\d\\d\\.\\d\\d); no\\.\\:\\s(?<regid>\\d+);\\soffice\\:\\s(?<office>.*?)\\.(?:\\s\\[Form no\\.\\:\\s(?<formNo>d+)\\])?$"); // Registration date: 29.06.98; no.: 2922; office: Berlin.
 160
 161     private static Map<String, Integer> monthFromNameMap = new HashMap<>();
 162
 163     static {
 164         String[] ck = new String[]{"leden", "únor", "březen", "duben", "květen", "červen", "červenec ", "srpen", "září", "říjen", "listopad", "prosinec"};
 165         String[] fr = new String[]{"janvier", "février", "mars", "avril", "mai", "juin", "juillet", "août", "septembre", "octobre", "novembre", "décembre"};
 166         String[] de = new String[]{"januar", "februar", "märz", "april", "mai", "juni", "juli", "august", "september", "oktober", "november", "dezember"};
 167         String[] en = new String[]{"january", "february", "march", "april", "may", "june", "july", "august", "september", "october", "november", "december"};
 168         String[] it = new String[]{"gennaio", "febbraio", "marzo", "aprile", "maggio", "giugno", "luglio", "agosto", "settembre", "ottobre", "novembre", "dicembre"};
 169         String[] sp = new String[]{"enero", "febrero", "marzo", "abril", "mayo", "junio", "julio", "agosto", "septiembre", "octubre", "noviembre", "diciembre"};
 170         String[] de_abbrev = new String[]{"jan.", "feb.", "märz", "apr.", "mai", "jun.", "jul.", "aug.", "sept.", "okt.", "nov.", "dez."};
 171         String[] en_abbrev = new String[]{"jan.", "feb.", "mar.", "apr.", "may", "jun.", "jul.", "aug.", "sep.", "oct.", "nov.", "dec."};
 172         String[] port = new String[]{"Janeiro", "Fevereiro", "Março", "Abril", "Maio", "Junho", "Julho", "Agosto", "Setembro", "Outubro", "Novembro", "Dezembro"};
 173         String[] rom_num = new String[]{"i", "ii", "iii", "iv", "v", "vi", "vii", "viii", "ix", "x", "xi", "xii"};
 174
 175         String[][] perLang =  new String[][]{ck, de, fr, en, it, sp, port, de_abbrev, en_abbrev, rom_num};
 176
 177         for (String[] months: perLang) {
 178             for(int m = 1; m < 13; m++){
 179                 monthFromNameMap.put(months[m - 1].toLowerCase(), m);
 180             }
 181         }
 182
 183         // special cases
 184         monthFromNameMap.put("mar", 3);
 185         monthFromNameMap.put("dec", 12);
 186         monthFromNameMap.put("februari", 2);
 187         monthFromNameMap.put("març", 3);
 188     }
 189
 190
 191     DateTimeFormatter formatterYear = DateTimeFormat.forPattern("yyyy");
 192
 193     private Map<String, Collection> collectionMap = new HashMap<>();
 194
 195     private ExtensionType extensionTypeIAPTRegData = null;
 196
 197     private Set<String> nameSet = new HashSet<>();
 198     private DefinedTermBase duplicateRegistration = null;
 199
 200     enum TypesName {
 201         fieldUnit, holotype, isotype;
 202
 203         public SpecimenTypeDesignationStatus status(){
 204             switch (this) {
 205                 case holotype:
 206                     return SpecimenTypeDesignationStatus.HOLOTYPE();
 207                 case isotype:
 208                     return SpecimenTypeDesignationStatus.ISOTYPE();
 209                 default:
 210                     return null;
 211             }
 212         }
 213     }
 214
 215     private MarkerType markerTypeFossil = null;
 216     private Rank rankUnrankedSupraGeneric = null;
 217     private Rank familyIncertisSedis = null;
 218     private AnnotationType annotationTypeCaveats = null;
 219
 220     private Reference bookVariedadesTradicionales = null;
 221
 222     /**
 223      * HACK for unit simple testing
 224      */
 225     boolean _testMode = System.getProperty("TEST_MODE") != null;
 226
 227     private Taxon makeTaxon(Map<String, String> record, SimpleExcelTaxonImportState<CONFIG> state,
 228                             TaxonNode higherTaxonNode, boolean isFossil) {
 229
 230         String regNumber = getValue(record, REGISTRATIONNO_PK, false);
 231         String regStr = getValue(record, REGISTRATION, true);
 232         String titleCacheStr = getValue(record, FULLNAME, true);
 233         String nameStr = getValue(record, NAMESTRING, true);
 234         String authorStr = getValue(record, AUTHORSTRING, true);
 235         String nomRefStr = getValue(record, LITSTRING, true);
 236         String authorsSpelling = getValue(record, AUTHORSSPELLING, true);
 237         String notesTxt = getValue(record, NOTESTXT, true);
 238         String caveats = getValue(record, CAVEATS, true);
 239         String fullSynSubstStr = getValue(record, FULLSYNSUBST, true);
 240         String fullBasionymStr = getValue(record, FULLBASIONYM, true);
 241         String basionymNameStr = getValue(record, FULLBASIONYM, true);
 242         String synSubstStr = getValue(record, SYNSUBSTSTR, true);
 243         String typeStr = getValue(record, TYPE, true);
 244
 245
 246         String nomRefTitle = null;
 247         String nomRefDetail;
 248         String nomRefPupDate = null;
 249         String nomRefIssue = null;
 250         Partial pupDate = null;
 251
 252         boolean restoreOriginalReference = false;
 253         boolean nameIsValid = true;
 254
 255         // preprocess nomRef: separate citation, reference detail, publishing date
 256         if(!StringUtils.isEmpty(nomRefStr)){
 257             nomRefStr = nomRefStr.trim();
 258
 259             // handle the special case which is hard to parse:
 260             //
 261             // Las variedades tradicionales de frutales de la Cuenca del Río Segura. Catálogo Etnobotánico (1): Frutos secos, oleaginosos, frutales de hueso, almendros y frutales de pepita: 154. 1997.
 262             if(nomRefStr.startsWith("Las variedades tradicionales de frutales ")){
 263
 264                 if(bookVariedadesTradicionales == null){
 265                     bookVariedadesTradicionales = ReferenceFactory.newBook();
 266                     bookVariedadesTradicionales.setTitle("Las variedades tradicionales de frutales de la Cuenca del Río Segura. Catálogo Etnobotánico (1): Frutos secos, oleaginosos, frutales de hueso, almendros y frutales de pepita");
 267                     bookVariedadesTradicionales.setDatePublished(VerbatimTimePeriod.NewVerbatimInstance(1997));
 268                     getReferenceService().save(bookVariedadesTradicionales);
 269                 }
 270                 nomRefStr = nomRefStr.replaceAll("^.*?\\:.*?\\:", "Las variedades tradicionales:");
 271                 restoreOriginalReference = true;
 272             }
 273
 274             Matcher m = nomRefTokenizeP.matcher(nomRefStr);
 275             if(m.matches()){
 276                 nomRefTitle = m.group("title");
 277                 nomRefDetail = m.group("detail");
 278                 nomRefPupDate = m.group("date").trim();
 279                 nomRefIssue = m.group("issue");
 280
 281                 pupDate = parseDate(regNumber, nomRefPupDate);
 282                 if (pupDate != null) {
 283                     nomRefTitle = nomRefTitle + ": " + nomRefDetail + ". " + pupDate.toString(formatterYear) + ".";
 284                 } else {
 285                     logger.warn(csvReportLine(regNumber, "Pub date", nomRefPupDate, "in", nomRefStr, "not parsable"));
 286                 }
 287             } else {
 288                 nomRefTitle = nomRefStr;
 289             }
 290         }
 291
 292         TaxonName taxonName = makeBotanicalName(state, regNumber, titleCacheStr, nameStr, authorStr, nomRefTitle);
 293
 294         // always add the original strings of parsed data as annotation
 295         taxonName.addAnnotation(Annotation.NewInstance("imported and parsed data strings:" +
 296                         "\n -  '" + LITSTRING + "': "+ nomRefStr +
 297                         "\n -  '" + TYPE + "': " + typeStr +
 298                         "\n -  '" + REGISTRATION  + "': " + regStr
 299                 , AnnotationType.TECHNICAL(), Language.DEFAULT()));
 300
 301         if(restoreOriginalReference){
 302             taxonName.setNomenclaturalReference(bookVariedadesTradicionales);
 303         }
 304
 305         if(taxonName.getNomenclaturalReference() != null){
 306             if(pupDate != null) {
 307                 taxonName.getNomenclaturalReference().setDatePublished(VerbatimTimePeriod.NewVerbatimInstance(pupDate));
 308             }
 309             if(nomRefIssue != null) {
 310                 taxonName.getNomenclaturalReference().setVolume(nomRefIssue);
 311             }
 312         }
 313
 314
 315         if(!StringUtils.isEmpty(notesTxt)){
 316             notesTxt = notesTxt.replace("Notes: ", "").trim();
 317             taxonName.addAnnotation(Annotation.NewInstance(notesTxt, AnnotationType.EDITORIAL(), Language.DEFAULT()));
 318             nameIsValid = false;
 319
 320         }
 321         if(!StringUtils.isEmpty(caveats)){
 322             caveats = caveats.replace("Caveats: ", "").trim();
 323             taxonName.addAnnotation(Annotation.NewInstance(caveats, annotationTypeCaveats(), Language.DEFAULT()));
 324             nameIsValid = false;
 325         }
 326
 327         if(nameIsValid){
 328             // Status is always considered valid if no notes and cavets are set
 329             taxonName.addStatus(NomenclaturalStatus.NewInstance(NomenclaturalStatusType.VALID()));
 330         }
 331
 332         getNameService().save(taxonName);
 333
 334         // Namerelations
 335         if(!StringUtils.isEmpty(authorsSpelling)){
 336             authorsSpelling = authorsSpelling.replaceFirst("Author's spelling:", "").replaceAll("\"", "").trim();
 337
 338             String[] authorSpellingTokens = StringUtils.split(authorsSpelling, " ");
 339             String[] nameStrTokens = StringUtils.split(nameStr, " ");
 340
 341             ArrayUtils.reverse(authorSpellingTokens);
 342             ArrayUtils.reverse(nameStrTokens);
 343
 344             for (int i = 0; i < nameStrTokens.length; i++){
 345                 if(i < authorSpellingTokens.length){
 346                     nameStrTokens[i] = authorSpellingTokens[i];
 347                 }
 348             }
 349             ArrayUtils.reverse(nameStrTokens);
 350
 351             String misspelledNameStr = StringUtils.join (nameStrTokens, ' ');
 352             // build the fullnameString of the misspelled name
 353             misspelledNameStr = taxonName.getTitleCache().replace(nameStr, misspelledNameStr);
 354
 355             TaxonName misspelledName = nameParser.parseReferencedName(misspelledNameStr, NomenclaturalCode.ICNAFP, null);
 356             misspelledName.addRelationshipToName(taxonName, NameRelationshipType.MISSPELLING(), null, null);
 357             getNameService().save(misspelledName);
 358         }
 359
 360         // Replaced Synonyms
 361         if(!StringUtils.isEmpty(fullSynSubstStr)){
 362             fullSynSubstStr = fullSynSubstStr.replace("Syn. subst.: ", "");
 363             TaxonName replacedSynonymName = makeBotanicalName(state, regNumber, fullSynSubstStr, synSubstStr, null, null);
 364             replacedSynonymName.addReplacedSynonym(taxonName, null, null, null, null);
 365             getNameService().save(replacedSynonymName);
 366         }
 367
 368         Reference sec = state.getConfig().getSecReference();
 369         Taxon taxon = Taxon.NewInstance(taxonName, sec);
 370
 371         // Basionym
 372         if(fullBasionymStr != null){
 373             fullBasionymStr = fullBasionymStr.replaceAll("^\\w*:\\s", ""); // Strip off the leading 'Basionym: "
 374             basionymNameStr = basionymNameStr.replaceAll("^\\w*:\\s", ""); // Strip off the leading 'Basionym: "
 375             TaxonName basionym = makeBotanicalName(state, regNumber, fullBasionymStr, basionymNameStr, null, null);
 376             getNameService().save(basionym);
 377             taxonName.addBasionym(basionym);
 378
 379             Synonym syn = Synonym.NewInstance(basionym, sec);
 380             taxon.addSynonym(syn, SynonymType.HOMOTYPIC_SYNONYM_OF());
 381             getTaxonService().save(syn);
 382         }
 383
 384         // Markers
 385         if(isFossil){
 386             taxon.addMarker(Marker.NewInstance(markerTypeFossil(), true));
 387         }
 388         if(!nameSet.add(titleCacheStr)){
 389             taxonName.addMarker(Marker.NewInstance(markerDuplicateRegistration(), true));
 390             logger.warn(csvReportLine(regNumber, "Duplicate registration of", titleCacheStr));
 391         }
 392
 393
 394         // Types
 395         if(!StringUtils.isEmpty(typeStr)){
 396
 397             if(taxonName.getRank().isSpecies() || taxonName.getRank().isLower(Rank.SPECIES())) {
 398                 makeSpecimenTypeData(typeStr, taxonName, regNumber, state, false);
 399             } else {
 400                 makeNameTypeData(typeStr, taxonName, regNumber, state);
 401             }
 402         }
 403
 404         getTaxonService().save(taxon);
 405
 406         if(taxonName.getRank().equals(Rank.SPECIES()) || taxonName.getRank().isLower(Rank.SPECIES())){
 407             // try to find the genus, it should have been imported already, Genera are coming first in the import file
 408             Taxon genus = ((IAPTImportState)state).getGenusTaxonMap().get(taxonName.getGenusOrUninomial());
 409             if(genus != null){
 410                 higherTaxonNode = genus.getTaxonNodes().iterator().next();
 411             } else {
 412                 logger.info(csvReportLine(regNumber, "Parent genus not found for", nameStr));
 413             }
 414         }
 415
 416         if(higherTaxonNode != null){
 417             higherTaxonNode.addChildTaxon(taxon, null, null);
 418             getTaxonNodeService().save(higherTaxonNode);
 419         }
 420
 421         if(taxonName.getRank().isGenus()){
 422             ((IAPTImportState)state).getGenusTaxonMap().put(taxonName.getGenusOrUninomial(), taxon);
 423         }
 424
 425         return taxon;
 426     }
 427
 428     private void makeSpecimenTypeData(String typeStr, TaxonName taxonName, String regNumber, SimpleExcelTaxonImportState<CONFIG> state, boolean isFossil) {
 429
 430         Matcher m = typeSpecimenSplitPattern.matcher(typeStr);
 431
 432         if(m.matches()){
 433             String fieldUnitStr = m.group(TypesName.fieldUnit.name());
 434             // boolean isFieldUnit = typeStr.matches(".*([°']|\\d+\\s?m\\s|\\d+\\s?km\\s).*"); // check for location or unit m, km // makes no sense!!!!
 435             FieldUnit fieldUnit = parseFieldUnit(fieldUnitStr, regNumber, state);
 436             if(fieldUnit == null) {
 437                 // create a field unit with only a titleCache using the fieldUnitStr substring
 438                 logger.warn(csvReportLine(regNumber, "Type: fieldUnitStr can not be parsed", fieldUnitStr));
 439                 fieldUnit = FieldUnit.NewInstance();
 440                 fieldUnit.setTitleCache(fieldUnitStr, true);
 441                 getOccurrenceService().save(fieldUnit);
 442             }
 443             getOccurrenceService().save(fieldUnit);
 444
 445             SpecimenOrObservationType specimenType;
 446             if(isFossil){
 447                 specimenType = SpecimenOrObservationType.Fossil;
 448             } else {
 449                 specimenType = SpecimenOrObservationType.PreservedSpecimen;
 450             }
 451
 452             // all others ..
 453             addSpecimenTypes(taxonName, fieldUnit, m.group(TypesName.holotype.name()), TypesName.holotype, false, regNumber, specimenType);
 454             addSpecimenTypes(taxonName, fieldUnit, m.group(TypesName.isotype.name()), TypesName.isotype, true, regNumber, specimenType);
 455
 456         } else {
 457             // create a field unit with only a titleCache using the full typeStr
 458             FieldUnit fieldUnit = FieldUnit.NewInstance();
 459             fieldUnit.setTitleCache(typeStr, true);
 460             getOccurrenceService().save(fieldUnit);
 461             logger.warn(csvReportLine(regNumber, "Type: field 'Type' can not be parsed", typeStr));
 462         }
 463         getNameService().save(taxonName);
 464     }
 465
 466     private void makeNameTypeData(String typeStr, IBotanicalName taxonName, String regNumber, SimpleExcelTaxonImportState<CONFIG> state) {
 467
 468         String nameStr = typeStr.replaceAll("^Type\\s?\\:\\s?", "");
 469         if(nameStr.isEmpty()) {
 470             return;
 471         }
 472
 473         String basionymNameStr = null;
 474         String noteStr = null;
 475         String agentStr = null;
 476
 477         Matcher m;
 478
 479         if(typeStr.startsWith("not to be indicated")){
 480             // Special case:
 481             // Type: not to be indicated (Art. H.9.1. Tokyo Code); stated parent genera: Hechtia Klotzsch; Deuterocohnia Mez
 482             // FIXME
 483             m = typeNameSpecialSplitPattern.matcher(nameStr);
 484             if(m.matches()){
 485                 nameStr = m.group("name");
 486                 noteStr = m.group("note");
 487                 agentStr = m.group("agent");
 488                 // TODO better import of agent?
 489                 if(agentStr != null){
 490                     noteStr = noteStr + ": " + agentStr;
 491                 }
 492             }
 493         } else {
 494             // Generic case
 495             m = typeNameBasionymPattern.matcher(nameStr);
 496             if (m.find()) {
 497                 basionymNameStr = m.group("basionymName");
 498                 if (basionymNameStr != null) {
 499                     nameStr = nameStr.replace(m.group(0), "");
 500                 }
 501             }
 502
 503             m = typeNameNotePattern.matcher(nameStr);
 504             if (m.find()) {
 505                 noteStr = m.group(1);
 506                 if (noteStr != null) {
 507                     nameStr = nameStr.replace(m.group(0), "");
 508                 }
 509             }
 510         }
 511
 512         TaxonName typeName = (TaxonName) nameParser.parseFullName(nameStr, NomenclaturalCode.ICNAFP, null);
 513
 514         if(typeName.isProtectedTitleCache() || typeName.getNomenclaturalReference() != null && typeName.getNomenclaturalReference().isProtectedTitleCache()) {
 515             logger.warn(csvReportLine(regNumber, "NameType not parsable", typeStr, nameStr));
 516         }
 517
 518         if(basionymNameStr != null){
 519             TaxonName basionymName = (TaxonName) nameParser.parseFullName(nameStr, NomenclaturalCode.ICNAFP, null);
 520             getNameService().save(basionymName);
 521             typeName.addBasionym(basionymName);
 522         }
 523
 524
 525         NameTypeDesignation nameTypeDesignation = NameTypeDesignation.NewInstance();
 526         nameTypeDesignation.setTypeName(typeName);
 527         getNameService().save(typeName);
 528
 529         if(noteStr != null){
 530             nameTypeDesignation.addAnnotation(Annotation.NewInstance(noteStr, AnnotationType.EDITORIAL(), Language.UNKNOWN_LANGUAGE()));
 531         }
 532         taxonName.addNameTypeDesignation(typeName, null, null, null, null, false);
 533
 534     }
 535
 536     /**
 537      * Currently only parses the collector, fieldNumber and the collection date.
 538      *
 539      * @param fieldUnitStr
 540      * @param regNumber
 541      * @param state
 542      * @return null if the fieldUnitStr could not be parsed
 543      */
 544     private FieldUnit parseFieldUnit(String fieldUnitStr, String regNumber, SimpleExcelTaxonImportState<CONFIG> state) {
 545
 546         FieldUnit fieldUnit = null;
 547
 548         Matcher m1 = collectorPattern.matcher(fieldUnitStr);
 549         if(m1.matches()){
 550
 551             String collectorData = m1.group(2); // like ... (leg. Metzeltin, 30. 9. 1996)
 552             String removal = m1.group(1);
 553             if(collectorData == null){
 554                 collectorData = m1.group(4); // like ... leg. Metzeltin, 30. 9. 1996
 555                 removal = m1.group(3);
 556             }
 557             if(collectorData == null){
 558                 collectorData = m1.group(6); // like ^leg. J. J. Halda 18.3.1997$
 559                 removal = null;
 560             }
 561             if(collectorData == null){
 562                 return null;
 563             }
 564
 565             // the fieldUnitStr is parsable
 566             // remove all collectorData from the fieldUnitStr and use the rest as locality
 567             String locality = null;
 568             if(removal != null){
 569                 locality = fieldUnitStr.replace(removal, "");
 570             }
 571
 572             String collectorStr = null;
 573             String detailStr = null;
 574             Partial date = null;
 575             String fieldNumber = null;
 576
 577             Matcher m2 = collectionDataPattern.matcher(collectorData);
 578             if(m2.matches()){
 579                 collectorStr = m2.group("collector");
 580                 detailStr = m2.group("detail");
 581
 582                 // Try to make sense of the detailStr
 583                 if(detailStr != null){
 584                     detailStr = detailStr.trim();
 585                     // 1. try to parse as date
 586                     date = parseDate(regNumber, detailStr);
 587                     if(date == null){
 588                         // 2. try to parse as number
 589                         if(collectorsNumber.matcher(detailStr).matches()){
 590                             fieldNumber = detailStr;
 591                         }
 592                     }
 593                 }
 594                 if(date == null && fieldNumber == null){
 595                     // detailed parsing not possible, so need fo fallback
 596                     collectorStr = collectorData;
 597                 }
 598             }
 599
 600             if(collectorStr == null) {
 601                 collectorStr = collectorData;
 602             }
 603
 604             fieldUnit = FieldUnit.NewInstance();
 605             GatheringEvent ge = GatheringEvent.NewInstance();
 606             if(locality != null){
 607                 ge.setLocality(LanguageString.NewInstance(locality, Language.UNKNOWN_LANGUAGE()));
 608             }
 609
 610             TeamOrPersonBase agent =  state.getAgentBase(collectorStr);
 611             if(agent == null) {
 612                 agent = Person.NewTitledInstance(collectorStr);
 613                 getAgentService().save(agent);
 614                 state.putAgentBase(collectorStr, agent);
 615             }
 616             ge.setCollector(agent);
 617
 618             if(date != null){
 619                 ge.setGatheringDate(date);
 620             }
 621
 622             getEventBaseService().save(ge);
 623             fieldUnit.setGatheringEvent(ge);
 624
 625             if(fieldNumber != null) {
 626                 fieldUnit.setFieldNumber(fieldNumber);
 627             }
 628             getOccurrenceService().save(fieldUnit);
 629
 630         }
 631
 632         return fieldUnit;
 633     }
 634
 635     protected Partial parseDate(String regNumber, String dateStr) {
 636
 637         Partial pupDate = null;
 638         boolean parseError = false;
 639
 640         String day = null;
 641         String month = null;
 642         String monthName = null;
 643         String year = null;
 644
 645         for(Pattern p : datePatterns){
 646             Matcher m2 = p.matcher(dateStr);
 647             if(m2.matches()){
 648                 try {
 649                     year = m2.group("year");
 650                 } catch (IllegalArgumentException e){
 651                     // named capture group not found
 652                 }
 653                 try {
 654                     month = m2.group("month");
 655                 } catch (IllegalArgumentException e){
 656                     // named capture group not found
 657                 }
 658
 659                 try {
 660                     monthName = m2.group("monthName");
 661                     month = monthFromName(monthName, regNumber);
 662                     if(month == null){
 663                         parseError = true;
 664                     }
 665                 } catch (IllegalArgumentException e){
 666                     // named capture group not found
 667                 }
 668                 try {
 669                     day = m2.group("day");
 670                 } catch (IllegalArgumentException e){
 671                     // named capture group not found
 672                 }
 673
 674                 if(year != null){
 675                     if (year.length() == 2) {
 676                         // it is an abbreviated year from the 19** years
 677                         year = "19" + year;
 678                     }
 679                     break;
 680                 } else {
 681                     parseError = true;
 682                 }
 683             }
 684         }
 685         if(year == null){
 686             parseError = true;
 687         }
 688         List<DateTimeFieldType> types = new ArrayList<>();
 689         List<Integer> values = new ArrayList<>();
 690         if(!parseError) {
 691             types.add(DateTimeFieldType.year());
 692             values.add(Integer.parseInt(year));
 693             if (month != null) {
 694                 types.add(DateTimeFieldType.monthOfYear());
 695                 values.add(Integer.parseInt(month));
 696             }
 697             if (day != null) {
 698                 types.add(DateTimeFieldType.dayOfMonth());
 699                 values.add(Integer.parseInt(day));
 700             }
 701             pupDate = new Partial(types.toArray(new DateTimeFieldType[types.size()]), ArrayUtils.toPrimitive(values.toArray(new Integer[values.size()])));
 702         }
 703         return pupDate;
 704     }
 705
 706     private String monthFromName(String monthName, String regNumber) {
 707
 708         Integer month = monthFromNameMap.get(monthName.toLowerCase());
 709         if(month == null){
 710             logger.warn(csvReportLine(regNumber, "Unknown month name", monthName));
 711             return null;
 712         } else {
 713             return month.toString();
 714         }
 715     }
 716
 717
 718     private void addSpecimenTypes(IBotanicalName taxonName, FieldUnit fieldUnit, String typeStr, TypesName typeName, boolean multiple, String regNumber, SpecimenOrObservationType specimenType){
 719
 720         if(StringUtils.isEmpty(typeStr)){
 721             return;
 722         }
 723         typeStr = typeStr.trim().replaceAll("\\.$", "");
 724
 725         Collection collection = null;
 726         DerivedUnit specimen = null;
 727
 728         List<DerivedUnit> specimens = new ArrayList<>();
 729         if(multiple){
 730             String[] tokens = typeStr.split("\\s?,\\s?");
 731             for (String t : tokens) {
 732                 // command to  list all complex parsabel types:
 733                 // csvcut -t -c RegistrationNo_Pk,Type iapt.csv | csvgrep -c Type -m "Holotype" | egrep -o 'Holotype:\s([A-Z]*\s)[^.]*?'
 734                 // csvcut -t -c RegistrationNo_Pk,Type iapt.csv | csvgrep -c Type -m "Holotype" | egrep -o 'Isotype[^:]*:\s([A-Z]*\s)[^.]*?'
 735
 736                 if(!t.isEmpty()){
 737                     // trying to parse the string
 738                     specimen = parseSpecimenType(fieldUnit, typeName, collection, t, regNumber);
 739                     if(specimen != null){
 740                         specimens.add(specimen);
 741                     } else {
 742                         // parsing was not successful make simple specimen
 743                         specimens.add(makeSpecimenType(fieldUnit, t, specimenType));
 744                     }
 745                 }
 746             }
 747         } else {
 748             specimen = parseSpecimenType(fieldUnit, typeName, collection, typeStr, regNumber);
 749             if(specimen != null) {
 750                 specimens.add(specimen);
 751                 // remember current collection
 752                 collection = specimen.getCollection();
 753             } else {
 754                 // parsing was not successful make simple specimen
 755                 specimens.add(makeSpecimenType(fieldUnit, typeStr, SpecimenOrObservationType.PreservedSpecimen));
 756             }
 757         }
 758
 759         for(DerivedUnit s : specimens){
 760             taxonName.addSpecimenTypeDesignation(s, typeName.status(), null, null, null, false, true);
 761        }
 762     }
 763
 764     private DerivedUnit makeSpecimenType(FieldUnit fieldUnit, String titleCache, SpecimenOrObservationType specimenType) {
 765         DerivedUnit specimen;DerivedUnitFacade facade = DerivedUnitFacade.NewInstance(specimenType, fieldUnit);
 766         facade.setTitleCache(titleCache.trim(), true);
 767         specimen = facade.innerDerivedUnit();
 768         return specimen;
 769     }
 770
 771     /**
 772      *
 773      * @param fieldUnit
 774      * @param typeName
 775      * @param collection
 776      * @param text
 777      * @param regNumber
 778      * @return
 779      */
 780     protected DerivedUnit parseSpecimenType(FieldUnit fieldUnit, TypesName typeName, Collection collection, String text, String regNumber) {
 781
 782         DerivedUnit specimen = null;
 783
 784         String collectionCode = null;
 785         String subCollectionStr = null;
 786         String instituteStr = null;
 787         String accessionNumber = null;
 788
 789         boolean unusualAccessionNumber = false;
 790
 791         text = text.trim();
 792
 793         // 1.  For Isotypes often the accession number is noted alone if the
 794         //     preceeding entry has a collection code.
 795         if(typeName .equals(TypesName.isotype) && collection != null){
 796             Matcher m = accessionNumberOnlyPattern.matcher(text);
 797             if(m.matches()){
 798                 try {
 799                     accessionNumber = m.group("accNumber");
 800                     specimen = makeSpecimenType(fieldUnit, collection, accessionNumber);
 801                 } catch (IllegalArgumentException e){
 802                     // match group acc_number not found
 803                 }
 804             }
 805         }
 806
 807         //2. try it the 'normal' way
 808         if(specimen == null) {
 809             for (Pattern p : specimenTypePatterns) {
 810                 Matcher m = p.matcher(text);
 811                 if (m.matches()) {
 812                     // collection code or collectionTitle is mandatory
 813                     try {
 814                         collectionCode = m.group("colCode");
 815                     } catch (IllegalArgumentException e){
 816                         // match group colCode not found
 817                     }
 818
 819                     try {
 820                         instituteStr = m.group("institute");
 821                     } catch (IllegalArgumentException e){
 822                         // match group col_name not found
 823                     }
 824
 825                     try {
 826                         subCollectionStr = m.group("subCollection");
 827                     } catch (IllegalArgumentException e){
 828                         // match group subCollection not found
 829                     }
 830                     try {
 831                         accessionNumber = m.group("accNumber");
 832
 833                         // try to improve the accessionNumber
 834                         if(accessionNumber!= null) {
 835                             accessionNumber = accessionNumber.trim();
 836                             Matcher m2 = accessionNumberOnlyPattern.matcher(accessionNumber);
 837                             String betterAccessionNumber = null;
 838                             if (m2.matches()) {
 839                                 try {
 840                                     betterAccessionNumber = m.group("accNumber");
 841                                 } catch (IllegalArgumentException e) {
 842                                     // match group acc_number not found
 843                                 }
 844                             }
 845                             if (betterAccessionNumber != null) {
 846                                 accessionNumber = betterAccessionNumber;
 847                             } else {
 848                                 unusualAccessionNumber = true;
 849                             }
 850                         }
 851
 852                     } catch (IllegalArgumentException e){
 853                         // match group acc_number not found
 854                     }
 855
 856                     if(collectionCode == null && instituteStr == null){
 857                         logger.warn(csvReportLine(regNumber, "Type: neither 'collectionCode' nor 'institute' found in ", text));
 858                         continue;
 859                     }
 860                     collection = getCollection(collectionCode, instituteStr, subCollectionStr);
 861                     specimen = makeSpecimenType(fieldUnit, collection, accessionNumber);
 862                     break;
 863                 }
 864             }
 865         }
 866         if(specimen == null) {
 867             logger.warn(csvReportLine(regNumber, "Type: Could not parse specimen", typeName.name().toString(), text));
 868         }
 869         if(unusualAccessionNumber){
 870             logger.warn(csvReportLine(regNumber, "Type: Unusual accession number", typeName.name().toString(), text, accessionNumber));
 871         }
 872         return specimen;
 873     }
 874
 875     private DerivedUnit makeSpecimenType(FieldUnit fieldUnit, Collection collection, String accessionNumber) {
 876
 877         DerivedUnitFacade facade = DerivedUnitFacade.NewInstance(SpecimenOrObservationType.PreservedSpecimen, fieldUnit);
 878         facade.setCollection(collection);
 879         if(accessionNumber != null){
 880             facade.setAccessionNumber(accessionNumber);
 881         }
 882         return facade.innerDerivedUnit();
 883     }
 884
 885     private TaxonName makeBotanicalName(SimpleExcelTaxonImportState<CONFIG> state, String regNumber, String titleCacheStr, String nameStr,
 886                                             String authorStr, String nomRefTitle) {
 887
 888         TaxonName taxonName;// cache field for the taxonName.titleCache
 889         String taxonNameTitleCache = null;
 890         Map<String, AnnotationType> nameAnnotations = new HashMap<>();
 891
 892         // TitleCache preprocessing
 893         if(titleCacheStr.endsWith(ANNOTATION_MARKER_STRING) || (authorStr != null && authorStr.endsWith(ANNOTATION_MARKER_STRING))){
 894             nameAnnotations.put("Author abbreviation not checked.", AnnotationType.EDITORIAL());
 895             titleCacheStr = titleCacheStr.replace(ANNOTATION_MARKER_STRING, "").trim();
 896             if(authorStr != null) {
 897                 authorStr = authorStr.replace(ANNOTATION_MARKER_STRING, "").trim();
 898             }
 899         }
 900
 901         // parse the full taxon name
 902         if(!StringUtils.isEmpty(nomRefTitle)){
 903             String referenceSeparator = nomRefTitle.startsWith("in ") ? " " : ", ";
 904             String taxonFullNameStr = titleCacheStr + referenceSeparator + nomRefTitle;
 905             logger.debug(":::::" + taxonFullNameStr);
 906             taxonName = nameParser.parseReferencedName(taxonFullNameStr, NomenclaturalCode.ICNAFP, null);
 907         } else {
 908             taxonName = (TaxonName) nameParser.parseFullName(titleCacheStr, NomenclaturalCode.ICNAFP, null);
 909         }
 910
 911         taxonNameTitleCache = taxonName.getTitleCache().trim();
 912         if (taxonName.isProtectedTitleCache()) {
 913             logger.warn(csvReportLine(regNumber, "Name could not be parsed", titleCacheStr));
 914         } else {
 915
 916             boolean doRestoreTitleCacheStr = false;
 917
 918             // Check if titleCache and nameCache are plausible
 919             String titleCacheCompareStr = titleCacheStr;
 920             String nameCache = taxonName.getNameCache();
 921             String nameCompareStr = nameStr;
 922             if(taxonName.isBinomHybrid()){
 923                 titleCacheCompareStr = titleCacheCompareStr.replace(" x ", " ×");
 924                 nameCompareStr = nameCompareStr.replace(" x ", " ×");
 925             }
 926             if(taxonName.isMonomHybrid()){
 927                 titleCacheCompareStr = titleCacheCompareStr.replaceAll("^X ", "× ");
 928                 nameCompareStr = nameCompareStr.replace("^X ", "× ");
 929             }
 930             if(authorStr != null && authorStr.contains(" et ")){
 931                 titleCacheCompareStr = titleCacheCompareStr.replaceAll(" et ", " & ");
 932             }
 933             if (!taxonNameTitleCache.equals(titleCacheCompareStr)) {
 934                 logger.warn(csvReportLine(regNumber, "The generated titleCache differs from the imported string", taxonNameTitleCache, " != ", titleCacheStr, " ==> original titleCacheStr has been restored"));
 935                 doRestoreTitleCacheStr = true;
 936             }
 937             if (!nameCache.trim().equals(nameCompareStr)) {
 938                 logger.warn(csvReportLine(regNumber, "The parsed nameCache differs from field '" + NAMESTRING + "'", nameCache, " != ", nameCompareStr));
 939             }
 940
 941             //  Author
 942             //nameParser.handleAuthors(taxonName, titleCacheStr, authorStr);
 943             //if (!titleCacheStr.equals(taxonName.getTitleCache())) {
 944             //    logger.warn(regNumber + ": titleCache has changed after setting authors, will restore original titleCacheStr");
 945             //    doRestoreTitleCacheStr = true;
 946             //}
 947
 948             if(doRestoreTitleCacheStr){
 949                 taxonName.setTitleCache(titleCacheStr, true);
 950             }
 951
 952             // deduplicate
 953             replaceAuthorNamesAndNomRef(state, taxonName);
 954         }
 955
 956         // Annotations
 957         if(!nameAnnotations.isEmpty()){
 958             for(String text : nameAnnotations.keySet()){
 959                 taxonName.addAnnotation(Annotation.NewInstance(text, nameAnnotations.get(text), Language.DEFAULT()));
 960             }
 961         }
 962
 963         taxonName.addSource(OriginalSourceType.Import, regNumber, null, state.getConfig().getSourceReference(), null);
 964
 965         getNameService().save(taxonName);
 966
 967         return taxonName;
 968     }
 969
 970     /**
 971      * @param state
 972      * @return
 973      */
 974     private TaxonNode getClassificationRootNode(IAPTImportState state) {
 975
 976      //   Classification classification = state.getClassification();
 977      //   if (classification == null){
 978      //       IAPTImportConfigurator config = state.getConfig();
 979      //       classification = Classification.NewInstance(state.getConfig().getClassificationName());
 980      //       classification.setUuid(config.getClassificationUuid());
 981      //       classification.setReference(config.getSecReference());
 982      //       classification = getClassificationService().find(state.getConfig().getClassificationUuid());
 983      //   }
 984         TaxonNode rootNode = state.getRootNode();
 985         if (rootNode == null){
 986             rootNode = getTaxonNodeService().find(ROOT_UUID);
 987         }
 988         if (rootNode == null){
 989             Classification classification = state.getClassification();
 990             if (classification == null){
 991                 Reference sec = state.getSecReference();
 992                 String classificationName = state.getConfig().getClassificationName();
 993                 Language language = Language.DEFAULT();
 994                 classification = Classification.NewInstance(classificationName, sec, language);
 995                 state.setClassification(classification);
 996                 classification.setUuid(state.getConfig().getClassificationUuid());
 997                 classification.getRootNode().setUuid(ROOT_UUID);
 998                 getClassificationService().save(classification);
 999             }
1000             rootNode = classification.getRootNode();
1001             state.setRootNode(rootNode);
1002         }
1003         return rootNode;
1004     }
1005
1006     private Collection getCollection(String collectionCode, String instituteStr, String subCollectionStr){
1007
1008         Collection superCollection = null;
1009         if(subCollectionStr != null){
1010             superCollection = getCollection(collectionCode, instituteStr, null);
1011             collectionCode = subCollectionStr;
1012             instituteStr = null;
1013         }
1014
1015         final String key = collectionCode + "-#i:" + StringUtils.defaultString(instituteStr);
1016
1017         Collection collection = collectionMap.get(key);
1018
1019         if(collection == null) {
1020             collection = Collection.NewInstance();
1021             collection.setCode(collectionCode);
1022             if(instituteStr != null){
1023                 collection.setInstitute(Institution.NewNamedInstance(instituteStr));
1024             }
1025             if(superCollection != null){
1026                 collection.setSuperCollection(superCollection);
1027             }
1028             collectionMap.put(key, collection);
1029             if(!_testMode) {
1030                 getCollectionService().save(collection);
1031             }
1032         }
1033
1034         return collection;
1035     }
1036
1037
1038     /**
1039      * @param record
1040      * @param originalKey
1041      * @param doUnescapeHtmlEntities
1042      * @return
1043      */
1044     private String getValue(Map<String, String> record, String originalKey, boolean doUnescapeHtmlEntities) {
1045         String value = record.get(originalKey);
1046
1047         value = fixCharacters(value);
1048
1049         if (! StringUtils.isBlank(value)) {
1050                 if (logger.isDebugEnabled()) {
1051                     logger.debug(originalKey + ": " + value);
1052                 }
1053                 value = CdmUtils.removeDuplicateWhitespace(value.trim()).toString();
1054             if(doUnescapeHtmlEntities){
1055                 value = StringEscapeUtils.unescapeHtml(value);
1056             }
1057                 return value.trim();
1058         }else{
1059                 return null;
1060         }
1061     }
1062
1063     /**
1064      * Fixes broken characters.
1065      * For details see
1066      * https://dev.e-taxonomy.eu/redmine/issues/6035
1067      *
1068      * @param value
1069      * @return
1070      */
1071     private String fixCharacters(String value) {
1072
1073         value = StringUtils.replace(value, "s$K", "š");
1074         value = StringUtils.replace(value, "n$K", "ň");
1075         value = StringUtils.replace(value, "e$K", "ě");
1076         value = StringUtils.replace(value, "r$K", "ř");
1077         value = StringUtils.replace(value, "c$K", "č");
1078         value = StringUtils.replace(value, "z$K", "ž");
1079         value = StringUtils.replace(value, "S>U$K", "Š");
1080         value = StringUtils.replace(value, "C>U$K", "Č");
1081         value = StringUtils.replace(value, "R>U$K", "Ř");
1082         value = StringUtils.replace(value, "Z>U$K", "Ž");
1083         value = StringUtils.replace(value, "g$K", "ǧ");
1084         value = StringUtils.replace(value, "s$A", "ś");
1085         value = StringUtils.replace(value, "n$A", "ń");
1086         value = StringUtils.replace(value, "c$A", "ć");
1087         value = StringUtils.replace(value, "e$E", "ę");
1088         value = StringUtils.replace(value, "o$H", "õ");
1089         value = StringUtils.replace(value, "s$C", "ş");
1090         value = StringUtils.replace(value, "t$C", "ț");
1091         value = StringUtils.replace(value, "S>U$C", "Ş");
1092         value = StringUtils.replace(value, "a$O", "å");
1093         value = StringUtils.replace(value, "A>U$O", "Å");
1094         value = StringUtils.replace(value, "u$O", "ů");
1095         value = StringUtils.replace(value, "g$B", "ğ");
1096         value = StringUtils.replace(value, "g$B", "ĕ");
1097         value = StringUtils.replace(value, "a$B", "ă");
1098         value = StringUtils.replace(value, "l$/", "ł");
1099         value = StringUtils.replace(value, ">i", "ı");
1100         value = StringUtils.replace(value, "i$U", "ï");
1101         // Special-cases
1102         value = StringUtils.replace(value, "&yacute", "ý");
1103         value = StringUtils.replace(value, ">L", "Ł"); // corrected rule
1104         value = StringUtils.replace(value, "E>U$D", "З");
1105         value = StringUtils.replace(value, "S>U$E", "Ş");
1106         value = StringUtils.replace(value, "s$E", "ş");
1107
1108         value = StringUtils.replace(value, "c$k", "č");
1109         value = StringUtils.replace(value, " U$K", " Š");
1110
1111         value = StringUtils.replace(value, "O>U>!", "Ø");
1112         value = StringUtils.replace(value, "o>!", "ø");
1113         value = StringUtils.replace(value, "S$K", "Ŝ");
1114         value = StringUtils.replace(value, ">l", "ğ");
1115
1116         value = StringUtils.replace(value, "§B>i", "ł");
1117         value = StringUtils.replace(value, "¤", "ń");
1118
1119         return value;
1120     }
1121
1122
1123     /**
1124          *  Stores taxa records in DB
1125          */
1126         @Override
1127     protected void firstPass(SimpleExcelTaxonImportState<CONFIG> state) {
1128
1129         if(excludeFromImport(state)){
1130             return;
1131         }
1132
1133         String lineNumber = "L#" + state.getCurrentLine() + ": ";
1134         LogUtils.setLevel(logger, Level.DEBUG);
1135         Map<String, String> record = state.getOriginalRecord();
1136         logger.debug(lineNumber + record.toString());
1137
1138         Set<String> keys = record.keySet();
1139         for (String key: keys) {
1140             if (! expectedKeys.contains(key)){
1141                 logger.warn(lineNumber + "Unexpected Key: " + key);
1142             }
1143         }
1144
1145         String reg_id = record.get(REGISTRATIONNO_PK);
1146
1147         //higherTaxon
1148         String higherTaxaString = record.get(HIGHERTAXON);
1149         boolean isFossil = false;
1150         if(higherTaxaString.startsWith("FOSSIL ")){
1151             higherTaxaString = higherTaxaString.replace("FOSSIL ", "");
1152             isFossil = true;
1153         }
1154         TaxonNode higherTaxon = getHigherTaxon(higherTaxaString, (IAPTImportState)state);
1155
1156        //Taxon
1157         Taxon taxon = makeTaxon(record, state, higherTaxon, isFossil);
1158         if (taxon == null){
1159             logger.warn(lineNumber + "taxon could not be created and is null");
1160             return;
1161         }
1162         ((IAPTImportState)state).setCurrentTaxon(taxon);
1163
1164         // Registration
1165         IAPTRegData regData = makeIAPTRegData(state);
1166         ObjectMapper mapper = new ObjectMapper();
1167         try {
1168             String regdataJson = mapper.writeValueAsString(regData);
1169             Extension.NewInstance(taxon.getName(), regdataJson, getExtensionTypeIAPTRegData());
1170             getNameService().save(taxon.getName());
1171         } catch (JsonProcessingException e) {
1172             logger.error("Error on converting IAPTRegData", e);
1173         }
1174
1175         logger.info("#of imported Genera: " + ((IAPTImportState) state).getGenusTaxonMap().size());
1176                 return;
1177     }
1178
1179     private boolean excludeFromImport(SimpleExcelTaxonImportState<CONFIG> state) {
1180         if(state.getConfig().isDoAlgeaeOnly()){
1181             boolean include = false;
1182             String higherTaxon = getValue(state.getOriginalRecord(), HIGHERTAXON, true);
1183             String fullNameStr = getValue(state.getOriginalRecord(), FULLNAME, true);
1184             include |= higherTaxon.matches(".*?PHYCEAE(?:$|\\s+)");
1185             for(String test : new String[]{
1186                     "Bolidophyceae ",
1187                     "Phaeothamniophyceae ",
1188                     "Bolidomonadales ",
1189                     "Bolidomonadaceae ",
1190                     "Aureoumbra ",
1191                     "Bolidomonas ",
1192                     "Seagriefia ",
1193                     "Navicula "
1194                 }) {
1195                 include |= fullNameStr.startsWith(test);
1196             }
1197             return !include;
1198         }
1199
1200         return false;
1201     }
1202
1203     private ExtensionType getExtensionTypeIAPTRegData() {
1204         if(extensionTypeIAPTRegData == null){
1205             extensionTypeIAPTRegData = ExtensionType.NewInstance("IAPTRegData.json", "IAPTRegData.json", "");
1206             getTermService().save(extensionTypeIAPTRegData);
1207         }
1208         return extensionTypeIAPTRegData;
1209     }
1210
1211     private IAPTRegData makeIAPTRegData(SimpleExcelTaxonImportState<CONFIG> state) {
1212
1213         Map<String, String> record = state.getOriginalRecord();
1214         String registrationStr = getValue(record, REGISTRATION);
1215         String regDateStr = getValue(record, REGDATE);
1216         String regStr = getValue(record, REGISTRATION, true);
1217
1218         String dateStr = null;
1219         String office = null;
1220         Integer regID = null;
1221         Integer formNo = null;
1222
1223         Matcher m = registrationPattern.matcher(registrationStr);
1224         if(m.matches()){
1225             dateStr = m.group("regdate");
1226             if(parseDate( regStr, dateStr) == null){
1227                 // check for valid dates
1228                 logger.warn(csvReportLine(regStr, REGISTRATION + ": could not parse date", dateStr, " in ", registrationStr));
1229             }
1230             office = m.group("office");
1231             regID = Integer.valueOf(m.group("regid"));
1232             try {
1233                 formNo = Integer.valueOf(m.group("formNo"));
1234             } catch(IllegalArgumentException e){
1235                 // ignore
1236             }
1237         } else {
1238             logger.warn(csvReportLine(regStr, REGISTRATION + ": could not be parsed", registrationStr));
1239         }
1240         IAPTRegData regData = new IAPTRegData(dateStr, office, regID, formNo);
1241         return regData;
1242     }
1243
1244     private TaxonNode getHigherTaxon(String higherTaxaString, IAPTImportState state) {
1245         String[] higherTaxaNames = higherTaxaString.toLowerCase().replaceAll("[\\[\\]]", "").split(":");
1246         TaxonNode higherTaxonNode = null;
1247
1248         ITaxonTreeNode rootNode = getClassificationRootNode(state);
1249         for (String htn :  higherTaxaNames) {
1250             htn = StringUtils.capitalize(htn.trim());
1251             Taxon higherTaxon = state.getHigherTaxon(htn);
1252             if (higherTaxon != null){
1253                 higherTaxonNode = higherTaxon.getTaxonNodes().iterator().next();
1254             }else{
1255                 IBotanicalName name = makeHigherTaxonName(state, htn);
1256                 Reference sec = state.getSecReference();
1257                 higherTaxon = Taxon.NewInstance(name, sec);
1258                 getTaxonService().save(higherTaxon);
1259                 higherTaxonNode = rootNode.addChildTaxon(higherTaxon, sec, null);
1260                 state.putHigherTaxon(htn, higherTaxon);
1261                 getClassificationService().saveTreeNode(higherTaxonNode);
1262             }
1263             rootNode = higherTaxonNode;
1264         }
1265         return higherTaxonNode;
1266     }
1267
1268     private IBotanicalName makeHigherTaxonName(IAPTImportState state, String name) {
1269
1270         Rank rank = guessRank(name);
1271
1272         IBotanicalName taxonName = TaxonNameFactory.NewBotanicalInstance(rank);
1273         taxonName.addSource(makeOriginalSource(state));
1274         taxonName.setGenusOrUninomial(StringUtils.capitalize(name));
1275         return taxonName;
1276     }
1277
1278     private Rank guessRank(String name) {
1279
1280         // normalize
1281         name = name.replaceAll("\\(.*\\)", "").trim();
1282
1283         if(name.matches("^Plantae$|^Fungi$")){
1284            return Rank.KINGDOM();
1285         } else if(name.matches("^Incertae sedis$|^No group assigned$")){
1286            return rankFamilyIncertisSedis();
1287         } else if(name.matches(".*phyta$|.*mycota$")){
1288            return Rank.PHYLUM();
1289         } else if(name.matches(".*phytina$|.*mycotina$")){
1290            return Rank.SUBPHYLUM();
1291         } else if(name.matches("Gymnospermae$|.*ones$")){ // Monocotyledones, Dicotyledones
1292             return rankUnrankedSupraGeneric();
1293         } else if(name.matches(".*opsida$|.*phyceae$|.*mycetes$|.*ones$|^Musci$|^Hepaticae$")){
1294            return Rank.CLASS();
1295         } else if(name.matches(".*idae$|.*phycidae$|.*mycetidae$")){
1296            return Rank.SUBCLASS();
1297         } else if(name.matches(".*ales$")){
1298            return Rank.ORDER();
1299         } else if(name.matches(".*ineae$")){
1300            return Rank.SUBORDER();
1301         } else if(name.matches(".*aceae$")){
1302             return Rank.FAMILY();
1303         } else if(name.matches(".*oideae$")){
1304            return Rank.SUBFAMILY();
1305         } else
1306         //    if(name.matches(".*eae$")){
1307         //    return Rank.TRIBE();
1308         // } else
1309             if(name.matches(".*inae$")){
1310            return Rank.SUBTRIBE();
1311         } else if(name.matches(".*ae$")){
1312            return Rank.FAMILY();
1313         }
1314         return Rank.UNKNOWN_RANK();
1315     }
1316
1317     private Rank rankUnrankedSupraGeneric() {
1318
1319         if(rankUnrankedSupraGeneric == null){
1320             rankUnrankedSupraGeneric = Rank.NewInstance(RankClass.Suprageneric, "Unranked supra generic", " ", " ");
1321             getTermService().save(rankUnrankedSupraGeneric);
1322         }
1323         return rankUnrankedSupraGeneric;
1324     }
1325
1326     private Rank rankFamilyIncertisSedis() {
1327
1328         if(familyIncertisSedis == null){
1329             familyIncertisSedis = Rank.NewInstance(RankClass.Suprageneric, "Family incertis sedis", " ", " ");
1330             getTermService().save(familyIncertisSedis);
1331         }
1332         return familyIncertisSedis;
1333     }
1334
1335     private AnnotationType annotationTypeCaveats(){
1336         if(annotationTypeCaveats == null){
1337             annotationTypeCaveats = AnnotationType.NewInstance("Caveats", "Caveats", "");
1338             getTermService().save(annotationTypeCaveats);
1339         }
1340         return annotationTypeCaveats;
1341     }
1342
1343
1344     /**
1345      * @param state
1346      * @return
1347      */
1348     private IdentifiableSource makeOriginalSource(IAPTImportState state) {
1349         return IdentifiableSource.NewDataImportInstance("line: " + state.getCurrentLine(), null, state.getConfig().getSourceReference());
1350     }
1351
1352
1353     private Reference makeReference(IAPTImportState state, UUID uuidRef) {
1354         Reference ref = state.getReference(uuidRef);
1355         if (ref == null){
1356             ref = getReferenceService().find(uuidRef);
1357             state.putReference(uuidRef, ref);
1358         }
1359         return ref;
1360     }
1361
1362     private MarkerType markerTypeFossil(){
1363         if(this.markerTypeFossil == null){
1364             markerTypeFossil = MarkerType.NewInstance("isFossilTaxon", "isFossil", null);
1365             getTermService().save(this.markerTypeFossil);
1366         }
1367         return markerTypeFossil;
1368     }
1369
1370     private MarkerType markerDuplicateRegistration(){
1371         if(this.duplicateRegistration == null){
1372             duplicateRegistration = MarkerType.NewInstance("duplicateRegistration", "duplicateRegistration", null);
1373             getTermService().save(this.duplicateRegistration);
1374         }
1375         return markerTypeFossil;
1376     }
1377
1378     private String csvReportLine(String regId, String message, String ... fields){
1379         StringBuilder out = new StringBuilder("regID#");
1380         out.append(regId).append(",\"").append(message).append('"');
1381
1382         for(String f : fields){
1383             out.append(",\"").append(f).append('"');
1384         }
1385         return out.toString();
1386     }
1387
1388
1389 }