app-import/src/main/java/eu/etaxonomy/cdm/io/iapt/IAPTExcelImport.java

   1 /**
   2  * Copyright (C) 2007 EDIT
   3  * European Distributed Institute of Taxonomy
   4  * http://www.e-taxonomy.eu
   5  *
   6  * The contents of this file are subject to the Mozilla Public License Version 1.1
   7  * See LICENSE.TXT at the top of this package for the full license terms.
   8  */
   9
  10 package eu.etaxonomy.cdm.io.iapt;
  11
  12 import eu.etaxonomy.cdm.api.facade.DerivedUnitFacade;
  13 import eu.etaxonomy.cdm.common.CdmUtils;
  14 import eu.etaxonomy.cdm.io.mexico.SimpleExcelTaxonImport;
  15 import eu.etaxonomy.cdm.io.mexico.SimpleExcelTaxonImportState;
  16 import eu.etaxonomy.cdm.model.agent.Institution;
  17 import eu.etaxonomy.cdm.model.agent.Person;
  18 import eu.etaxonomy.cdm.model.agent.TeamOrPersonBase;
  19 import eu.etaxonomy.cdm.model.common.*;
  20 import eu.etaxonomy.cdm.model.name.*;
  21 import eu.etaxonomy.cdm.model.occurrence.*;
  22 import eu.etaxonomy.cdm.model.occurrence.Collection;
  23 import eu.etaxonomy.cdm.model.reference.Reference;
  24 import eu.etaxonomy.cdm.model.reference.ReferenceFactory;
  25 import eu.etaxonomy.cdm.model.reference.ReferenceType;
  26 import eu.etaxonomy.cdm.model.taxon.*;
  27 import eu.etaxonomy.cdm.strategy.parser.NonViralNameParserImpl;
  28 import eu.etaxonomy.cdm.strategy.parser.ParserProblem;
  29 import org.apache.commons.lang.ArrayUtils;
  30 import org.apache.commons.lang.StringEscapeUtils;
  31 import org.apache.commons.lang.StringUtils;
  32 import org.apache.log4j.Level;
  33 import org.apache.log4j.Logger;
  34 import org.joda.time.DateTimeFieldType;
  35 import org.joda.time.Partial;
  36 import org.joda.time.format.DateTimeFormat;
  37 import org.joda.time.format.DateTimeFormatter;
  38 import org.springframework.stereotype.Component;
  39
  40 import java.util.*;
  41 import java.util.regex.Matcher;
  42 import java.util.regex.Pattern;
  43
  44 /**
  45  * @author a.mueller
  46  * @created 05.01.2016
  47  */
  48
  49 @Component("iAPTExcelImport")
  50 public class IAPTExcelImport<CONFIG extends IAPTImportConfigurator> extends SimpleExcelTaxonImport<CONFIG> {
  51     private static final long serialVersionUID = -747486709409732371L;
  52     private static final Logger logger = Logger.getLogger(IAPTExcelImport.class);
  53     public static final String ANNOTATION_MARKER_STRING = "[*]";
  54
  55
  56     private static UUID ROOT_UUID = UUID.fromString("4137fd2a-20f6-4e70-80b9-f296daf51d82");
  57
  58     private static NonViralNameParserImpl nameParser = NonViralNameParserImpl.NewInstance();
  59
  60     private final static String REGISTRATIONNO_PK= "RegistrationNo_Pk";
  61     private final static String HIGHERTAXON= "HigherTaxon";
  62     private final static String FULLNAME= "FullName";
  63     private final static String AUTHORSSPELLING= "AuthorsSpelling";
  64     private final static String LITSTRING= "LitString";
  65     private final static String REGISTRATION= "Registration";
  66     private final static String TYPE= "Type";
  67     private final static String CAVEATS= "Caveats";
  68     private final static String FULLBASIONYM= "FullBasionym";
  69     private final static String FULLSYNSUBST= "FullSynSubst";
  70     private final static String NOTESTXT= "NotesTxt";
  71     private final static String REGDATE= "RegDate";
  72     private final static String NAMESTRING= "NameString";
  73     private final static String BASIONYMSTRING= "BasionymString";
  74     private final static String SYNSUBSTSTR= "SynSubstStr";
  75     private final static String AUTHORSTRING= "AuthorString";
  76
  77     private  static List<String> expectedKeys= Arrays.asList(new String[]{
  78             REGISTRATIONNO_PK, HIGHERTAXON, FULLNAME, AUTHORSSPELLING, LITSTRING, REGISTRATION, TYPE, CAVEATS, FULLBASIONYM, FULLSYNSUBST, NOTESTXT, REGDATE, NAMESTRING, BASIONYMSTRING, SYNSUBSTSTR, AUTHORSTRING});
  79
  80     private static final Pattern nomRefTokenizeP = Pattern.compile("^(?<title>.*):\\s(?<detail>[^\\.:]+)\\.(?<date>.*?)(?:\\s\\((?<issue>[^\\)]*)\\)\\s*)?\\.?$");
  81     private static final Pattern[] datePatterns = new Pattern[]{
  82             // NOTE:
  83             // The order of the patterns is extremely important!!!
  84             //
  85             // all patterns cover the years 1700 - 1999
  86             Pattern.compile("^(?<year>1[7,8,9][0-9]{2})$"), // only year, like '1969'
  87             Pattern.compile("^(?<monthName>\\p{L}+\\.?)\\s(?<day>[0-9]{1,2})(?:st|rd|th)?\\.?,?\\s(?<year>(?:1[7,8,9])?[0-9]{2})$"), // full date like April 12, 1969 or april 12th 1999
  88             Pattern.compile("^(?<monthName>\\p{L}+\\.?),?\\s?(?<year>(?:1[7,8,9])?[0-9]{2})$"), // April 99 or April, 1999 or Apr. 12
  89             Pattern.compile("^(?<day>[0-9]{1,2})([\\.\\-/])(\\s?)(?<month>[0-1]?[0-9])\\2\\3(?<year>(?:1[7,8,9])?[0-9]{2})$"), // full date like 12.04.1969 or 12. 04. 1969 or 12/04/1969 or 12-04-1969
  90             Pattern.compile("^(?<day>[0-9]{1,2})([\\.\\-/])(?<month>[IVX]{1,2})\\2(?<year>(?:1[7,8,9])?[0-9]{2})$"), // full date like 12-VI-1969
  91             Pattern.compile("^(?:(?<day>[0-9]{1,2})(?:\\sde)\\s)(?<monthName>\\p{L}+)\\sde\\s(?<year>(?:1[7,8,9])?[0-9]{2})$"), // full and partial date like 12 de Enero de 1999 or Enero de 1999
  92             Pattern.compile("^(?<month>[0-1]?[0-9])([\\.\\-/])(?<year>(?:1[7,8,9])?[0-9]{2})$"), // partial date like 04.1969 or 04/1969 or 04-1969
  93             Pattern.compile("^(?<year>(?:1[7,8,9])?[0-9]{2})([\\.\\-/])(?<month>[0-1]?[0-9])$"),//  partial date like 1999-04
  94             Pattern.compile("^(?<month>[IVX]{1,2})([\\.\\-/])(?<year>(?:1[7,8,9])?[0-9]{2})$"), // partial date like VI-1969
  95             Pattern.compile("^(?<day>[0-9]{1,2})(?:[\\./]|th|rd|st)?\\s(?<monthName>\\p{L}+\\.?),?\\s?(?<year>(?:1[7,8,9])?[0-9]{2})$"), // full date like 12. April 1969 or april 1999 or 22 Dec.1999
  96         };
  97     private static final Pattern typeSpecimenSplitPattern =  Pattern.compile("^(?:\"*[Tt]ype: (?<fieldUnit>.*?))(?:[Hh]olotype:(?<holotype>.*?)\\.?)?(?:[Ii]sotype[^:]*:(?<isotype>.*)\\.?)?\\.?$");
  98
  99     private static final Pattern typeNameBasionymPattern =  Pattern.compile("\\([Bb]asionym\\s?\\:\\s?(?<basionymName>[^\\)]*).*$");
 100     private static final Pattern typeNameNotePattern =  Pattern.compile("\\[([^\\[]*)"); // matches the inner of '[...]'
 101     private static final Pattern typeNameSpecialSplitPattern =  Pattern.compile("(?<note>.*\\;.*?)\\:(?<agent>)\\;(<name>.*)");
 102
 103     private static final Pattern collectorPattern =  Pattern.compile(".*?(?<fullStr1>\\(leg\\.\\s+(?<data1>[^\\)]*)\\))|.*?(?<fullStr2>\\sleg\\.\\s+(?<data2>.*?)\\.?)$");
 104     private static final Pattern collectionDataPattern =  Pattern.compile("^(?<collector>[^,]*),\\s?(?<detail>.*?)\\.?$");
 105     private static final Pattern collectorsNumber =  Pattern.compile("^([nN]o\\.\\s.*)$");
 106
 107     // AccessionNumbers: , #.*, n°:?, 96/3293, No..*, -?\w{1,3}-[0-9\-/]*
 108     private static final Pattern accessionNumberOnlyPattern = Pattern.compile("^(?<accNumber>(?:n°\\:?\\s?|#|No\\.?\\s?)?[\\d\\w\\-/]*)$");
 109
 110     private static final Pattern[] specimenTypePatterns = new Pattern[]{
 111             Pattern.compile("^(?<colCode>[A-Z]+|CPC Micropaleontology Lab\\.?)\\s+(?:\\((?<institute>.*[^\\)])\\))(?<accNumber>.*)?$"), // like: GAUF (Gansu Agricultural University) No. 1207-1222
 112             Pattern.compile("^(?<colCode>[A-Z]+|CPC Micropaleontology Lab\\.?)\\s+(?:Coll\\.\\s(?<subCollection>[^\\.,;]*)(.))(?<accNumber>.*)?$"), // like KASSEL Coll. Krasske, Praep. DII 78
 113             Pattern.compile("^(?:Coll\\.\\s(?<subCollection>[^\\.,;]*)(.))(?<institute>.*?)(?<accNumber>Praep\\..*)?$"), // like Coll. Lange-Bertalot, Bot. Inst., Univ. Frankfurt/Main, Germany Praep. Neukaledonien OTL 62
 114             Pattern.compile("^(?<colCode>[A-Z]+)(?:\\s+(?<accNumber>.*))?$"), // identifies the Collection code and takes the rest as accessionNumber if any
 115     };
 116
 117     private static Map<String, Integer> monthFromNameMap = new HashMap<>();
 118
 119     static {
 120         String[] ck = new String[]{"leden", "únor", "březen", "duben", "květen", "červen", "červenec ", "srpen", "září", "říjen", "listopad", "prosinec"};
 121         String[] fr = new String[]{"janvier", "février", "mars", "avril", "mai", "juin", "juillet", "août", "septembre", "octobre", "novembre", "décembre"};
 122         String[] de = new String[]{"januar", "februar", "märz", "april", "mai", "juni", "juli", "august", "september", "oktober", "november", "dezember"};
 123         String[] en = new String[]{"january", "february", "march", "april", "may", "june", "july", "august", "september", "october", "november", "december"};
 124         String[] it = new String[]{"gennaio", "febbraio", "marzo", "aprile", "maggio", "giugno", "luglio", "agosto", "settembre", "ottobre", "novembre", "dicembre"};
 125         String[] sp = new String[]{"enero", "febrero", "marzo", "abril", "mayo", "junio", "julio", "agosto", "septiembre", "octubre", "noviembre", "diciembre"};
 126         String[] de_abbrev = new String[]{"jan.", "feb.", "märz", "apr.", "mai", "jun.", "jul.", "aug.", "sept.", "okt.", "nov.", "dez."};
 127         String[] en_abbrev = new String[]{"jan.", "feb.", "mar.", "apr.", "may", "jun.", "jul.", "aug.", "sep.", "oct.", "nov.", "dec."};
 128         String[] port = new String[]{"Janeiro", "Fevereiro", "Março", "Abril", "Maio", "Junho", "Julho", "Agosto", "Setembro", "Outubro", "Novembro", "Dezembro"};
 129         String[] rom_num = new String[]{"i", "ii", "iii", "iv", "v", "vi", "vii", "viii", "ix", "x", "xi", "xii"};
 130
 131         String[][] perLang =  new String[][]{ck, de, fr, en, it, sp, port, de_abbrev, en_abbrev, rom_num};
 132
 133         for (String[] months: perLang) {
 134             for(int m = 1; m < 13; m++){
 135                 monthFromNameMap.put(months[m - 1].toLowerCase(), m);
 136             }
 137         }
 138
 139         // special cases
 140         monthFromNameMap.put("mar", 3);
 141         monthFromNameMap.put("dec", 12);
 142         monthFromNameMap.put("Februari", 2);
 143     }
 144
 145
 146     DateTimeFormatter formatterYear = DateTimeFormat.forPattern("yyyy");
 147
 148     private Map<String, Collection> collectionMap = new HashMap<>();
 149
 150
 151     enum TypesName {
 152         fieldUnit, holotype, isotype;
 153
 154         public SpecimenTypeDesignationStatus status(){
 155             switch (this) {
 156                 case holotype:
 157                     return SpecimenTypeDesignationStatus.HOLOTYPE();
 158                 case isotype:
 159                     return SpecimenTypeDesignationStatus.ISOTYPE();
 160                 default:
 161                     return null;
 162             }
 163         }
 164     }
 165
 166     private MarkerType markerTypeFossil = null;
 167     private Rank rankUnrankedSupraGeneric = null;
 168     private Rank familyIncertisSedis = null;
 169     private AnnotationType annotationTypeCaveats = null;
 170
 171     private Reference bookVariedadesTradicionales = null;
 172
 173     private Taxon makeTaxon(HashMap<String, String> record, SimpleExcelTaxonImportState<CONFIG> state,
 174                             TaxonNode higherTaxonNode, boolean isFossil) {
 175
 176         String regNumber = getValue(record, REGISTRATIONNO_PK, false);
 177         String regStr = getValue(record, REGISTRATION, true);
 178         String titleCacheStr = getValue(record, FULLNAME, true);
 179         String nameStr = getValue(record, NAMESTRING, true);
 180         String authorStr = getValue(record, AUTHORSTRING, true);
 181         String nomRefStr = getValue(record, LITSTRING, true);
 182         String authorsSpelling = getValue(record, AUTHORSSPELLING, true);
 183         String notesTxt = getValue(record, NOTESTXT, true);
 184         String caveats = getValue(record, CAVEATS, true);
 185         String fullSynSubstStr = getValue(record, FULLSYNSUBST, true);
 186         String fullBasionymStr = getValue(record, FULLBASIONYM, true);
 187         String basionymNameStr = getValue(record, FULLBASIONYM, true);
 188         String synSubstStr = getValue(record, SYNSUBSTSTR, true);
 189         String typeStr = getValue(record, TYPE, true);
 190
 191
 192         String nomRefTitle = null;
 193         String nomRefDetail;
 194         String nomRefPupDate = null;
 195         String nomRefIssue = null;
 196         Partial pupDate = null;
 197
 198         boolean restoreOriginalReference = false;
 199
 200         // preprocess nomRef: separate citation, reference detail, publishing date
 201         if(!StringUtils.isEmpty(nomRefStr)){
 202             nomRefStr = nomRefStr.trim();
 203
 204             // handle the special case which is hard to parse:
 205             //
 206             // Las variedades tradicionales de frutales de la Cuenca del Río Segura. Catálogo Etnobotánico (1): Frutos secos, oleaginosos, frutales de hueso, almendros y frutales de pepita: 154. 1997.
 207             if(nomRefStr.startsWith("Las variedades tradicionales de frutales ")){
 208
 209                 if(bookVariedadesTradicionales == null){
 210                     bookVariedadesTradicionales = ReferenceFactory.newBook();
 211                     bookVariedadesTradicionales.setTitle("Las variedades tradicionales de frutales de la Cuenca del Río Segura. Catálogo Etnobotánico (1): Frutos secos, oleaginosos, frutales de hueso, almendros y frutales de pepita");
 212                     bookVariedadesTradicionales.setDatePublished(TimePeriod.NewInstance(1997));
 213                     getReferenceService().save(bookVariedadesTradicionales);
 214                 }
 215                 nomRefStr = nomRefStr.replaceAll("^.*?\\:.*?\\:", "Las variedades tradicionales:");
 216                 restoreOriginalReference = true;
 217             }
 218
 219             Matcher m = nomRefTokenizeP.matcher(nomRefStr);
 220             if(m.matches()){
 221                 nomRefTitle = m.group("title");
 222                 nomRefDetail = m.group("detail");
 223                 nomRefPupDate = m.group("date").trim();
 224                 nomRefIssue = m.group("issue");
 225
 226                 pupDate = parseDate(regNumber, nomRefPupDate);
 227                 if (pupDate != null) {
 228                     nomRefTitle = nomRefTitle + ": " + nomRefDetail + ". " + pupDate.toString(formatterYear) + ".";
 229                 } else {
 230                     logger.warn(csvReportLine(regNumber, "Pub date", nomRefPupDate, "in", nomRefStr, "not parsable"));
 231                 }
 232             } else {
 233                 nomRefTitle = nomRefStr;
 234             }
 235         }
 236
 237         BotanicalName taxonName = makeBotanicalName(state, regNumber, titleCacheStr, nameStr, authorStr, nomRefTitle);
 238
 239         // always add the original strings of parsed data as annotation
 240         taxonName.addAnnotation(Annotation.NewInstance("imported and parsed data strings:" +
 241                         "\n -  '" + LITSTRING + "': "+ nomRefStr +
 242                         "\n -  '" + TYPE + "': " + typeStr +
 243                         "\n -  '" + REGISTRATION  + "': " + regStr
 244                 , AnnotationType.TECHNICAL(), Language.DEFAULT()));
 245
 246         if(restoreOriginalReference){
 247             taxonName.setNomenclaturalReference(bookVariedadesTradicionales);
 248         }
 249         if(pupDate != null) {
 250             taxonName.getNomenclaturalReference().setDatePublished(TimePeriod.NewInstance(pupDate));
 251         }
 252         if(nomRefIssue != null) {
 253             ((Reference)taxonName.getNomenclaturalReference()).setVolume(nomRefIssue);
 254         }
 255
 256
 257         if(!StringUtils.isEmpty(notesTxt)){
 258             notesTxt = notesTxt.replace("Notes: ", "").trim();
 259             taxonName.addAnnotation(Annotation.NewInstance(notesTxt, AnnotationType.EDITORIAL(), Language.DEFAULT()));
 260         }
 261         if(!StringUtils.isEmpty(caveats)){
 262             caveats = caveats.replace("Caveats: ", "").trim();
 263             taxonName.addAnnotation(Annotation.NewInstance(caveats, annotationTypeCaveats(), Language.DEFAULT()));
 264         }
 265
 266         getNameService().save(taxonName);
 267
 268         // Namerelations
 269         if(!StringUtils.isEmpty(authorsSpelling)){
 270             authorsSpelling = authorsSpelling.replaceFirst("Author's spelling:", "").replaceAll("\"", "").trim();
 271
 272             String[] authorSpellingTokens = StringUtils.split(authorsSpelling, " ");
 273             String[] nameStrTokens = StringUtils.split(nameStr, " ");
 274
 275             ArrayUtils.reverse(authorSpellingTokens);
 276             ArrayUtils.reverse(nameStrTokens);
 277
 278             for (int i = 0; i < nameStrTokens.length; i++){
 279                 if(i < authorSpellingTokens.length){
 280                     nameStrTokens[i] = authorSpellingTokens[i];
 281                 }
 282             }
 283             ArrayUtils.reverse(nameStrTokens);
 284
 285             String misspelledNameStr = StringUtils.join (nameStrTokens, ' ');
 286             // build the fullnameString of the misspelled name
 287             misspelledNameStr = taxonName.getTitleCache().replace(nameStr, misspelledNameStr);
 288
 289             TaxonNameBase misspelledName = (BotanicalName) nameParser.parseReferencedName(misspelledNameStr, NomenclaturalCode.ICNAFP, null);
 290             misspelledName.addRelationshipToName(taxonName, NameRelationshipType.MISSPELLING(), null);
 291             getNameService().save(misspelledName);
 292         }
 293
 294         // Replaced Synonyms
 295         if(!StringUtils.isEmpty(fullSynSubstStr)){
 296             fullSynSubstStr = fullSynSubstStr.replace("Syn. subst.: ", "");
 297             BotanicalName replacedSynonymName = makeBotanicalName(state, regNumber, fullSynSubstStr, synSubstStr, null, null);
 298             replacedSynonymName.addReplacedSynonym(taxonName, null, null, null);
 299             getNameService().save(replacedSynonymName);
 300         }
 301
 302         Reference sec = state.getConfig().getSecReference();
 303         Taxon taxon = Taxon.NewInstance(taxonName, sec);
 304
 305         // Basionym
 306         if(fullBasionymStr != null){
 307             fullBasionymStr = fullBasionymStr.replaceAll("^\\w*:\\s", ""); // Strip off the leading 'Basionym: "
 308             basionymNameStr = basionymNameStr.replaceAll("^\\w*:\\s", ""); // Strip off the leading 'Basionym: "
 309             BotanicalName basionym = makeBotanicalName(state, regNumber, fullBasionymStr, basionymNameStr, null, null);
 310             getNameService().save(basionym);
 311             taxonName.addBasionym(basionym);
 312
 313             Synonym syn = Synonym.NewInstance(basionym, sec);
 314             taxon.addSynonym(syn, SynonymRelationshipType.HOMOTYPIC_SYNONYM_OF());
 315             getTaxonService().save(syn);
 316         }
 317
 318         // Markers
 319         if(isFossil){
 320             taxon.addMarker(Marker.NewInstance(markerTypeFossil(), true));
 321         }
 322
 323         // Types
 324         if(!StringUtils.isEmpty(typeStr)){
 325
 326             if(taxonName.getRank().isSpecies() || taxonName.getRank().isLower(Rank.SPECIES())) {
 327                 makeSpecimenTypeData(typeStr, taxonName, regNumber, state);
 328             } else {
 329                 makeNameTypeData(typeStr, taxonName, regNumber, state);
 330             }
 331         }
 332
 333         getTaxonService().save(taxon);
 334         if(higherTaxonNode != null){
 335             higherTaxonNode.addChildTaxon(taxon, null, null);
 336             getTaxonNodeService().save(higherTaxonNode);
 337         }
 338
 339         return taxon;
 340     }
 341
 342     private void makeSpecimenTypeData(String typeStr, BotanicalName taxonName, String regNumber, SimpleExcelTaxonImportState<CONFIG> state) {
 343
 344         Matcher m = typeSpecimenSplitPattern.matcher(typeStr);
 345
 346         if(m.matches()){
 347             String fieldUnitStr = m.group(TypesName.fieldUnit.name());
 348             // boolean isFieldUnit = typeStr.matches(".*([°']|\\d+\\s?m\\s|\\d+\\s?km\\s).*"); // check for location or unit m, km // makes no sense!!!!
 349             FieldUnit fieldUnit = parseFieldUnit(fieldUnitStr, regNumber, state);
 350             if(fieldUnit == null) {
 351                 // create a field unit with only a titleCache using the fieldUnitStr substring
 352                 logger.warn(csvReportLine(regNumber, "Type: fieldUnitStr can not be parsed", fieldUnitStr));
 353                 fieldUnit = FieldUnit.NewInstance();
 354                 fieldUnit.setTitleCache(fieldUnitStr, true);
 355                 getOccurrenceService().save(fieldUnit);
 356             }
 357             getOccurrenceService().save(fieldUnit);
 358
 359             // all others ..
 360             addSpecimenTypes(taxonName, fieldUnit, m.group(TypesName.holotype.name()), TypesName.holotype, false, regNumber);
 361             addSpecimenTypes(taxonName, fieldUnit, m.group(TypesName.isotype.name()), TypesName.isotype, true, regNumber);
 362
 363         } else {
 364             // create a field unit with only a titleCache using the full typeStr
 365             FieldUnit fieldUnit = FieldUnit.NewInstance();
 366             fieldUnit.setTitleCache(typeStr, true);
 367             getOccurrenceService().save(fieldUnit);
 368             logger.warn(csvReportLine(regNumber, "Type: field 'Type' can not be parsed", typeStr));
 369         }
 370         getNameService().save(taxonName);
 371     }
 372
 373     private void makeNameTypeData(String typeStr, BotanicalName taxonName, String regNumber, SimpleExcelTaxonImportState<CONFIG> state) {
 374
 375         String nameStr = typeStr.replaceAll("^Type\\s?\\:\\s?", "");
 376         if(nameStr.isEmpty()) {
 377             return;
 378         }
 379
 380         String basionymNameStr = null;
 381         String noteStr = null;
 382         String agentStr = null;
 383
 384         Matcher m;
 385
 386         if(typeStr.startsWith("not to be indicated")){
 387             // Special case:
 388             // Type: not to be indicated (Art. H.9.1. Tokyo Code); stated parent genera: Hechtia Klotzsch; Deuterocohnia Mez
 389             // FIXME
 390             m = typeNameSpecialSplitPattern.matcher(nameStr);
 391             if(m.matches()){
 392                 nameStr = m.group("name");
 393                 noteStr = m.group("note");
 394                 agentStr = m.group("agent");
 395                 // TODO better import of agent?
 396                 if(agentStr != null){
 397                     noteStr = noteStr + ": " + agentStr;
 398                 }
 399             }
 400         } else {
 401             // Generic case
 402             m = typeNameBasionymPattern.matcher(nameStr);
 403             if (m.find()) {
 404                 basionymNameStr = m.group("basionymName");
 405                 if (basionymNameStr != null) {
 406                     nameStr = nameStr.replace(m.group(0), "");
 407                 }
 408             }
 409
 410             m = typeNameNotePattern.matcher(nameStr);
 411             if (m.find()) {
 412                 noteStr = m.group(1);
 413                 if (noteStr != null) {
 414                     nameStr = nameStr.replace(m.group(0), "");
 415                 }
 416             }
 417         }
 418
 419         BotanicalName typeName = (BotanicalName) nameParser.parseFullName(nameStr, NomenclaturalCode.ICNAFP, null);
 420
 421         if(typeName.isProtectedTitleCache() || typeName.getNomenclaturalReference() != null && typeName.getNomenclaturalReference().isProtectedTitleCache()) {
 422             logger.warn(csvReportLine(regNumber, "NameType not parsable", typeStr, nameStr));
 423         }
 424
 425         if(basionymNameStr != null){
 426             BotanicalName basionymName = (BotanicalName) nameParser.parseFullName(nameStr, NomenclaturalCode.ICNAFP, null);
 427             getNameService().save(basionymName);
 428             typeName.addBasionym(basionymName);
 429         }
 430
 431
 432         NameTypeDesignation nameTypeDesignation = NameTypeDesignation.NewInstance();
 433         nameTypeDesignation.setTypeName(typeName);
 434         getNameService().save(typeName);
 435
 436         if(noteStr != null){
 437             nameTypeDesignation.addAnnotation(Annotation.NewInstance(noteStr, AnnotationType.EDITORIAL(), Language.UNKNOWN_LANGUAGE()));
 438         }
 439         taxonName.addNameTypeDesignation(typeName, null, null, null, null, false);
 440
 441     }
 442
 443     /**
 444      * Currently only parses the collector, fieldNumber and the collection date.
 445      *
 446      * @param fieldUnitStr
 447      * @param regNumber
 448      * @param state
 449      * @return null if the fieldUnitStr could not be parsed
 450      */
 451     private FieldUnit parseFieldUnit(String fieldUnitStr, String regNumber, SimpleExcelTaxonImportState<CONFIG> state) {
 452
 453         FieldUnit fieldUnit = null;
 454
 455         Matcher m1 = collectorPattern.matcher(fieldUnitStr);
 456         if(m1.matches()){
 457
 458             String collectorData = m1.group(2); // like (leg. Metzeltin, 30. 9. 1996)
 459             String removal = m1.group(1);
 460             if(collectorData == null){
 461                 collectorData = m1.group(4); // like leg. Metzeltin, 30. 9. 1996
 462                 removal = m1.group(3);
 463             }
 464             if(collectorData == null){
 465                 return null;
 466             }
 467
 468             // the fieldUnitStr is parsable
 469             // remove all collectorData from the fieldUnitStr and use the rest as locality
 470             String locality = fieldUnitStr.replace(removal, "");
 471
 472             String collectorStr = null;
 473             String detailStr = null;
 474             Partial date = null;
 475             String fieldNumber = null;
 476
 477             Matcher m2 = collectionDataPattern.matcher(collectorData);
 478             if(m2.matches()){
 479                 collectorStr = m2.group("collector");
 480                 detailStr = m2.group("detail");
 481
 482                 // Try to make sense of the detailStr
 483                 if(detailStr != null){
 484                     detailStr = detailStr.trim();
 485                     // 1. try to parse as date
 486                     date = parseDate(regNumber, detailStr);
 487                     if(date == null){
 488                         // 2. try to parse as number
 489                         if(collectorsNumber.matcher(detailStr).matches()){
 490                             fieldNumber = detailStr;
 491                         }
 492                     }
 493                 }
 494                 if(date == null && fieldNumber == null){
 495                     // detailed parsing not possible, so need fo fallback
 496                     collectorStr = collectorData;
 497                 }
 498             }
 499
 500             if(collectorStr == null) {
 501                 collectorStr = collectorData;
 502             }
 503
 504             fieldUnit = FieldUnit.NewInstance();
 505             GatheringEvent ge = GatheringEvent.NewInstance();
 506             ge.setLocality(LanguageString.NewInstance(locality, Language.UNKNOWN_LANGUAGE()));
 507
 508             TeamOrPersonBase agent =  state.getAgentBase(collectorStr);
 509             if(agent == null) {
 510                 agent = Person.NewTitledInstance(collectorStr);
 511                 getAgentService().save(agent);
 512                 state.putAgentBase(collectorStr, agent);
 513             }
 514             ge.setCollector(agent);
 515
 516             if(date != null){
 517                 ge.setGatheringDate(date);
 518             }
 519
 520             getEventBaseService().save(ge);
 521             fieldUnit.setGatheringEvent(ge);
 522
 523             if(fieldNumber != null) {
 524                 fieldUnit.setFieldNumber(fieldNumber);
 525             }
 526             getOccurrenceService().save(fieldUnit);
 527
 528         }
 529
 530         return fieldUnit;
 531     }
 532
 533     private Partial parseDate(String regNumber, String dateStr) {
 534
 535         Partial pupDate = null;
 536         boolean parseError = false;
 537
 538         String day = null;
 539         String month = null;
 540         String monthName = null;
 541         String year = null;
 542
 543         for(Pattern p : datePatterns){
 544             Matcher m2 = p.matcher(dateStr);
 545             if(m2.matches()){
 546                 try {
 547                     year = m2.group("year");
 548                 } catch (IllegalArgumentException e){
 549                     // named capture group not found
 550                 }
 551                 try {
 552                     month = m2.group("month");
 553                 } catch (IllegalArgumentException e){
 554                     // named capture group not found
 555                 }
 556
 557                 try {
 558                     monthName = m2.group("monthName");
 559                     month = monthFromName(monthName, regNumber);
 560                     if(month == null){
 561                         parseError = true;
 562                     }
 563                 } catch (IllegalArgumentException e){
 564                     // named capture group not found
 565                 }
 566                 try {
 567                     day = m2.group("day");
 568                 } catch (IllegalArgumentException e){
 569                     // named capture group not found
 570                 }
 571
 572                 if(year != null){
 573                     if (year.length() == 2) {
 574                         // it is an abbreviated year from the 19** years
 575                         year = "19" + year;
 576                     }
 577                     break;
 578                 } else {
 579                     parseError = true;
 580                 }
 581             }
 582         }
 583         if(year == null){
 584             parseError = true;
 585         }
 586         List<DateTimeFieldType> types = new ArrayList<>();
 587         List<Integer> values = new ArrayList<>();
 588         if(!parseError) {
 589             types.add(DateTimeFieldType.year());
 590             values.add(Integer.parseInt(year));
 591             if (month != null) {
 592                 types.add(DateTimeFieldType.monthOfYear());
 593                 values.add(Integer.parseInt(month));
 594             }
 595             if (day != null) {
 596                 types.add(DateTimeFieldType.dayOfMonth());
 597                 values.add(Integer.parseInt(day));
 598             }
 599             pupDate = new Partial(types.toArray(new DateTimeFieldType[types.size()]), ArrayUtils.toPrimitive(values.toArray(new Integer[values.size()])));
 600         }
 601         return pupDate;
 602     }
 603
 604     private String monthFromName(String monthName, String regNumber) {
 605
 606         Integer month = monthFromNameMap.get(monthName.toLowerCase());
 607         if(month == null){
 608             logger.warn(csvReportLine(regNumber, "Unknown month name", monthName));
 609             return null;
 610         } else {
 611             return month.toString();
 612         }
 613     }
 614
 615
 616     private void addSpecimenTypes(BotanicalName taxonName, FieldUnit fieldUnit, String typeStr, TypesName typeName, boolean multiple, String regNumber){
 617
 618         if(StringUtils.isEmpty(typeStr)){
 619             return;
 620         }
 621         typeStr = typeStr.trim().replaceAll("\\.$", "");
 622
 623         Collection collection = null;
 624         DerivedUnit specimen = null;
 625
 626         List<DerivedUnit> specimens = new ArrayList<>();
 627         if(multiple){
 628             String[] tokens = typeStr.split("\\s?,\\s?");
 629             for (String t : tokens) {
 630                 // command to  list all complex parsabel types:
 631                 // csvcut -t -c RegistrationNo_Pk,Type iapt.csv | csvgrep -c Type -m "Holotype" | egrep -o 'Holotype:\s([A-Z]*\s)[^.]*?'
 632                 // csvcut -t -c RegistrationNo_Pk,Type iapt.csv | csvgrep -c Type -m "Holotype" | egrep -o 'Isotype[^:]*:\s([A-Z]*\s)[^.]*?'
 633
 634                 if(!t.isEmpty()){
 635                     // trying to parse the string
 636                     specimen = parseSpecimenType(fieldUnit, typeName, collection, t, regNumber);
 637                     if(specimen != null){
 638                         specimens.add(specimen);
 639                     } else {
 640                         // parsing was not successful make simple specimen
 641                         specimens.add(makeSpecimenType(fieldUnit, t));
 642                     }
 643                 }
 644             }
 645         } else {
 646             specimen = parseSpecimenType(fieldUnit, typeName, collection, typeStr, regNumber);
 647             if(specimen != null) {
 648                 specimens.add(specimen);
 649                 // remember current collection
 650                 collection = specimen.getCollection();
 651             } else {
 652                 // parsing was not successful make simple specimen
 653                 specimens.add(makeSpecimenType(fieldUnit, typeStr));
 654             }
 655         }
 656
 657         for(DerivedUnit s : specimens){
 658             taxonName.addSpecimenTypeDesignation(s, typeName.status(), null, null, null, false, true);
 659        }
 660     }
 661
 662     private DerivedUnit makeSpecimenType(FieldUnit fieldUnit, String titleCache) {
 663         DerivedUnit specimen;DerivedUnitFacade facade = DerivedUnitFacade.NewInstance(SpecimenOrObservationType.PreservedSpecimen, fieldUnit);
 664         facade.setTitleCache(titleCache.trim(), true);
 665         specimen = facade.innerDerivedUnit();
 666         return specimen;
 667     }
 668
 669     /**
 670      *
 671      * @param fieldUnit
 672      * @param typeName
 673      * @param collection
 674      * @param text
 675      * @param regNumber
 676      * @return
 677      */
 678     private DerivedUnit parseSpecimenType(FieldUnit fieldUnit, TypesName typeName, Collection collection, String text, String regNumber) {
 679
 680         DerivedUnit specimen = null;
 681
 682         String collectionCode = null;
 683         String subCollectionStr = null;
 684         String instituteStr = null;
 685         String accessionNumber = null;
 686
 687         boolean unusualAccessionNumber = false;
 688
 689         text = text.trim();
 690
 691         // 1.  For Isotypes often the accession number is noted alone if the
 692         //     preceeding entry has a collection code.
 693         if(typeName .equals(TypesName.isotype) && collection != null){
 694             Matcher m = accessionNumberOnlyPattern.matcher(text);
 695             if(m.matches()){
 696                 try {
 697                     accessionNumber = m.group("accNumber");
 698                     specimen = makeSpecimenType(fieldUnit, collection, accessionNumber);
 699                 } catch (IllegalArgumentException e){
 700                     // match group acc_number not found
 701                 }
 702             }
 703         }
 704
 705         //2. try it the 'normal' way
 706         if(specimen == null) {
 707             for (Pattern p : specimenTypePatterns) {
 708                 Matcher m = p.matcher(text);
 709                 if (m.matches()) {
 710                     // collection code is mandatory
 711                     try {
 712                         collectionCode = m.group("colCode");
 713                     } catch (IllegalArgumentException e){
 714                         // match group colCode not found
 715                     }
 716                     try {
 717                         subCollectionStr = m.group("subCollection");
 718                     } catch (IllegalArgumentException e){
 719                         // match group subCollection not found
 720                     }
 721                     try {
 722                         instituteStr = m.group("institute");
 723                     } catch (IllegalArgumentException e){
 724                         // match group col_name not found
 725                     }
 726                     try {
 727                         accessionNumber = m.group("accNumber");
 728
 729                         // try to improve the accessionNumber
 730                         if(accessionNumber!= null) {
 731                             accessionNumber = accessionNumber.trim();
 732                             Matcher m2 = accessionNumberOnlyPattern.matcher(accessionNumber);
 733                             String betterAccessionNumber = null;
 734                             if (m2.matches()) {
 735                                 try {
 736                                     betterAccessionNumber = m.group("accNumber");
 737                                 } catch (IllegalArgumentException e) {
 738                                     // match group acc_number not found
 739                                 }
 740                             }
 741                             if (betterAccessionNumber != null) {
 742                                 accessionNumber = betterAccessionNumber;
 743                             } else {
 744                                 unusualAccessionNumber = true;
 745                             }
 746                         }
 747
 748                     } catch (IllegalArgumentException e){
 749                         // match group acc_number not found
 750                     }
 751
 752                     if(collectionCode == null && instituteStr == null){
 753                         logger.warn(csvReportLine(regNumber, "Type: neither 'collectionCode' nor 'institute' found in ", text));
 754                         continue;
 755                     }
 756                     collection = getCollection(collectionCode, instituteStr, subCollectionStr);
 757                     specimen = makeSpecimenType(fieldUnit, collection, accessionNumber);
 758                     break;
 759                 }
 760             }
 761         }
 762         if(specimen == null) {
 763             logger.warn(csvReportLine(regNumber, "Type: Could not parse specimen", typeName.name().toString(), text));
 764         }
 765         if(unusualAccessionNumber){
 766             logger.warn(csvReportLine(regNumber, "Type: Unusual accession number", typeName.name().toString(), text, accessionNumber));
 767         }
 768         return specimen;
 769     }
 770
 771     private DerivedUnit makeSpecimenType(FieldUnit fieldUnit, Collection collection, String accessionNumber) {
 772
 773         DerivedUnitFacade facade = DerivedUnitFacade.NewInstance(SpecimenOrObservationType.PreservedSpecimen, fieldUnit);
 774         facade.setCollection(collection);
 775         if(accessionNumber != null){
 776             facade.setAccessionNumber(accessionNumber);
 777         }
 778         return facade.innerDerivedUnit();
 779     }
 780
 781     private BotanicalName makeBotanicalName(SimpleExcelTaxonImportState<CONFIG> state, String regNumber, String titleCacheStr, String nameStr,
 782                                             String authorStr, String nomRefTitle) {
 783
 784         BotanicalName taxonName;// cache field for the taxonName.titleCache
 785         String taxonNameTitleCache = null;
 786         Map<String, AnnotationType> nameAnnotations = new HashMap<>();
 787
 788         // TitleCache preprocessing
 789         if(titleCacheStr.endsWith(ANNOTATION_MARKER_STRING) || (authorStr != null && authorStr.endsWith(ANNOTATION_MARKER_STRING))){
 790             nameAnnotations.put("Author abbreviation not checked.", AnnotationType.EDITORIAL());
 791             titleCacheStr = titleCacheStr.replace(ANNOTATION_MARKER_STRING, "").trim();
 792             if(authorStr != null) {
 793                 authorStr = authorStr.replace(ANNOTATION_MARKER_STRING, "").trim();
 794             }
 795         }
 796
 797         // parse the full taxon name
 798         if(!StringUtils.isEmpty(nomRefTitle)){
 799             String referenceSeparator = nomRefTitle.startsWith("in ") ? " " : ", ";
 800             String taxonFullNameStr = titleCacheStr + referenceSeparator + nomRefTitle;
 801             logger.debug(":::::" + taxonFullNameStr);
 802             taxonName = (BotanicalName) nameParser.parseReferencedName(taxonFullNameStr, NomenclaturalCode.ICNAFP, null);
 803         } else {
 804             taxonName = (BotanicalName) nameParser.parseFullName(titleCacheStr, NomenclaturalCode.ICNAFP, null);
 805         }
 806
 807         taxonNameTitleCache = taxonName.getTitleCache().trim();
 808         if (taxonName.isProtectedTitleCache()) {
 809             logger.warn(csvReportLine(regNumber, "Name could not be parsed", titleCacheStr));
 810         } else {
 811
 812             boolean doRestoreTitleCacheStr = false;
 813
 814             // Check if titleCache and nameCache are plausible
 815             String titleCacheCompareStr = titleCacheStr;
 816             String nameCache = taxonName.getNameCache();
 817             String nameCompareStr = nameStr;
 818             if(taxonName.isBinomHybrid()){
 819                 titleCacheCompareStr = titleCacheCompareStr.replace(" x ", " ×");
 820                 nameCompareStr = nameCompareStr.replace(" x ", " ×");
 821             }
 822             if(taxonName.isMonomHybrid()){
 823                 titleCacheCompareStr = titleCacheCompareStr.replaceAll("^X ", "× ");
 824                 nameCompareStr = nameCompareStr.replace("^X ", "× ");
 825             }
 826             if(authorStr != null && authorStr.contains(" et ")){
 827                 titleCacheCompareStr = titleCacheCompareStr.replaceAll(" et ", " & ");
 828             }
 829             if (!taxonNameTitleCache.equals(titleCacheCompareStr)) {
 830                 logger.warn(csvReportLine(regNumber, "The generated titleCache differs from the imported string", taxonNameTitleCache, " != ", titleCacheStr, " ==> original titleCacheStr has been restored"));
 831                 doRestoreTitleCacheStr = true;
 832             }
 833             if (!nameCache.trim().equals(nameCompareStr)) {
 834                 logger.warn(csvReportLine(regNumber, "The parsed nameCache differs from field '" + NAMESTRING + "'", nameCache, " != ", nameCompareStr));
 835             }
 836
 837             //  Author
 838             //nameParser.handleAuthors(taxonName, titleCacheStr, authorStr);
 839             //if (!titleCacheStr.equals(taxonName.getTitleCache())) {
 840             //    logger.warn(regNumber + ": titleCache has changed after setting authors, will restore original titleCacheStr");
 841             //    doRestoreTitleCacheStr = true;
 842             //}
 843
 844             if(doRestoreTitleCacheStr){
 845                 taxonName.setTitleCache(titleCacheStr, true);
 846             }
 847
 848             // deduplicate
 849             replaceAuthorNamesAndNomRef(state, taxonName);
 850         }
 851
 852         // Annotations
 853         if(!nameAnnotations.isEmpty()){
 854             for(String text : nameAnnotations.keySet()){
 855                 taxonName.addAnnotation(Annotation.NewInstance(text, nameAnnotations.get(text), Language.DEFAULT()));
 856             }
 857             getNameService().save(taxonName);
 858         }
 859         return taxonName;
 860     }
 861
 862     /**
 863      * @param state
 864      * @return
 865      */
 866     private TaxonNode getClassificationRootNode(IAPTImportState state) {
 867
 868      //   Classification classification = state.getClassification();
 869      //   if (classification == null){
 870      //       IAPTImportConfigurator config = state.getConfig();
 871      //       classification = Classification.NewInstance(state.getConfig().getClassificationName());
 872      //       classification.setUuid(config.getClassificationUuid());
 873      //       classification.setReference(config.getSecReference());
 874      //       classification = getClassificationService().find(state.getConfig().getClassificationUuid());
 875      //   }
 876         TaxonNode rootNode = state.getRootNode();
 877         if (rootNode == null){
 878             rootNode = getTaxonNodeService().find(ROOT_UUID);
 879         }
 880         if (rootNode == null){
 881             Classification classification = state.getClassification();
 882             if (classification == null){
 883                 Reference sec = state.getSecReference();
 884                 String classificationName = state.getConfig().getClassificationName();
 885                 Language language = Language.DEFAULT();
 886                 classification = Classification.NewInstance(classificationName, sec, language);
 887                 state.setClassification(classification);
 888                 classification.setUuid(state.getConfig().getClassificationUuid());
 889                 classification.getRootNode().setUuid(ROOT_UUID);
 890                 getClassificationService().save(classification);
 891             }
 892             rootNode = classification.getRootNode();
 893             state.setRootNode(rootNode);
 894         }
 895         return rootNode;
 896     }
 897
 898     private Collection getCollection(String collectionCode, String instituteStr, String subCollectionStr){
 899
 900         Collection superCollection = null;
 901         if(subCollectionStr != null){
 902             superCollection = getCollection(collectionCode, instituteStr, null);
 903             collectionCode = subCollectionStr;
 904             instituteStr = null;
 905         }
 906
 907         final String key = collectionCode + "-#i:" + StringUtils.defaultString(instituteStr);
 908
 909         Collection collection = collectionMap.get(key);
 910
 911         if(collection == null) {
 912             collection = Collection.NewInstance();
 913             collection.setCode(collectionCode);
 914             if(instituteStr != null){
 915                 collection.setInstitute(Institution.NewNamedInstance(instituteStr));
 916             }
 917             if(superCollection != null){
 918                 collection.setSuperCollection(superCollection);
 919             }
 920             collectionMap.put(key, collection);
 921             getCollectionService().save(collection);
 922         }
 923
 924         return collection;
 925     }
 926
 927
 928     /**
 929      * @param record
 930      * @param originalKey
 931      * @param doUnescapeHtmlEntities
 932      * @return
 933      */
 934     private String getValue(HashMap<String, String> record, String originalKey, boolean doUnescapeHtmlEntities) {
 935         String value = record.get(originalKey);
 936
 937         value = fixCharacters(value);
 938
 939         if (! StringUtils.isBlank(value)) {
 940                 if (logger.isDebugEnabled()) {
 941                     logger.debug(originalKey + ": " + value);
 942                 }
 943                 value = CdmUtils.removeDuplicateWhitespace(value.trim()).toString();
 944             if(doUnescapeHtmlEntities){
 945                 value = StringEscapeUtils.unescapeHtml(value);
 946             }
 947                 return value.trim();
 948         }else{
 949                 return null;
 950         }
 951     }
 952
 953     /**
 954      * Fixes broken characters.
 955      * For details see
 956      * http://dev.e-taxonomy.eu/redmine/issues/6035
 957      *
 958      * @param value
 959      * @return
 960      */
 961     private String fixCharacters(String value) {
 962
 963         value = StringUtils.replace(value, "s$K", "š");
 964         value = StringUtils.replace(value, "n$K", "ň");
 965         value = StringUtils.replace(value, "e$K", "ě");
 966         value = StringUtils.replace(value, "r$K", "ř");
 967         value = StringUtils.replace(value, "c$K", "č");
 968         value = StringUtils.replace(value, "z$K", "ž");
 969         value = StringUtils.replace(value, "S>U$K", "Š");
 970         value = StringUtils.replace(value, "C>U$K", "Č");
 971         value = StringUtils.replace(value, "R>U$K", "Ř");
 972         value = StringUtils.replace(value, "Z>U$K", "Ž");
 973         value = StringUtils.replace(value, "g$K", "ǧ");
 974         value = StringUtils.replace(value, "s$A", "ś");
 975         value = StringUtils.replace(value, "n$A", "ń");
 976         value = StringUtils.replace(value, "c$A", "ć");
 977         value = StringUtils.replace(value, "e$E", "ę");
 978         value = StringUtils.replace(value, "o$H", "õ");
 979         value = StringUtils.replace(value, "s$C", "ş");
 980         value = StringUtils.replace(value, "t$C", "ț");
 981         value = StringUtils.replace(value, "S>U$C", "Ş");
 982         value = StringUtils.replace(value, "a$O", "å");
 983         value = StringUtils.replace(value, "A>U$O", "Å");
 984         value = StringUtils.replace(value, "u$O", "ů");
 985         value = StringUtils.replace(value, "g$B", "ğ");
 986         value = StringUtils.replace(value, "g$B", "ĕ");
 987         value = StringUtils.replace(value, "a$B", "ă");
 988         value = StringUtils.replace(value, "l$/", "ł");
 989         value = StringUtils.replace(value, ">i", "ı");
 990         value = StringUtils.replace(value, "i$U", "ï");
 991         // Special-cases
 992         value = StringUtils.replace(value, "&yacute", "ý");
 993         value = StringUtils.replace(value, ">L", "Ł"); // corrected rule
 994         value = StringUtils.replace(value, "E>U$D", "З");
 995         value = StringUtils.replace(value, "S>U$E", "Ş");
 996         value = StringUtils.replace(value, "s$E", "ş");
 997
 998         value = StringUtils.replace(value, "c$k", "č");
 999         value = StringUtils.replace(value, " U$K", " Š");
1000
1001         return value;
1002     }
1003
1004
1005     /**
1006          *  Stores taxa records in DB
1007          */
1008         @Override
1009     protected void firstPass(SimpleExcelTaxonImportState<CONFIG> state) {
1010
1011         String lineNumber = "L#" + state.getCurrentLine() + ": ";
1012         logger.setLevel(Level.DEBUG);
1013         HashMap<String, String> record = state.getOriginalRecord();
1014         logger.debug(lineNumber + record.toString());
1015
1016         Set<String> keys = record.keySet();
1017         for (String key: keys) {
1018             if (! expectedKeys.contains(key)){
1019                 logger.warn(lineNumber + "Unexpected Key: " + key);
1020             }
1021         }
1022
1023         String reg_id = record.get(REGISTRATIONNO_PK);
1024
1025         //higherTaxon
1026         String higherTaxaString = record.get(HIGHERTAXON);
1027         boolean isFossil = false;
1028         if(higherTaxaString.startsWith("FOSSIL ")){
1029             higherTaxaString = higherTaxaString.replace("FOSSIL ", "");
1030             isFossil = true;
1031         }
1032         TaxonNode higherTaxon = getHigherTaxon(higherTaxaString, (IAPTImportState)state);
1033
1034        //Taxon
1035         Taxon taxon = makeTaxon(record, state, higherTaxon, isFossil);
1036         if (taxon == null){
1037             logger.warn(lineNumber + "taxon could not be created and is null");
1038             return;
1039         }
1040         ((IAPTImportState)state).setCurrentTaxon(taxon);
1041
1042
1043                 return;
1044     }
1045
1046     private TaxonNode getHigherTaxon(String higherTaxaString, IAPTImportState state) {
1047         String[] higherTaxaNames = higherTaxaString.toLowerCase().replaceAll("[\\[\\]]", "").split(":");
1048         TaxonNode higherTaxonNode = null;
1049
1050         ITaxonTreeNode rootNode = getClassificationRootNode(state);
1051         for (String htn :  higherTaxaNames) {
1052             htn = StringUtils.capitalize(htn.trim());
1053             Taxon higherTaxon = state.getHigherTaxon(htn);
1054             if (higherTaxon != null){
1055                 higherTaxonNode = higherTaxon.getTaxonNodes().iterator().next();
1056             }else{
1057                 BotanicalName name = makeHigherTaxonName(state, htn);
1058                 Reference sec = state.getSecReference();
1059                 higherTaxon = Taxon.NewInstance(name, sec);
1060                 getTaxonService().save(higherTaxon);
1061                 higherTaxonNode = rootNode.addChildTaxon(higherTaxon, sec, null);
1062                 state.putHigherTaxon(htn, higherTaxon);
1063                 getClassificationService().saveTreeNode(higherTaxonNode);
1064             }
1065             rootNode = higherTaxonNode;
1066         }
1067         return higherTaxonNode;
1068     }
1069
1070     private BotanicalName makeHigherTaxonName(IAPTImportState state, String name) {
1071
1072         Rank rank = guessRank(name);
1073
1074         BotanicalName taxonName = BotanicalName.NewInstance(rank);
1075         taxonName.addSource(makeOriginalSource(state));
1076         taxonName.setGenusOrUninomial(StringUtils.capitalize(name));
1077         return taxonName;
1078     }
1079
1080     private Rank guessRank(String name) {
1081
1082         // normalize
1083         name = name.replaceAll("\\(.*\\)", "").trim();
1084
1085         if(name.matches("^Plantae$|^Fungi$")){
1086            return Rank.KINGDOM();
1087         } else if(name.matches("^Incertae sedis$|^No group assigned$")){
1088            return rankFamilyIncertisSedis();
1089         } else if(name.matches(".*phyta$|.*mycota$")){
1090            return Rank.PHYLUM();
1091         } else if(name.matches(".*phytina$|.*mycotina$")){
1092            return Rank.SUBPHYLUM();
1093         } else if(name.matches("Gymnospermae$|.*ones$")){ // Monocotyledones, Dicotyledones
1094             return rankUnrankedSupraGeneric();
1095         } else if(name.matches(".*opsida$|.*phyceae$|.*mycetes$|.*ones$|^Musci$|^Hepaticae$")){
1096            return Rank.CLASS();
1097         } else if(name.matches(".*idae$|.*phycidae$|.*mycetidae$")){
1098            return Rank.SUBCLASS();
1099         } else if(name.matches(".*ales$")){
1100            return Rank.ORDER();
1101         } else if(name.matches(".*ineae$")){
1102            return Rank.SUBORDER();
1103         } else if(name.matches(".*aceae$")){
1104             return Rank.FAMILY();
1105         } else if(name.matches(".*oideae$")){
1106            return Rank.SUBFAMILY();
1107         } else
1108         //    if(name.matches(".*eae$")){
1109         //    return Rank.TRIBE();
1110         // } else
1111             if(name.matches(".*inae$")){
1112            return Rank.SUBTRIBE();
1113         } else if(name.matches(".*ae$")){
1114            return Rank.FAMILY();
1115         }
1116         return Rank.UNKNOWN_RANK();
1117     }
1118
1119     private Rank rankUnrankedSupraGeneric() {
1120
1121         if(rankUnrankedSupraGeneric == null){
1122             rankUnrankedSupraGeneric = Rank.NewInstance(RankClass.Suprageneric, "Unranked supra generic", " ", " ");
1123             getTermService().save(rankUnrankedSupraGeneric);
1124         }
1125         return rankUnrankedSupraGeneric;
1126     }
1127
1128     private Rank rankFamilyIncertisSedis() {
1129
1130         if(familyIncertisSedis == null){
1131             familyIncertisSedis = Rank.NewInstance(RankClass.Suprageneric, "Family incertis sedis", " ", " ");
1132             getTermService().save(familyIncertisSedis);
1133         }
1134         return familyIncertisSedis;
1135     }
1136
1137     private AnnotationType annotationTypeCaveats(){
1138         if(annotationTypeCaveats == null){
1139             annotationTypeCaveats = AnnotationType.NewInstance("Caveats", "Caveats", "");
1140             getTermService().save(annotationTypeCaveats);
1141         }
1142         return annotationTypeCaveats;
1143     }
1144
1145
1146     /**
1147      * @param state
1148      * @return
1149      */
1150     private IdentifiableSource makeOriginalSource(IAPTImportState state) {
1151         return IdentifiableSource.NewDataImportInstance("line: " + state.getCurrentLine(), null, state.getConfig().getSourceReference());
1152     }
1153
1154
1155     private Reference makeReference(IAPTImportState state, UUID uuidRef) {
1156         Reference ref = state.getReference(uuidRef);
1157         if (ref == null){
1158             ref = getReferenceService().find(uuidRef);
1159             state.putReference(uuidRef, ref);
1160         }
1161         return ref;
1162     }
1163
1164     private MarkerType markerTypeFossil(){
1165         if(this.markerTypeFossil == null){
1166             markerTypeFossil = MarkerType.NewInstance("isFossilTaxon", "isFossil", null);
1167             getTermService().save(this.markerTypeFossil);
1168         }
1169         return markerTypeFossil;
1170     }
1171
1172     private String csvReportLine(String regId, String message, String ... fields){
1173         StringBuilder out = new StringBuilder("regID#");
1174         out.append(regId).append(",\"").append(message).append('"');
1175
1176         for(String f : fields){
1177             out.append(",\"").append(f).append('"');
1178         }
1179         return out.toString();
1180     }
1181
1182
1183 }