Project

General

Profile

Download (52.9 KB) Statistics
| Branch: | Revision:
1
/**
2
 * Copyright (C) 2007 EDIT
3
 * European Distributed Institute of Taxonomy
4
 * http://www.e-taxonomy.eu
5
 *
6
 * The contents of this file are subject to the Mozilla Public License Version 1.1
7
 * See LICENSE.TXT at the top of this package for the full license terms.
8
 */
9

    
10
package eu.etaxonomy.cdm.io.iapt;
11

    
12
import eu.etaxonomy.cdm.api.facade.DerivedUnitFacade;
13
import eu.etaxonomy.cdm.api.service.pager.Pager;
14
import eu.etaxonomy.cdm.common.CdmUtils;
15
import eu.etaxonomy.cdm.io.mexico.SimpleExcelTaxonImport;
16
import eu.etaxonomy.cdm.io.mexico.SimpleExcelTaxonImportState;
17
import eu.etaxonomy.cdm.model.agent.Institution;
18
import eu.etaxonomy.cdm.model.agent.Person;
19
import eu.etaxonomy.cdm.model.agent.TeamOrPersonBase;
20
import eu.etaxonomy.cdm.model.common.*;
21
import eu.etaxonomy.cdm.model.name.*;
22
import eu.etaxonomy.cdm.model.occurrence.*;
23
import eu.etaxonomy.cdm.model.occurrence.Collection;
24
import eu.etaxonomy.cdm.model.reference.Reference;
25
import eu.etaxonomy.cdm.model.reference.ReferenceFactory;
26
import eu.etaxonomy.cdm.model.reference.ReferenceType;
27
import eu.etaxonomy.cdm.model.taxon.*;
28
import eu.etaxonomy.cdm.strategy.parser.NonViralNameParserImpl;
29
import eu.etaxonomy.cdm.strategy.parser.ParserProblem;
30
import org.apache.commons.lang.ArrayUtils;
31
import org.apache.commons.lang.StringEscapeUtils;
32
import org.apache.commons.lang.StringUtils;
33
import org.apache.log4j.Level;
34
import org.apache.log4j.Logger;
35
import org.joda.time.DateTimeFieldType;
36
import org.joda.time.Partial;
37
import org.joda.time.format.DateTimeFormat;
38
import org.joda.time.format.DateTimeFormatter;
39
import org.springframework.stereotype.Component;
40

    
41
import java.util.*;
42
import java.util.regex.Matcher;
43
import java.util.regex.Pattern;
44

    
45
/**
46
 * @author a.mueller
47
 * @created 05.01.2016
48
 */
49

    
50
@Component("iAPTExcelImport")
51
public class IAPTExcelImport<CONFIG extends IAPTImportConfigurator> extends SimpleExcelTaxonImport<CONFIG> {
52
    private static final long serialVersionUID = -747486709409732371L;
53
    private static final Logger logger = Logger.getLogger(IAPTExcelImport.class);
54
    public static final String ANNOTATION_MARKER_STRING = "[*]";
55

    
56

    
57
    private static UUID ROOT_UUID = UUID.fromString("4137fd2a-20f6-4e70-80b9-f296daf51d82");
58

    
59
    private static NonViralNameParserImpl nameParser = NonViralNameParserImpl.NewInstance();
60

    
61
    private final static String REGISTRATIONNO_PK= "RegistrationNo_Pk";
62
    private final static String HIGHERTAXON= "HigherTaxon";
63
    private final static String FULLNAME= "FullName";
64
    private final static String AUTHORSSPELLING= "AuthorsSpelling";
65
    private final static String LITSTRING= "LitString";
66
    private final static String REGISTRATION= "Registration";
67
    private final static String TYPE= "Type";
68
    private final static String CAVEATS= "Caveats";
69
    private final static String FULLBASIONYM= "FullBasionym";
70
    private final static String FULLSYNSUBST= "FullSynSubst";
71
    private final static String NOTESTXT= "NotesTxt";
72
    private final static String REGDATE= "RegDate";
73
    private final static String NAMESTRING= "NameString";
74
    private final static String BASIONYMSTRING= "BasionymString";
75
    private final static String SYNSUBSTSTR= "SynSubstStr";
76
    private final static String AUTHORSTRING= "AuthorString";
77

    
78
    private  static List<String> expectedKeys= Arrays.asList(new String[]{
79
            REGISTRATIONNO_PK, HIGHERTAXON, FULLNAME, AUTHORSSPELLING, LITSTRING, REGISTRATION, TYPE, CAVEATS, FULLBASIONYM, FULLSYNSUBST, NOTESTXT, REGDATE, NAMESTRING, BASIONYMSTRING, SYNSUBSTSTR, AUTHORSTRING});
80

    
81
    private static final Pattern nomRefTokenizeP = Pattern.compile("^(?<title>.*):\\s(?<detail>[^\\.:]+)\\.(?<date>.*?)(?:\\s\\((?<issue>[^\\)]*)\\)\\s*)?\\.?$");
82
    private static final Pattern[] datePatterns = new Pattern[]{
83
            // NOTE:
84
            // The order of the patterns is extremely important!!!
85
            //
86
            // all patterns cover the years 1700 - 1999
87
            Pattern.compile("^(?<year>1[7,8,9][0-9]{2})$"), // only year, like '1969'
88
            Pattern.compile("^(?<monthName>\\p{L}+\\.?)\\s(?<day>[0-9]{1,2})(?:st|rd|th)?\\.?,?\\s(?<year>(?:1[7,8,9])?[0-9]{2})$"), // full date like April 12, 1969 or april 12th 1999
89
            Pattern.compile("^(?<monthName>\\p{L}+\\.?),?\\s?(?<year>(?:1[7,8,9])?[0-9]{2})$"), // April 99 or April, 1999 or Apr. 12
90
            Pattern.compile("^(?<day>[0-9]{1,2})([\\.\\-/])(\\s?)(?<month>[0-1]?[0-9])\\2\\3(?<year>(?:1[7,8,9])?[0-9]{2})$"), // full date like 12.04.1969 or 12. 04. 1969 or 12/04/1969 or 12-04-1969
91
            Pattern.compile("^(?<day>[0-9]{1,2})([\\.\\-/])(?<monthName>[IVX]{1,2})\\2(?<year>(?:1[7,8,9])?[0-9]{2})$"), // full date like 12-VI-1969
92
            Pattern.compile("^(?:(?<day>[0-9]{1,2})(?:\\sde)\\s)?(?<monthName>\\p{L}+)\\sde\\s(?<year>(?:1[7,8,9])?[0-9]{2})$"), // full and partial date like 12 de Enero de 1999 or Enero de 1999
93
            Pattern.compile("^(?<month>[0-1]?[0-9])([\\.\\-/])(?<year>(?:1[7,8,9])?[0-9]{2})$"), // partial date like 04.1969 or 04/1969 or 04-1969
94
            Pattern.compile("^(?<year>(?:1[7,8,9])?[0-9]{2})([\\.\\-/])(?<month>[0-1]?[0-9])$"),//  partial date like 1999-04
95
            Pattern.compile("^(?<monthName>[IVX]{1,2})([\\.\\-/])(?<year>(?:1[7,8,9])?[0-9]{2})$"), // partial date like VI-1969
96
            Pattern.compile("^(?<day>[0-9]{1,2})(?:[\\./]|th|rd|st)?\\s(?<monthName>\\p{L}+\\.?),?\\s?(?<year>(?:1[7,8,9])?[0-9]{2})$"), // full date like 12. April 1969 or april 1999 or 22 Dec.1999
97
        };
98
    private static final Pattern typeSpecimenSplitPattern =  Pattern.compile("^(?:\"*[Tt]ype: (?<fieldUnit>.*?))(?:[Hh]olotype:(?<holotype>.*?)\\.?)?(?:[Ii]sotype.*?[:\\(](?<isotype>.*)\\.?)?\\.?$");
99

    
100
    private static final Pattern typeNameBasionymPattern =  Pattern.compile("\\([Bb]asionym\\s?\\:\\s?(?<basionymName>[^\\)]*).*$");
101
    private static final Pattern typeNameNotePattern =  Pattern.compile("\\[([^\\[]*)"); // matches the inner of '[...]'
102
    private static final Pattern typeNameSpecialSplitPattern =  Pattern.compile("(?<note>.*\\;.*?)\\:(?<agent>)\\;(<name>.*)");
103

    
104
    private static final Pattern collectorPattern =  Pattern.compile(".*?(?<fullStr1>\\(leg\\.\\s+(?<data1>[^\\)]*)\\))|.*?(?<fullStr2>\\sleg\\.\\s+(?<data2>.*?)\\.?)$");
105
    private static final Pattern collectionDataPattern =  Pattern.compile("^(?<collector>[^,]*),\\s?(?<detail>.*?)\\.?$");
106
    private static final Pattern collectorsNumber =  Pattern.compile("^([nN]o\\.\\s.*)$");
107

    
108
    // AccessionNumbers: , #.*, n°:?, 96/3293, No..*, -?\w{1,3}-[0-9\-/]*
109
    private static final Pattern accessionNumberOnlyPattern = Pattern.compile("^(?<accNumber>(?:n°\\:?\\s?|#|No\\.?\\s?)?[\\d\\w\\-/]*)$");
110

    
111
    private static final Pattern[] specimenTypePatterns = new Pattern[]{
112
            Pattern.compile("^(?<colCode>[A-Z]+|CPC Micropaleontology Lab\\.?)\\s+(?:\\((?<institute>.*[^\\)])\\))(?<accNumber>.*)?$"), // like: GAUF (Gansu Agricultural University) No. 1207-1222
113
            Pattern.compile("^(?<colCode>[A-Z]+|CPC Micropaleontology Lab\\.?)\\s+(?:Coll\\.\\s(?<subCollection>[^\\.,;]*)(.))(?<accNumber>.*)?$"), // like KASSEL Coll. Krasske, Praep. DII 78
114
            Pattern.compile("^(?:in\\s)?(?<institute>[Cc]oll\\.\\s.*?)(?:\\s+(?<accNumber>(Praep\\.|slide|No\\.|Inv\\. Nr\\.|Nr\\.).*))?$"), // like Coll. Lange-Bertalot, Bot. Inst., Univ. Frankfurt/Main, Germany Praep. Neukaledonien OTL 62
115
            Pattern.compile("^(?<institute>Inst\\.\\s.*?)\\s+(?<accNumber>N\\s.*)?$"), // like Inst. Geological Sciences, Acad. Sci. Belarus, Minsk N 212 A
116
            Pattern.compile("^(?<colCode>[A-Z]+)(?:\\s+(?<accNumber>.*))?$"), // identifies the Collection code and takes the rest as accessionNumber if any
117
    };
118

    
119
    /**
     * Maps lower case month names, abbreviations and roman numerals to the month number (1 - 12).
     * All keys are stored trimmed and in lower case, so lookups must use lower case names.
     */
    private static Map<String, Integer> monthFromNameMap = new HashMap<>();

    static {
        String[] ck = new String[]{"leden", "únor", "březen", "duben", "květen", "červen", "červenec", "srpen", "září", "říjen", "listopad", "prosinec"};
        String[] fr = new String[]{"janvier", "février", "mars", "avril", "mai", "juin", "juillet", "août", "septembre", "octobre", "novembre", "décembre"};
        String[] de = new String[]{"januar", "februar", "märz", "april", "mai", "juni", "juli", "august", "september", "oktober", "november", "dezember"};
        String[] en = new String[]{"january", "february", "march", "april", "may", "june", "july", "august", "september", "october", "november", "december"};
        String[] it = new String[]{"gennaio", "febbraio", "marzo", "aprile", "maggio", "giugno", "luglio", "agosto", "settembre", "ottobre", "novembre", "dicembre"};
        String[] sp = new String[]{"enero", "febrero", "marzo", "abril", "mayo", "junio", "julio", "agosto", "septiembre", "octubre", "noviembre", "diciembre"};
        String[] de_abbrev = new String[]{"jan.", "feb.", "märz", "apr.", "mai", "jun.", "jul.", "aug.", "sept.", "okt.", "nov.", "dez."};
        String[] en_abbrev = new String[]{"jan.", "feb.", "mar.", "apr.", "may", "jun.", "jul.", "aug.", "sep.", "oct.", "nov.", "dec."};
        String[] port = new String[]{"Janeiro", "Fevereiro", "Março", "Abril", "Maio", "Junho", "Julho", "Agosto", "Setembro", "Outubro", "Novembro", "Dezembro"};
        String[] rom_num = new String[]{"i", "ii", "iii", "iv", "v", "vi", "vii", "viii", "ix", "x", "xi", "xii"};

        String[][] perLang =  new String[][]{ck, de, fr, en, it, sp, port, de_abbrev, en_abbrev, rom_num};

        for (String[] months: perLang) {
            for(int m = 1; m <= months.length; m++){
                // normalize the key: a stray blank or capital letter would make the entry unreachable
                monthFromNameMap.put(months[m - 1].trim().toLowerCase(Locale.ROOT), m);
            }
        }

        // special cases, keys must be lower case like all others
        monthFromNameMap.put("mar", 3);
        monthFromNameMap.put("dec", 12);
        monthFromNameMap.put("februari", 2);
    }
146

    
147

    
148
    DateTimeFormatter formatterYear = DateTimeFormat.forPattern("yyyy");
149

    
150
    private Map<String, Collection> collectionMap = new HashMap<>();
151

    
152

    
153
    enum TypesName {
154
        fieldUnit, holotype, isotype;
155

    
156
        public SpecimenTypeDesignationStatus status(){
157
            switch (this) {
158
                case holotype:
159
                    return SpecimenTypeDesignationStatus.HOLOTYPE();
160
                case isotype:
161
                    return SpecimenTypeDesignationStatus.ISOTYPE();
162
                default:
163
                    return null;
164
            }
165
        }
166
    }
167

    
168
    private MarkerType markerTypeFossil = null;
169
    private Rank rankUnrankedSupraGeneric = null;
170
    private Rank familyIncertisSedis = null;
171
    private AnnotationType annotationTypeCaveats = null;
172

    
173
    private Reference bookVariedadesTradicionales = null;
174

    
175
    /**
176
     * HACK for unit simple testing
177
     */
178
    boolean _testMode = System.getProperty("TEST_MODE") != null;
179

    
180
    private Taxon makeTaxon(HashMap<String, String> record, SimpleExcelTaxonImportState<CONFIG> state,
181
                            TaxonNode higherTaxonNode, boolean isFossil) {
182

    
183
        String regNumber = getValue(record, REGISTRATIONNO_PK, false);
184
        String regStr = getValue(record, REGISTRATION, true);
185
        String titleCacheStr = getValue(record, FULLNAME, true);
186
        String nameStr = getValue(record, NAMESTRING, true);
187
        String authorStr = getValue(record, AUTHORSTRING, true);
188
        String nomRefStr = getValue(record, LITSTRING, true);
189
        String authorsSpelling = getValue(record, AUTHORSSPELLING, true);
190
        String notesTxt = getValue(record, NOTESTXT, true);
191
        String caveats = getValue(record, CAVEATS, true);
192
        String fullSynSubstStr = getValue(record, FULLSYNSUBST, true);
193
        String fullBasionymStr = getValue(record, FULLBASIONYM, true);
194
        String basionymNameStr = getValue(record, FULLBASIONYM, true);
195
        String synSubstStr = getValue(record, SYNSUBSTSTR, true);
196
        String typeStr = getValue(record, TYPE, true);
197

    
198

    
199
        String nomRefTitle = null;
200
        String nomRefDetail;
201
        String nomRefPupDate = null;
202
        String nomRefIssue = null;
203
        Partial pupDate = null;
204

    
205
        boolean restoreOriginalReference = false;
206
        boolean nameIsValid = true;
207

    
208
        // preprocess nomRef: separate citation, reference detail, publishing date
209
        if(!StringUtils.isEmpty(nomRefStr)){
210
            nomRefStr = nomRefStr.trim();
211

    
212
            // handle the special case which is hard to parse:
213
            //
214
            // Las variedades tradicionales de frutales de la Cuenca del Río Segura. Catálogo Etnobotánico (1): Frutos secos, oleaginosos, frutales de hueso, almendros y frutales de pepita: 154. 1997.
215
            if(nomRefStr.startsWith("Las variedades tradicionales de frutales ")){
216

    
217
                if(bookVariedadesTradicionales == null){
218
                    bookVariedadesTradicionales = ReferenceFactory.newBook();
219
                    bookVariedadesTradicionales.setTitle("Las variedades tradicionales de frutales de la Cuenca del Río Segura. Catálogo Etnobotánico (1): Frutos secos, oleaginosos, frutales de hueso, almendros y frutales de pepita");
220
                    bookVariedadesTradicionales.setDatePublished(TimePeriod.NewInstance(1997));
221
                    getReferenceService().save(bookVariedadesTradicionales);
222
                }
223
                nomRefStr = nomRefStr.replaceAll("^.*?\\:.*?\\:", "Las variedades tradicionales:");
224
                restoreOriginalReference = true;
225
            }
226

    
227
            Matcher m = nomRefTokenizeP.matcher(nomRefStr);
228
            if(m.matches()){
229
                nomRefTitle = m.group("title");
230
                nomRefDetail = m.group("detail");
231
                nomRefPupDate = m.group("date").trim();
232
                nomRefIssue = m.group("issue");
233

    
234
                pupDate = parseDate(regNumber, nomRefPupDate);
235
                if (pupDate != null) {
236
                    nomRefTitle = nomRefTitle + ": " + nomRefDetail + ". " + pupDate.toString(formatterYear) + ".";
237
                } else {
238
                    logger.warn(csvReportLine(regNumber, "Pub date", nomRefPupDate, "in", nomRefStr, "not parsable"));
239
                }
240
            } else {
241
                nomRefTitle = nomRefStr;
242
            }
243
        }
244

    
245
        BotanicalName taxonName = makeBotanicalName(state, regNumber, titleCacheStr, nameStr, authorStr, nomRefTitle);
246

    
247
        // always add the original strings of parsed data as annotation
248
        taxonName.addAnnotation(Annotation.NewInstance("imported and parsed data strings:" +
249
                        "\n -  '" + LITSTRING + "': "+ nomRefStr +
250
                        "\n -  '" + TYPE + "': " + typeStr +
251
                        "\n -  '" + REGISTRATION  + "': " + regStr
252
                , AnnotationType.TECHNICAL(), Language.DEFAULT()));
253

    
254
        if(restoreOriginalReference){
255
            taxonName.setNomenclaturalReference(bookVariedadesTradicionales);
256
        }
257
        if(pupDate != null) {
258
            taxonName.getNomenclaturalReference().setDatePublished(TimePeriod.NewInstance(pupDate));
259
        }
260
        if(nomRefIssue != null) {
261
            ((Reference)taxonName.getNomenclaturalReference()).setVolume(nomRefIssue);
262
        }
263

    
264

    
265
        if(!StringUtils.isEmpty(notesTxt)){
266
            notesTxt = notesTxt.replace("Notes: ", "").trim();
267
            taxonName.addAnnotation(Annotation.NewInstance(notesTxt, AnnotationType.EDITORIAL(), Language.DEFAULT()));
268
            nameIsValid = false;
269

    
270
        }
271
        if(!StringUtils.isEmpty(caveats)){
272
            caveats = caveats.replace("Caveats: ", "").trim();
273
            taxonName.addAnnotation(Annotation.NewInstance(caveats, annotationTypeCaveats(), Language.DEFAULT()));
274
            nameIsValid = false;
275
        }
276

    
277
        if(nameIsValid){
278
            // Status is always considered valid if no notes and cavets are set
279
            taxonName.addStatus(NomenclaturalStatus.NewInstance(NomenclaturalStatusType.VALID()));
280
        }
281

    
282
        getNameService().save(taxonName);
283

    
284
        // Namerelations
285
        if(!StringUtils.isEmpty(authorsSpelling)){
286
            authorsSpelling = authorsSpelling.replaceFirst("Author's spelling:", "").replaceAll("\"", "").trim();
287

    
288
            String[] authorSpellingTokens = StringUtils.split(authorsSpelling, " ");
289
            String[] nameStrTokens = StringUtils.split(nameStr, " ");
290

    
291
            ArrayUtils.reverse(authorSpellingTokens);
292
            ArrayUtils.reverse(nameStrTokens);
293

    
294
            for (int i = 0; i < nameStrTokens.length; i++){
295
                if(i < authorSpellingTokens.length){
296
                    nameStrTokens[i] = authorSpellingTokens[i];
297
                }
298
            }
299
            ArrayUtils.reverse(nameStrTokens);
300

    
301
            String misspelledNameStr = StringUtils.join (nameStrTokens, ' ');
302
            // build the fullnameString of the misspelled name
303
            misspelledNameStr = taxonName.getTitleCache().replace(nameStr, misspelledNameStr);
304

    
305
            TaxonNameBase misspelledName = (BotanicalName) nameParser.parseReferencedName(misspelledNameStr, NomenclaturalCode.ICNAFP, null);
306
            misspelledName.addRelationshipToName(taxonName, NameRelationshipType.MISSPELLING(), null);
307
            getNameService().save(misspelledName);
308
        }
309

    
310
        // Replaced Synonyms
311
        if(!StringUtils.isEmpty(fullSynSubstStr)){
312
            fullSynSubstStr = fullSynSubstStr.replace("Syn. subst.: ", "");
313
            BotanicalName replacedSynonymName = makeBotanicalName(state, regNumber, fullSynSubstStr, synSubstStr, null, null);
314
            replacedSynonymName.addReplacedSynonym(taxonName, null, null, null);
315
            getNameService().save(replacedSynonymName);
316
        }
317

    
318
        Reference sec = state.getConfig().getSecReference();
319
        Taxon taxon = Taxon.NewInstance(taxonName, sec);
320

    
321
        // Basionym
322
        if(fullBasionymStr != null){
323
            fullBasionymStr = fullBasionymStr.replaceAll("^\\w*:\\s", ""); // Strip off the leading 'Basionym: "
324
            basionymNameStr = basionymNameStr.replaceAll("^\\w*:\\s", ""); // Strip off the leading 'Basionym: "
325
            BotanicalName basionym = makeBotanicalName(state, regNumber, fullBasionymStr, basionymNameStr, null, null);
326
            getNameService().save(basionym);
327
            taxonName.addBasionym(basionym);
328

    
329
            Synonym syn = Synonym.NewInstance(basionym, sec);
330
            taxon.addSynonym(syn, SynonymRelationshipType.HOMOTYPIC_SYNONYM_OF());
331
            getTaxonService().save(syn);
332
        }
333

    
334
        // Markers
335
        if(isFossil){
336
            taxon.addMarker(Marker.NewInstance(markerTypeFossil(), true));
337
        }
338

    
339
        // Types
340
        if(!StringUtils.isEmpty(typeStr)){
341

    
342
            if(taxonName.getRank().isSpecies() || taxonName.getRank().isLower(Rank.SPECIES())) {
343
                makeSpecimenTypeData(typeStr, taxonName, regNumber, state);
344
            } else {
345
                makeNameTypeData(typeStr, taxonName, regNumber, state);
346
            }
347
        }
348

    
349
        getTaxonService().save(taxon);
350

    
351
        if(taxonName.getRank().equals(Rank.SPECIES()) || taxonName.getRank().isLower(Rank.SPECIES())){
352
            // try to find the genus, it should have been imported already, Genera are coming first in the import file
353
            Taxon genus = ((IAPTImportState)state).getGenusTaxonMap().get(taxonName.getGenusOrUninomial());
354
            if(genus != null){
355
                higherTaxonNode = genus.getTaxonNodes().iterator().next();
356
            } else {
357
                logger.info(csvReportLine(regNumber, "Parent genus not found for", nameStr));
358
            }
359
        }
360

    
361
        if(higherTaxonNode != null){
362
            higherTaxonNode.addChildTaxon(taxon, null, null);
363
            getTaxonNodeService().save(higherTaxonNode);
364
        }
365

    
366
        if(taxonName.getRank().isGenus()){
367
            ((IAPTImportState)state).getGenusTaxonMap().put(taxonName.getGenusOrUninomial(), taxon);
368
        }
369

    
370
        return taxon;
371
    }
372

    
373
    private void makeSpecimenTypeData(String typeStr, BotanicalName taxonName, String regNumber, SimpleExcelTaxonImportState<CONFIG> state) {
374

    
375
        Matcher m = typeSpecimenSplitPattern.matcher(typeStr);
376

    
377
        if(m.matches()){
378
            String fieldUnitStr = m.group(TypesName.fieldUnit.name());
379
            // boolean isFieldUnit = typeStr.matches(".*([°']|\\d+\\s?m\\s|\\d+\\s?km\\s).*"); // check for location or unit m, km // makes no sense!!!!
380
            FieldUnit fieldUnit = parseFieldUnit(fieldUnitStr, regNumber, state);
381
            if(fieldUnit == null) {
382
                // create a field unit with only a titleCache using the fieldUnitStr substring
383
                logger.warn(csvReportLine(regNumber, "Type: fieldUnitStr can not be parsed", fieldUnitStr));
384
                fieldUnit = FieldUnit.NewInstance();
385
                fieldUnit.setTitleCache(fieldUnitStr, true);
386
                getOccurrenceService().save(fieldUnit);
387
            }
388
            getOccurrenceService().save(fieldUnit);
389

    
390
            // all others ..
391
            addSpecimenTypes(taxonName, fieldUnit, m.group(TypesName.holotype.name()), TypesName.holotype, false, regNumber);
392
            addSpecimenTypes(taxonName, fieldUnit, m.group(TypesName.isotype.name()), TypesName.isotype, true, regNumber);
393

    
394
        } else {
395
            // create a field unit with only a titleCache using the full typeStr
396
            FieldUnit fieldUnit = FieldUnit.NewInstance();
397
            fieldUnit.setTitleCache(typeStr, true);
398
            getOccurrenceService().save(fieldUnit);
399
            logger.warn(csvReportLine(regNumber, "Type: field 'Type' can not be parsed", typeStr));
400
        }
401
        getNameService().save(taxonName);
402
    }
403

    
404
    private void makeNameTypeData(String typeStr, BotanicalName taxonName, String regNumber, SimpleExcelTaxonImportState<CONFIG> state) {
405

    
406
        String nameStr = typeStr.replaceAll("^Type\\s?\\:\\s?", "");
407
        if(nameStr.isEmpty()) {
408
            return;
409
        }
410

    
411
        String basionymNameStr = null;
412
        String noteStr = null;
413
        String agentStr = null;
414

    
415
        Matcher m;
416

    
417
        if(typeStr.startsWith("not to be indicated")){
418
            // Special case:
419
            // Type: not to be indicated (Art. H.9.1. Tokyo Code); stated parent genera: Hechtia Klotzsch; Deuterocohnia Mez
420
            // FIXME
421
            m = typeNameSpecialSplitPattern.matcher(nameStr);
422
            if(m.matches()){
423
                nameStr = m.group("name");
424
                noteStr = m.group("note");
425
                agentStr = m.group("agent");
426
                // TODO better import of agent?
427
                if(agentStr != null){
428
                    noteStr = noteStr + ": " + agentStr;
429
                }
430
            }
431
        } else {
432
            // Generic case
433
            m = typeNameBasionymPattern.matcher(nameStr);
434
            if (m.find()) {
435
                basionymNameStr = m.group("basionymName");
436
                if (basionymNameStr != null) {
437
                    nameStr = nameStr.replace(m.group(0), "");
438
                }
439
            }
440

    
441
            m = typeNameNotePattern.matcher(nameStr);
442
            if (m.find()) {
443
                noteStr = m.group(1);
444
                if (noteStr != null) {
445
                    nameStr = nameStr.replace(m.group(0), "");
446
                }
447
            }
448
        }
449

    
450
        BotanicalName typeName = (BotanicalName) nameParser.parseFullName(nameStr, NomenclaturalCode.ICNAFP, null);
451

    
452
        if(typeName.isProtectedTitleCache() || typeName.getNomenclaturalReference() != null && typeName.getNomenclaturalReference().isProtectedTitleCache()) {
453
            logger.warn(csvReportLine(regNumber, "NameType not parsable", typeStr, nameStr));
454
        }
455

    
456
        if(basionymNameStr != null){
457
            BotanicalName basionymName = (BotanicalName) nameParser.parseFullName(nameStr, NomenclaturalCode.ICNAFP, null);
458
            getNameService().save(basionymName);
459
            typeName.addBasionym(basionymName);
460
        }
461

    
462

    
463
        NameTypeDesignation nameTypeDesignation = NameTypeDesignation.NewInstance();
464
        nameTypeDesignation.setTypeName(typeName);
465
        getNameService().save(typeName);
466

    
467
        if(noteStr != null){
468
            nameTypeDesignation.addAnnotation(Annotation.NewInstance(noteStr, AnnotationType.EDITORIAL(), Language.UNKNOWN_LANGUAGE()));
469
        }
470
        taxonName.addNameTypeDesignation(typeName, null, null, null, null, false);
471

    
472
    }
473

    
474
    /**
475
     * Currently only parses the collector, fieldNumber and the collection date.
476
     *
477
     * @param fieldUnitStr
478
     * @param regNumber
479
     * @param state
480
     * @return null if the fieldUnitStr could not be parsed
481
     */
482
    private FieldUnit parseFieldUnit(String fieldUnitStr, String regNumber, SimpleExcelTaxonImportState<CONFIG> state) {
483

    
484
        FieldUnit fieldUnit = null;
485

    
486
        Matcher m1 = collectorPattern.matcher(fieldUnitStr);
487
        if(m1.matches()){
488

    
489
            String collectorData = m1.group(2); // like (leg. Metzeltin, 30. 9. 1996)
490
            String removal = m1.group(1);
491
            if(collectorData == null){
492
                collectorData = m1.group(4); // like leg. Metzeltin, 30. 9. 1996
493
                removal = m1.group(3);
494
            }
495
            if(collectorData == null){
496
                return null;
497
            }
498

    
499
            // the fieldUnitStr is parsable
500
            // remove all collectorData from the fieldUnitStr and use the rest as locality
501
            String locality = fieldUnitStr.replace(removal, "");
502

    
503
            String collectorStr = null;
504
            String detailStr = null;
505
            Partial date = null;
506
            String fieldNumber = null;
507

    
508
            Matcher m2 = collectionDataPattern.matcher(collectorData);
509
            if(m2.matches()){
510
                collectorStr = m2.group("collector");
511
                detailStr = m2.group("detail");
512

    
513
                // Try to make sense of the detailStr
514
                if(detailStr != null){
515
                    detailStr = detailStr.trim();
516
                    // 1. try to parse as date
517
                    date = parseDate(regNumber, detailStr);
518
                    if(date == null){
519
                        // 2. try to parse as number
520
                        if(collectorsNumber.matcher(detailStr).matches()){
521
                            fieldNumber = detailStr;
522
                        }
523
                    }
524
                }
525
                if(date == null && fieldNumber == null){
526
                    // detailed parsing not possible, so need fo fallback
527
                    collectorStr = collectorData;
528
                }
529
            }
530

    
531
            if(collectorStr == null) {
532
                collectorStr = collectorData;
533
            }
534

    
535
            fieldUnit = FieldUnit.NewInstance();
536
            GatheringEvent ge = GatheringEvent.NewInstance();
537
            ge.setLocality(LanguageString.NewInstance(locality, Language.UNKNOWN_LANGUAGE()));
538

    
539
            TeamOrPersonBase agent =  state.getAgentBase(collectorStr);
540
            if(agent == null) {
541
                agent = Person.NewTitledInstance(collectorStr);
542
                getAgentService().save(agent);
543
                state.putAgentBase(collectorStr, agent);
544
            }
545
            ge.setCollector(agent);
546

    
547
            if(date != null){
548
                ge.setGatheringDate(date);
549
            }
550

    
551
            getEventBaseService().save(ge);
552
            fieldUnit.setGatheringEvent(ge);
553

    
554
            if(fieldNumber != null) {
555
                fieldUnit.setFieldNumber(fieldNumber);
556
            }
557
            getOccurrenceService().save(fieldUnit);
558

    
559
        }
560

    
561
        return fieldUnit;
562
    }
563

    
564
    protected Partial parseDate(String regNumber, String dateStr) {
565

    
566
        Partial pupDate = null;
567
        boolean parseError = false;
568

    
569
        String day = null;
570
        String month = null;
571
        String monthName = null;
572
        String year = null;
573

    
574
        for(Pattern p : datePatterns){
575
            Matcher m2 = p.matcher(dateStr);
576
            if(m2.matches()){
577
                try {
578
                    year = m2.group("year");
579
                } catch (IllegalArgumentException e){
580
                    // named capture group not found
581
                }
582
                try {
583
                    month = m2.group("month");
584
                } catch (IllegalArgumentException e){
585
                    // named capture group not found
586
                }
587

    
588
                try {
589
                    monthName = m2.group("monthName");
590
                    month = monthFromName(monthName, regNumber);
591
                    if(month == null){
592
                        parseError = true;
593
                    }
594
                } catch (IllegalArgumentException e){
595
                    // named capture group not found
596
                }
597
                try {
598
                    day = m2.group("day");
599
                } catch (IllegalArgumentException e){
600
                    // named capture group not found
601
                }
602

    
603
                if(year != null){
604
                    if (year.length() == 2) {
605
                        // it is an abbreviated year from the 19** years
606
                        year = "19" + year;
607
                    }
608
                    break;
609
                } else {
610
                    parseError = true;
611
                }
612
            }
613
        }
614
        if(year == null){
615
            parseError = true;
616
        }
617
        List<DateTimeFieldType> types = new ArrayList<>();
618
        List<Integer> values = new ArrayList<>();
619
        if(!parseError) {
620
            types.add(DateTimeFieldType.year());
621
            values.add(Integer.parseInt(year));
622
            if (month != null) {
623
                types.add(DateTimeFieldType.monthOfYear());
624
                values.add(Integer.parseInt(month));
625
            }
626
            if (day != null) {
627
                types.add(DateTimeFieldType.dayOfMonth());
628
                values.add(Integer.parseInt(day));
629
            }
630
            pupDate = new Partial(types.toArray(new DateTimeFieldType[types.size()]), ArrayUtils.toPrimitive(values.toArray(new Integer[values.size()])));
631
        }
632
        return pupDate;
633
    }
634

    
635
    private String monthFromName(String monthName, String regNumber) {
636

    
637
        Integer month = monthFromNameMap.get(monthName.toLowerCase());
638
        if(month == null){
639
            logger.warn(csvReportLine(regNumber, "Unknown month name", monthName));
640
            return null;
641
        } else {
642
            return month.toString();
643
        }
644
    }
645

    
646

    
647
    private void addSpecimenTypes(BotanicalName taxonName, FieldUnit fieldUnit, String typeStr, TypesName typeName, boolean multiple, String regNumber){
648

    
649
        if(StringUtils.isEmpty(typeStr)){
650
            return;
651
        }
652
        typeStr = typeStr.trim().replaceAll("\\.$", "");
653

    
654
        Collection collection = null;
655
        DerivedUnit specimen = null;
656

    
657
        List<DerivedUnit> specimens = new ArrayList<>();
658
        if(multiple){
659
            String[] tokens = typeStr.split("\\s?,\\s?");
660
            for (String t : tokens) {
661
                // command to  list all complex parsabel types:
662
                // csvcut -t -c RegistrationNo_Pk,Type iapt.csv | csvgrep -c Type -m "Holotype" | egrep -o 'Holotype:\s([A-Z]*\s)[^.]*?'
663
                // csvcut -t -c RegistrationNo_Pk,Type iapt.csv | csvgrep -c Type -m "Holotype" | egrep -o 'Isotype[^:]*:\s([A-Z]*\s)[^.]*?'
664

    
665
                if(!t.isEmpty()){
666
                    // trying to parse the string
667
                    specimen = parseSpecimenType(fieldUnit, typeName, collection, t, regNumber);
668
                    if(specimen != null){
669
                        specimens.add(specimen);
670
                    } else {
671
                        // parsing was not successful make simple specimen
672
                        specimens.add(makeSpecimenType(fieldUnit, t));
673
                    }
674
                }
675
            }
676
        } else {
677
            specimen = parseSpecimenType(fieldUnit, typeName, collection, typeStr, regNumber);
678
            if(specimen != null) {
679
                specimens.add(specimen);
680
                // remember current collection
681
                collection = specimen.getCollection();
682
            } else {
683
                // parsing was not successful make simple specimen
684
                specimens.add(makeSpecimenType(fieldUnit, typeStr));
685
            }
686
        }
687

    
688
        for(DerivedUnit s : specimens){
689
            taxonName.addSpecimenTypeDesignation(s, typeName.status(), null, null, null, false, true);
690
       }
691
    }
692

    
693
    private DerivedUnit makeSpecimenType(FieldUnit fieldUnit, String titleCache) {
694
        DerivedUnit specimen;DerivedUnitFacade facade = DerivedUnitFacade.NewInstance(SpecimenOrObservationType.PreservedSpecimen, fieldUnit);
695
        facade.setTitleCache(titleCache.trim(), true);
696
        specimen = facade.innerDerivedUnit();
697
        return specimen;
698
    }
699

    
700
    /**
701
     *
702
     * @param fieldUnit
703
     * @param typeName
704
     * @param collection
705
     * @param text
706
     * @param regNumber
707
     * @return
708
     */
709
    protected DerivedUnit parseSpecimenType(FieldUnit fieldUnit, TypesName typeName, Collection collection, String text, String regNumber) {
710

    
711
        DerivedUnit specimen = null;
712

    
713
        String collectionCode = null;
714
        String collectionTitle = null;
715
        String subCollectionStr = null;
716
        String instituteStr = null;
717
        String accessionNumber = null;
718

    
719
        boolean unusualAccessionNumber = false;
720

    
721
        text = text.trim();
722

    
723
        // 1.  For Isotypes often the accession number is noted alone if the
724
        //     preceeding entry has a collection code.
725
        if(typeName .equals(TypesName.isotype) && collection != null){
726
            Matcher m = accessionNumberOnlyPattern.matcher(text);
727
            if(m.matches()){
728
                try {
729
                    accessionNumber = m.group("accNumber");
730
                    specimen = makeSpecimenType(fieldUnit, collection, accessionNumber);
731
                } catch (IllegalArgumentException e){
732
                    // match group acc_number not found
733
                }
734
            }
735
        }
736

    
737
        //2. try it the 'normal' way
738
        if(specimen == null) {
739
            for (Pattern p : specimenTypePatterns) {
740
                Matcher m = p.matcher(text);
741
                if (m.matches()) {
742
                    // collection code or collectionTitle is mandatory
743
                    try {
744
                        collectionCode = m.group("colCode");
745
                    } catch (IllegalArgumentException e){
746
                        // match group colCode not found
747
                    }
748

    
749
                    try {
750
                        instituteStr = m.group("institute");
751
                    } catch (IllegalArgumentException e){
752
                        // match group col_name not found
753
                    }
754

    
755
                    try {
756
                        subCollectionStr = m.group("subCollection");
757
                    } catch (IllegalArgumentException e){
758
                        // match group subCollection not found
759
                    }
760
                    try {
761
                        accessionNumber = m.group("accNumber");
762

    
763
                        // try to improve the accessionNumber
764
                        if(accessionNumber!= null) {
765
                            accessionNumber = accessionNumber.trim();
766
                            Matcher m2 = accessionNumberOnlyPattern.matcher(accessionNumber);
767
                            String betterAccessionNumber = null;
768
                            if (m2.matches()) {
769
                                try {
770
                                    betterAccessionNumber = m.group("accNumber");
771
                                } catch (IllegalArgumentException e) {
772
                                    // match group acc_number not found
773
                                }
774
                            }
775
                            if (betterAccessionNumber != null) {
776
                                accessionNumber = betterAccessionNumber;
777
                            } else {
778
                                unusualAccessionNumber = true;
779
                            }
780
                        }
781

    
782
                    } catch (IllegalArgumentException e){
783
                        // match group acc_number not found
784
                    }
785

    
786
                    if(collectionCode == null && instituteStr == null){
787
                        logger.warn(csvReportLine(regNumber, "Type: neither 'collectionCode' nor 'institute' found in ", text));
788
                        continue;
789
                    }
790
                    collection = getCollection(collectionCode, instituteStr, subCollectionStr);
791
                    specimen = makeSpecimenType(fieldUnit, collection, accessionNumber);
792
                    break;
793
                }
794
            }
795
        }
796
        if(specimen == null) {
797
            logger.warn(csvReportLine(regNumber, "Type: Could not parse specimen", typeName.name().toString(), text));
798
        }
799
        if(unusualAccessionNumber){
800
            logger.warn(csvReportLine(regNumber, "Type: Unusual accession number", typeName.name().toString(), text, accessionNumber));
801
        }
802
        return specimen;
803
    }
804

    
805
    private DerivedUnit makeSpecimenType(FieldUnit fieldUnit, Collection collection, String accessionNumber) {
806

    
807
        DerivedUnitFacade facade = DerivedUnitFacade.NewInstance(SpecimenOrObservationType.PreservedSpecimen, fieldUnit);
808
        facade.setCollection(collection);
809
        if(accessionNumber != null){
810
            facade.setAccessionNumber(accessionNumber);
811
        }
812
        return facade.innerDerivedUnit();
813
    }
814

    
815
    private BotanicalName makeBotanicalName(SimpleExcelTaxonImportState<CONFIG> state, String regNumber, String titleCacheStr, String nameStr,
816
                                            String authorStr, String nomRefTitle) {
817

    
818
        BotanicalName taxonName;// cache field for the taxonName.titleCache
819
        String taxonNameTitleCache = null;
820
        Map<String, AnnotationType> nameAnnotations = new HashMap<>();
821

    
822
        // TitleCache preprocessing
823
        if(titleCacheStr.endsWith(ANNOTATION_MARKER_STRING) || (authorStr != null && authorStr.endsWith(ANNOTATION_MARKER_STRING))){
824
            nameAnnotations.put("Author abbreviation not checked.", AnnotationType.EDITORIAL());
825
            titleCacheStr = titleCacheStr.replace(ANNOTATION_MARKER_STRING, "").trim();
826
            if(authorStr != null) {
827
                authorStr = authorStr.replace(ANNOTATION_MARKER_STRING, "").trim();
828
            }
829
        }
830

    
831
        // parse the full taxon name
832
        if(!StringUtils.isEmpty(nomRefTitle)){
833
            String referenceSeparator = nomRefTitle.startsWith("in ") ? " " : ", ";
834
            String taxonFullNameStr = titleCacheStr + referenceSeparator + nomRefTitle;
835
            logger.debug(":::::" + taxonFullNameStr);
836
            taxonName = (BotanicalName) nameParser.parseReferencedName(taxonFullNameStr, NomenclaturalCode.ICNAFP, null);
837
        } else {
838
            taxonName = (BotanicalName) nameParser.parseFullName(titleCacheStr, NomenclaturalCode.ICNAFP, null);
839
        }
840

    
841
        taxonNameTitleCache = taxonName.getTitleCache().trim();
842
        if (taxonName.isProtectedTitleCache()) {
843
            logger.warn(csvReportLine(regNumber, "Name could not be parsed", titleCacheStr));
844
        } else {
845

    
846
            boolean doRestoreTitleCacheStr = false;
847

    
848
            // Check if titleCache and nameCache are plausible
849
            String titleCacheCompareStr = titleCacheStr;
850
            String nameCache = taxonName.getNameCache();
851
            String nameCompareStr = nameStr;
852
            if(taxonName.isBinomHybrid()){
853
                titleCacheCompareStr = titleCacheCompareStr.replace(" x ", " ×");
854
                nameCompareStr = nameCompareStr.replace(" x ", " ×");
855
            }
856
            if(taxonName.isMonomHybrid()){
857
                titleCacheCompareStr = titleCacheCompareStr.replaceAll("^X ", "× ");
858
                nameCompareStr = nameCompareStr.replace("^X ", "× ");
859
            }
860
            if(authorStr != null && authorStr.contains(" et ")){
861
                titleCacheCompareStr = titleCacheCompareStr.replaceAll(" et ", " & ");
862
            }
863
            if (!taxonNameTitleCache.equals(titleCacheCompareStr)) {
864
                logger.warn(csvReportLine(regNumber, "The generated titleCache differs from the imported string", taxonNameTitleCache, " != ", titleCacheStr, " ==> original titleCacheStr has been restored"));
865
                doRestoreTitleCacheStr = true;
866
            }
867
            if (!nameCache.trim().equals(nameCompareStr)) {
868
                logger.warn(csvReportLine(regNumber, "The parsed nameCache differs from field '" + NAMESTRING + "'", nameCache, " != ", nameCompareStr));
869
            }
870

    
871
            //  Author
872
            //nameParser.handleAuthors(taxonName, titleCacheStr, authorStr);
873
            //if (!titleCacheStr.equals(taxonName.getTitleCache())) {
874
            //    logger.warn(regNumber + ": titleCache has changed after setting authors, will restore original titleCacheStr");
875
            //    doRestoreTitleCacheStr = true;
876
            //}
877

    
878
            if(doRestoreTitleCacheStr){
879
                taxonName.setTitleCache(titleCacheStr, true);
880
            }
881

    
882
            // deduplicate
883
            replaceAuthorNamesAndNomRef(state, taxonName);
884
        }
885

    
886
        // Annotations
887
        if(!nameAnnotations.isEmpty()){
888
            for(String text : nameAnnotations.keySet()){
889
                taxonName.addAnnotation(Annotation.NewInstance(text, nameAnnotations.get(text), Language.DEFAULT()));
890
            }
891
        }
892

    
893
        taxonName.addSource(OriginalSourceType.Import, regNumber, null, state.getConfig().getSourceReference(), null);
894

    
895
        getNameService().save(taxonName);
896

    
897
        return taxonName;
898
    }
899

    
900
    /**
901
     * @param state
902
     * @return
903
     */
904
    private TaxonNode getClassificationRootNode(IAPTImportState state) {
905

    
906
     //   Classification classification = state.getClassification();
907
     //   if (classification == null){
908
     //       IAPTImportConfigurator config = state.getConfig();
909
     //       classification = Classification.NewInstance(state.getConfig().getClassificationName());
910
     //       classification.setUuid(config.getClassificationUuid());
911
     //       classification.setReference(config.getSecReference());
912
     //       classification = getClassificationService().find(state.getConfig().getClassificationUuid());
913
     //   }
914
        TaxonNode rootNode = state.getRootNode();
915
        if (rootNode == null){
916
            rootNode = getTaxonNodeService().find(ROOT_UUID);
917
        }
918
        if (rootNode == null){
919
            Classification classification = state.getClassification();
920
            if (classification == null){
921
                Reference sec = state.getSecReference();
922
                String classificationName = state.getConfig().getClassificationName();
923
                Language language = Language.DEFAULT();
924
                classification = Classification.NewInstance(classificationName, sec, language);
925
                state.setClassification(classification);
926
                classification.setUuid(state.getConfig().getClassificationUuid());
927
                classification.getRootNode().setUuid(ROOT_UUID);
928
                getClassificationService().save(classification);
929
            }
930
            rootNode = classification.getRootNode();
931
            state.setRootNode(rootNode);
932
        }
933
        return rootNode;
934
    }
935

    
936
    private Collection getCollection(String collectionCode, String instituteStr, String subCollectionStr){
937

    
938
        Collection superCollection = null;
939
        if(subCollectionStr != null){
940
            superCollection = getCollection(collectionCode, instituteStr, null);
941
            collectionCode = subCollectionStr;
942
            instituteStr = null;
943
        }
944

    
945
        final String key = collectionCode + "-#i:" + StringUtils.defaultString(instituteStr);
946

    
947
        Collection collection = collectionMap.get(key);
948

    
949
        if(collection == null) {
950
            collection = Collection.NewInstance();
951
            collection.setCode(collectionCode);
952
            if(instituteStr != null){
953
                collection.setInstitute(Institution.NewNamedInstance(instituteStr));
954
            }
955
            if(superCollection != null){
956
                collection.setSuperCollection(superCollection);
957
            }
958
            collectionMap.put(key, collection);
959
            if(!_testMode) {
960
                getCollectionService().save(collection);
961
            }
962
        }
963

    
964
        return collection;
965
    }
966

    
967

    
968
    /**
969
     * @param record
970
     * @param originalKey
971
     * @param doUnescapeHtmlEntities
972
     * @return
973
     */
974
    private String getValue(HashMap<String, String> record, String originalKey, boolean doUnescapeHtmlEntities) {
975
        String value = record.get(originalKey);
976

    
977
        value = fixCharacters(value);
978

    
979
        if (! StringUtils.isBlank(value)) {
980
        	if (logger.isDebugEnabled()) {
981
        	    logger.debug(originalKey + ": " + value);
982
        	}
983
        	value = CdmUtils.removeDuplicateWhitespace(value.trim()).toString();
984
            if(doUnescapeHtmlEntities){
985
                value = StringEscapeUtils.unescapeHtml(value);
986
            }
987
        	return value.trim();
988
        }else{
989
        	return null;
990
        }
991
    }
992

    
993
    /**
994
     * Fixes broken characters.
995
     * For details see
996
     * http://dev.e-taxonomy.eu/redmine/issues/6035
997
     *
998
     * @param value
999
     * @return
1000
     */
1001
    private String fixCharacters(String value) {
1002

    
1003
        value = StringUtils.replace(value, "s$K", "š");
1004
        value = StringUtils.replace(value, "n$K", "ň");
1005
        value = StringUtils.replace(value, "e$K", "ě");
1006
        value = StringUtils.replace(value, "r$K", "ř");
1007
        value = StringUtils.replace(value, "c$K", "č");
1008
        value = StringUtils.replace(value, "z$K", "ž");
1009
        value = StringUtils.replace(value, "S>U$K", "Š");
1010
        value = StringUtils.replace(value, "C>U$K", "Č");
1011
        value = StringUtils.replace(value, "R>U$K", "Ř");
1012
        value = StringUtils.replace(value, "Z>U$K", "Ž");
1013
        value = StringUtils.replace(value, "g$K", "ǧ");
1014
        value = StringUtils.replace(value, "s$A", "ś");
1015
        value = StringUtils.replace(value, "n$A", "ń");
1016
        value = StringUtils.replace(value, "c$A", "ć");
1017
        value = StringUtils.replace(value, "e$E", "ę");
1018
        value = StringUtils.replace(value, "o$H", "õ");
1019
        value = StringUtils.replace(value, "s$C", "ş");
1020
        value = StringUtils.replace(value, "t$C", "ț");
1021
        value = StringUtils.replace(value, "S>U$C", "Ş");
1022
        value = StringUtils.replace(value, "a$O", "å");
1023
        value = StringUtils.replace(value, "A>U$O", "Å");
1024
        value = StringUtils.replace(value, "u$O", "ů");
1025
        value = StringUtils.replace(value, "g$B", "ğ");
1026
        value = StringUtils.replace(value, "g$B", "ĕ");
1027
        value = StringUtils.replace(value, "a$B", "ă");
1028
        value = StringUtils.replace(value, "l$/", "ł");
1029
        value = StringUtils.replace(value, ">i", "ı");
1030
        value = StringUtils.replace(value, "i$U", "ï");
1031
        // Special-cases
1032
        value = StringUtils.replace(value, "&yacute", "ý");
1033
        value = StringUtils.replace(value, ">L", "Ł"); // corrected rule
1034
        value = StringUtils.replace(value, "E>U$D", "З");
1035
        value = StringUtils.replace(value, "S>U$E", "Ş");
1036
        value = StringUtils.replace(value, "s$E", "ş");
1037

    
1038
        value = StringUtils.replace(value, "c$k", "č");
1039
        value = StringUtils.replace(value, " U$K", " Š");
1040

    
1041
        value = StringUtils.replace(value, "O>U>!", "Ø");
1042
        value = StringUtils.replace(value, "o>!", "ø");
1043
        value = StringUtils.replace(value, "S$K", "Ŝ");
1044
        value = StringUtils.replace(value, ">l", "ğ");
1045

    
1046
        value = StringUtils.replace(value, "§B>i", "ł");
1047

    
1048

    
1049

    
1050
        return value;
1051
    }
1052

    
1053

    
1054
    /**
1055
	 *  Stores taxa records in DB
1056
	 */
1057
	@Override
1058
    protected void firstPass(SimpleExcelTaxonImportState<CONFIG> state) {
1059

    
1060
        String lineNumber = "L#" + state.getCurrentLine() + ": ";
1061
        logger.setLevel(Level.DEBUG);
1062
        HashMap<String, String> record = state.getOriginalRecord();
1063
        logger.debug(lineNumber + record.toString());
1064

    
1065
        Set<String> keys = record.keySet();
1066
        for (String key: keys) {
1067
            if (! expectedKeys.contains(key)){
1068
                logger.warn(lineNumber + "Unexpected Key: " + key);
1069
            }
1070
        }
1071

    
1072
        String reg_id = record.get(REGISTRATIONNO_PK);
1073

    
1074
        //higherTaxon
1075
        String higherTaxaString = record.get(HIGHERTAXON);
1076
        boolean isFossil = false;
1077
        if(higherTaxaString.startsWith("FOSSIL ")){
1078
            higherTaxaString = higherTaxaString.replace("FOSSIL ", "");
1079
            isFossil = true;
1080
        }
1081
        TaxonNode higherTaxon = getHigherTaxon(higherTaxaString, (IAPTImportState)state);
1082

    
1083
       //Taxon
1084
        Taxon taxon = makeTaxon(record, state, higherTaxon, isFossil);
1085
        if (taxon == null){
1086
            logger.warn(lineNumber + "taxon could not be created and is null");
1087
            return;
1088
        }
1089
        ((IAPTImportState)state).setCurrentTaxon(taxon);
1090

    
1091

    
1092
        logger.info("#of imported Genera: " + ((IAPTImportState) state).getGenusTaxonMap().size());
1093
		return;
1094
    }
1095

    
1096
    private TaxonNode getHigherTaxon(String higherTaxaString, IAPTImportState state) {
1097
        String[] higherTaxaNames = higherTaxaString.toLowerCase().replaceAll("[\\[\\]]", "").split(":");
1098
        TaxonNode higherTaxonNode = null;
1099

    
1100
        ITaxonTreeNode rootNode = getClassificationRootNode(state);
1101
        for (String htn :  higherTaxaNames) {
1102
            htn = StringUtils.capitalize(htn.trim());
1103
            Taxon higherTaxon = state.getHigherTaxon(htn);
1104
            if (higherTaxon != null){
1105
                higherTaxonNode = higherTaxon.getTaxonNodes().iterator().next();
1106
            }else{
1107
                BotanicalName name = makeHigherTaxonName(state, htn);
1108
                Reference sec = state.getSecReference();
1109
                higherTaxon = Taxon.NewInstance(name, sec);
1110
                getTaxonService().save(higherTaxon);
1111
                higherTaxonNode = rootNode.addChildTaxon(higherTaxon, sec, null);
1112
                state.putHigherTaxon(htn, higherTaxon);
1113
                getClassificationService().saveTreeNode(higherTaxonNode);
1114
            }
1115
            rootNode = higherTaxonNode;
1116
        }
1117
        return higherTaxonNode;
1118
    }
1119

    
1120
    private BotanicalName makeHigherTaxonName(IAPTImportState state, String name) {
1121

    
1122
        Rank rank = guessRank(name);
1123

    
1124
        BotanicalName taxonName = BotanicalName.NewInstance(rank);
1125
        taxonName.addSource(makeOriginalSource(state));
1126
        taxonName.setGenusOrUninomial(StringUtils.capitalize(name));
1127
        return taxonName;
1128
    }
1129

    
1130
    private Rank guessRank(String name) {
1131

    
1132
        // normalize
1133
        name = name.replaceAll("\\(.*\\)", "").trim();
1134

    
1135
        if(name.matches("^Plantae$|^Fungi$")){
1136
           return Rank.KINGDOM();
1137
        } else if(name.matches("^Incertae sedis$|^No group assigned$")){
1138
           return rankFamilyIncertisSedis();
1139
        } else if(name.matches(".*phyta$|.*mycota$")){
1140
           return Rank.PHYLUM();
1141
        } else if(name.matches(".*phytina$|.*mycotina$")){
1142
           return Rank.SUBPHYLUM();
1143
        } else if(name.matches("Gymnospermae$|.*ones$")){ // Monocotyledones, Dicotyledones
1144
            return rankUnrankedSupraGeneric();
1145
        } else if(name.matches(".*opsida$|.*phyceae$|.*mycetes$|.*ones$|^Musci$|^Hepaticae$")){
1146
           return Rank.CLASS();
1147
        } else if(name.matches(".*idae$|.*phycidae$|.*mycetidae$")){
1148
           return Rank.SUBCLASS();
1149
        } else if(name.matches(".*ales$")){
1150
           return Rank.ORDER();
1151
        } else if(name.matches(".*ineae$")){
1152
           return Rank.SUBORDER();
1153
        } else if(name.matches(".*aceae$")){
1154
            return Rank.FAMILY();
1155
        } else if(name.matches(".*oideae$")){
1156
           return Rank.SUBFAMILY();
1157
        } else
1158
        //    if(name.matches(".*eae$")){
1159
        //    return Rank.TRIBE();
1160
        // } else
1161
            if(name.matches(".*inae$")){
1162
           return Rank.SUBTRIBE();
1163
        } else if(name.matches(".*ae$")){
1164
           return Rank.FAMILY();
1165
        }
1166
        return Rank.UNKNOWN_RANK();
1167
    }
1168

    
1169
    private Rank rankUnrankedSupraGeneric() {
1170

    
1171
        if(rankUnrankedSupraGeneric == null){
1172
            rankUnrankedSupraGeneric = Rank.NewInstance(RankClass.Suprageneric, "Unranked supra generic", " ", " ");
1173
            getTermService().save(rankUnrankedSupraGeneric);
1174
        }
1175
        return rankUnrankedSupraGeneric;
1176
    }
1177

    
1178
    private Rank rankFamilyIncertisSedis() {
1179

    
1180
        if(familyIncertisSedis == null){
1181
            familyIncertisSedis = Rank.NewInstance(RankClass.Suprageneric, "Family incertis sedis", " ", " ");
1182
            getTermService().save(familyIncertisSedis);
1183
        }
1184
        return familyIncertisSedis;
1185
    }
1186

    
1187
    private AnnotationType annotationTypeCaveats(){
1188
        if(annotationTypeCaveats == null){
1189
            annotationTypeCaveats = AnnotationType.NewInstance("Caveats", "Caveats", "");
1190
            getTermService().save(annotationTypeCaveats);
1191
        }
1192
        return annotationTypeCaveats;
1193
    }
1194

    
1195

    
1196
    /**
1197
     * @param state
1198
     * @return
1199
     */
1200
    private IdentifiableSource makeOriginalSource(IAPTImportState state) {
1201
        return IdentifiableSource.NewDataImportInstance("line: " + state.getCurrentLine(), null, state.getConfig().getSourceReference());
1202
    }
1203

    
1204

    
1205
    private Reference makeReference(IAPTImportState state, UUID uuidRef) {
1206
        Reference ref = state.getReference(uuidRef);
1207
        if (ref == null){
1208
            ref = getReferenceService().find(uuidRef);
1209
            state.putReference(uuidRef, ref);
1210
        }
1211
        return ref;
1212
    }
1213

    
1214
    private MarkerType markerTypeFossil(){
1215
        if(this.markerTypeFossil == null){
1216
            markerTypeFossil = MarkerType.NewInstance("isFossilTaxon", "isFossil", null);
1217
            getTermService().save(this.markerTypeFossil);
1218
        }
1219
        return markerTypeFossil;
1220
    }
1221

    
1222
    private String csvReportLine(String regId, String message, String ... fields){
1223
        StringBuilder out = new StringBuilder("regID#");
1224
        out.append(regId).append(",\"").append(message).append('"');
1225

    
1226
        for(String f : fields){
1227
            out.append(",\"").append(f).append('"');
1228
        }
1229
        return out.toString();
1230
    }
1231

    
1232

    
1233
}
(1-1/4)