Project

General

Profile

Download (53.6 KB) Statistics
| Branch: | Revision:
1
/**
2
 * Copyright (C) 2007 EDIT
3
 * European Distributed Institute of Taxonomy
4
 * http://www.e-taxonomy.eu
5
 *
6
 * The contents of this file are subject to the Mozilla Public License Version 1.1
7
 * See LICENSE.TXT at the top of this package for the full license terms.
8
 */
9

    
10
package eu.etaxonomy.cdm.io.iapt;
11

    
12
import eu.etaxonomy.cdm.api.facade.DerivedUnitFacade;
13
import eu.etaxonomy.cdm.common.CdmUtils;
14
import eu.etaxonomy.cdm.io.mexico.SimpleExcelTaxonImport;
15
import eu.etaxonomy.cdm.io.mexico.SimpleExcelTaxonImportState;
16
import eu.etaxonomy.cdm.model.agent.Institution;
17
import eu.etaxonomy.cdm.model.agent.Person;
18
import eu.etaxonomy.cdm.model.agent.TeamOrPersonBase;
19
import eu.etaxonomy.cdm.model.common.*;
20
import eu.etaxonomy.cdm.model.name.*;
21
import eu.etaxonomy.cdm.model.occurrence.*;
22
import eu.etaxonomy.cdm.model.occurrence.Collection;
23
import eu.etaxonomy.cdm.model.reference.Reference;
24
import eu.etaxonomy.cdm.model.reference.ReferenceFactory;
25
import eu.etaxonomy.cdm.model.taxon.*;
26
import eu.etaxonomy.cdm.strategy.parser.NonViralNameParserImpl;
27
import org.apache.commons.lang.ArrayUtils;
28
import org.apache.commons.lang.StringEscapeUtils;
29
import org.apache.commons.lang.StringUtils;
30
import org.apache.log4j.Level;
31
import org.apache.log4j.Logger;
32
import org.joda.time.DateTimeFieldType;
33
import org.joda.time.Partial;
34
import org.joda.time.format.DateTimeFormat;
35
import org.joda.time.format.DateTimeFormatter;
36
import org.springframework.stereotype.Component;
37

    
38
import java.util.*;
39
import java.util.regex.Matcher;
40
import java.util.regex.Pattern;
41

    
42
/**
43
 * @author a.mueller
44
 * @created 05.01.2016
45
 */
46

    
47
@Component("iAPTExcelImport")
48
public class IAPTExcelImport<CONFIG extends IAPTImportConfigurator> extends SimpleExcelTaxonImport<CONFIG> {
49
    private static final long serialVersionUID = -747486709409732371L;
50
    private static final Logger logger = Logger.getLogger(IAPTExcelImport.class);
51
    public static final String ANNOTATION_MARKER_STRING = "[*]";
52

    
53

    
54
    private static UUID ROOT_UUID = UUID.fromString("4137fd2a-20f6-4e70-80b9-f296daf51d82");
55

    
56
    private static NonViralNameParserImpl nameParser = NonViralNameParserImpl.NewInstance();
57

    
58
    private final static String REGISTRATIONNO_PK= "RegistrationNo_Pk";
59
    private final static String HIGHERTAXON= "HigherTaxon";
60
    private final static String FULLNAME= "FullName";
61
    private final static String AUTHORSSPELLING= "AuthorsSpelling";
62
    private final static String LITSTRING= "LitString";
63
    private final static String REGISTRATION= "Registration";
64
    private final static String TYPE= "Type";
65
    private final static String CAVEATS= "Caveats";
66
    private final static String FULLBASIONYM= "FullBasionym";
67
    private final static String FULLSYNSUBST= "FullSynSubst";
68
    private final static String NOTESTXT= "NotesTxt";
69
    private final static String REGDATE= "RegDate";
70
    private final static String NAMESTRING= "NameString";
71
    private final static String BASIONYMSTRING= "BasionymString";
72
    private final static String SYNSUBSTSTR= "SynSubstStr";
73
    private final static String AUTHORSTRING= "AuthorString";
74

    
75
    private  static List<String> expectedKeys= Arrays.asList(new String[]{
76
            REGISTRATIONNO_PK, HIGHERTAXON, FULLNAME, AUTHORSSPELLING, LITSTRING, REGISTRATION, TYPE, CAVEATS, FULLBASIONYM, FULLSYNSUBST, NOTESTXT, REGDATE, NAMESTRING, BASIONYMSTRING, SYNSUBSTSTR, AUTHORSTRING});
77

    
78
    private static final Pattern nomRefTokenizeP = Pattern.compile("^(?<title>.*):\\s(?<detail>[^\\.:]+)\\.(?<date>.*?)(?:\\s\\((?<issue>[^\\)]*)\\)\\s*)?\\.?$");
79
    private static final Pattern[] datePatterns = new Pattern[]{
80
            // NOTE:
81
            // The order of the patterns is extremely important!!!
82
            //
83
            // all patterns cover the years 1700 - 1999
84
            Pattern.compile("^(?<year>1[7,8,9][0-9]{2})$"), // only year, like '1969'
85
            Pattern.compile("^(?<monthName>\\p{L}+\\.?)\\s(?<day>[0-9]{1,2})(?:st|rd|th)?\\.?,?\\s(?<year>(?:1[7,8,9])?[0-9]{2})$"), // full date like April 12, 1969 or april 12th 1999
86
            Pattern.compile("^(?<monthName>\\p{L}+\\.?),?\\s?(?<year>(?:1[7,8,9])?[0-9]{2})$"), // April 99 or April, 1999 or Apr. 12
87
            Pattern.compile("^(?<day>[0-9]{1,2})([\\.\\-/])(\\s?)(?<month>[0-1]?[0-9])\\2\\3(?<year>(?:1[7,8,9])?[0-9]{2})$"), // full date like 12.04.1969 or 12. 04. 1969 or 12/04/1969 or 12-04-1969
88
            Pattern.compile("^(?<day>[0-9]{1,2})([\\.\\-/])(?<monthName>[IVX]{1,2})\\2(?<year>(?:1[7,8,9])?[0-9]{2})$"), // full date like 12-VI-1969
89
            Pattern.compile("^(?:(?<day>[0-9]{1,2})(?:\\sde)?\\s)?(?<monthName>\\p{L}+)(?:\\sde)?\\s(?<year>(?:1[7,8,9])?[0-9]{2})$"), // full and partial date like 12 de Enero de 1999 or Enero de 1999
90
            Pattern.compile("^(?<month>[0-1]?[0-9])([\\.\\-/])(?<year>(?:1[7,8,9])?[0-9]{2})$"), // partial date like 04.1969 or 04/1969 or 04-1969
91
            Pattern.compile("^(?<year>(?:1[7,8,9])?[0-9]{2})([\\.\\-/])(?<month>[0-1]?[0-9])$"),//  partial date like 1999-04
92
            Pattern.compile("^(?<monthName>[IVX]{1,2})([\\.\\-/])(?<year>(?:1[7,8,9])?[0-9]{2})$"), // partial date like VI-1969
93
            Pattern.compile("^(?<day>[0-9]{1,2})(?:[\\./]|th|rd|st)?\\s(?<monthName>\\p{L}+\\.?),?\\s?(?<year>(?:1[7,8,9])?[0-9]{2})$"), // full date like 12. April 1969 or april 1999 or 22 Dec.1999
94
        };
95
    protected static final Pattern typeSpecimenSplitPattern =  Pattern.compile("^(?:\"*[Tt]ype: (?<fieldUnit>.*?))(?:[Hh]olotype:(?<holotype>.*?)\\.?)?(?:[Ii]sotype[^:]*:(?<isotype>.*)\\.?)?\\.?$");
96

    
97
    private static final Pattern typeNameBasionymPattern =  Pattern.compile("\\([Bb]asionym\\s?\\:\\s?(?<basionymName>[^\\)]*).*$");
98
    private static final Pattern typeNameNotePattern =  Pattern.compile("\\[([^\\[]*)"); // matches the inner of '[...]'
99
    private static final Pattern typeNameSpecialSplitPattern =  Pattern.compile("(?<note>.*\\;.*?)\\:(?<agent>)\\;(<name>.*)");
100

    
101
    protected static final Pattern collectorPattern =  Pattern.compile(".*?(?<fullStr1>\\([Ll]eg\\.\\s+(?<data1>[^\\)]*)\\)).*$|.*?(?<fullStr2>\\s[Ll]eg\\.\\:?\\s+(?<data2>.*?)\\.?)$|^(?<fullStr3>[Ll]eg\\.\\:?\\s+(?<data3>.*?)\\.?)");
102
    private static final Pattern collectionDataPattern =  Pattern.compile("^(?<collector>[^,]*),\\s?(?<detail>.*?)\\.?$");
103
    private static final Pattern collectorsNumber =  Pattern.compile("^([nN]o\\.\\s.*)$");
104

    
105
    // AccessionNumbers: , #.*, n°:?, 96/3293, No..*, -?\w{1,3}-[0-9\-/]*
106
    private static final Pattern accessionNumberOnlyPattern = Pattern.compile("^(?<accNumber>(?:n°\\:?\\s?|#|No\\.?\\s?)?[\\d\\w\\-/]*)$");
107

    
108
    private static final Pattern[] specimenTypePatterns = new Pattern[]{
109
            Pattern.compile("^(?<colCode>[A-Z]+|CPC Micropaleontology Lab\\.?)\\s+(?:\\((?<institute>.*[^\\)])\\))(?<accNumber>.*)?$"), // like: GAUF (Gansu Agricultural University) No. 1207-1222
110
            Pattern.compile("^(?<colCode>[A-Z]+|CPC Micropaleontology Lab\\.?)\\s+(?:Coll\\.\\s(?<subCollection>[^\\.,;]*)(.))(?<accNumber>.*)?$"), // like KASSEL Coll. Krasske, Praep. DII 78
111
            Pattern.compile("^(?:in\\s)?(?<institute>[Cc]oll\\.\\s.*?)(?:\\s+(?<accNumber>(Praep\\.|slide|No\\.|Inv\\. Nr\\.|Nr\\.).*))?$"), // like Coll. Lange-Bertalot, Bot. Inst., Univ. Frankfurt/Main, Germany Praep. Neukaledonien OTL 62
112
            Pattern.compile("^(?<institute>Inst\\.\\s.*?)\\s+(?<accNumber>N\\s.*)?$"), // like Inst. Geological Sciences, Acad. Sci. Belarus, Minsk N 212 A
113
            Pattern.compile("^(?<colCode>[A-Z]+)(?:\\s+(?<accNumber>.*))?$"), // identifies the Collection code and takes the rest as accessionNumber if any
114
    };
115

    
116
    private static Map<String, Integer> monthFromNameMap = new HashMap<>();
117

    
118
    static {
119
        String[] ck = new String[]{"leden", "únor", "březen", "duben", "květen", "červen", "červenec ", "srpen", "září", "říjen", "listopad", "prosinec"};
120
        String[] fr = new String[]{"janvier", "février", "mars", "avril", "mai", "juin", "juillet", "août", "septembre", "octobre", "novembre", "décembre"};
121
        String[] de = new String[]{"januar", "februar", "märz", "april", "mai", "juni", "juli", "august", "september", "oktober", "november", "dezember"};
122
        String[] en = new String[]{"january", "february", "march", "april", "may", "june", "july", "august", "september", "october", "november", "december"};
123
        String[] it = new String[]{"gennaio", "febbraio", "marzo", "aprile", "maggio", "giugno", "luglio", "agosto", "settembre", "ottobre", "novembre", "dicembre"};
124
        String[] sp = new String[]{"enero", "febrero", "marzo", "abril", "mayo", "junio", "julio", "agosto", "septiembre", "octubre", "noviembre", "diciembre"};
125
        String[] de_abbrev = new String[]{"jan.", "feb.", "märz", "apr.", "mai", "jun.", "jul.", "aug.", "sept.", "okt.", "nov.", "dez."};
126
        String[] en_abbrev = new String[]{"jan.", "feb.", "mar.", "apr.", "may", "jun.", "jul.", "aug.", "sep.", "oct.", "nov.", "dec."};
127
        String[] port = new String[]{"Janeiro", "Fevereiro", "Março", "Abril", "Maio", "Junho", "Julho", "Agosto", "Setembro", "Outubro", "Novembro", "Dezembro"};
128
        String[] rom_num = new String[]{"i", "ii", "iii", "iv", "v", "vi", "vii", "viii", "ix", "x", "xi", "xii"};
129

    
130
        String[][] perLang =  new String[][]{ck, de, fr, en, it, sp, port, de_abbrev, en_abbrev, rom_num};
131

    
132
        for (String[] months: perLang) {
133
            for(int m = 1; m < 13; m++){
134
                monthFromNameMap.put(months[m - 1].toLowerCase(), m);
135
            }
136
        }
137

    
138
        // special cases
139
        monthFromNameMap.put("mar", 3);
140
        monthFromNameMap.put("dec", 12);
141
        monthFromNameMap.put("februari", 2);
142
        monthFromNameMap.put("març", 3);
143
    }
144

    
145

    
146
    DateTimeFormatter formatterYear = DateTimeFormat.forPattern("yyyy");
147

    
148
    private Map<String, Collection> collectionMap = new HashMap<>();
149

    
150

    
151
    enum TypesName {
152
        fieldUnit, holotype, isotype;
153

    
154
        public SpecimenTypeDesignationStatus status(){
155
            switch (this) {
156
                case holotype:
157
                    return SpecimenTypeDesignationStatus.HOLOTYPE();
158
                case isotype:
159
                    return SpecimenTypeDesignationStatus.ISOTYPE();
160
                default:
161
                    return null;
162
            }
163
        }
164
    }
165

    
166
    private MarkerType markerTypeFossil = null;
167
    private Rank rankUnrankedSupraGeneric = null;
168
    private Rank familyIncertisSedis = null;
169
    private AnnotationType annotationTypeCaveats = null;
170

    
171
    private Reference bookVariedadesTradicionales = null;
172

    
173
    /**
174
     * HACK for unit simple testing
175
     */
176
    boolean _testMode = System.getProperty("TEST_MODE") != null;
177

    
178
    private Taxon makeTaxon(HashMap<String, String> record, SimpleExcelTaxonImportState<CONFIG> state,
179
                            TaxonNode higherTaxonNode, boolean isFossil) {
180

    
181
        String regNumber = getValue(record, REGISTRATIONNO_PK, false);
182
        String regStr = getValue(record, REGISTRATION, true);
183
        String titleCacheStr = getValue(record, FULLNAME, true);
184
        String nameStr = getValue(record, NAMESTRING, true);
185
        String authorStr = getValue(record, AUTHORSTRING, true);
186
        String nomRefStr = getValue(record, LITSTRING, true);
187
        String authorsSpelling = getValue(record, AUTHORSSPELLING, true);
188
        String notesTxt = getValue(record, NOTESTXT, true);
189
        String caveats = getValue(record, CAVEATS, true);
190
        String fullSynSubstStr = getValue(record, FULLSYNSUBST, true);
191
        String fullBasionymStr = getValue(record, FULLBASIONYM, true);
192
        String basionymNameStr = getValue(record, FULLBASIONYM, true);
193
        String synSubstStr = getValue(record, SYNSUBSTSTR, true);
194
        String typeStr = getValue(record, TYPE, true);
195

    
196

    
197
        String nomRefTitle = null;
198
        String nomRefDetail;
199
        String nomRefPupDate = null;
200
        String nomRefIssue = null;
201
        Partial pupDate = null;
202

    
203
        boolean restoreOriginalReference = false;
204
        boolean nameIsValid = true;
205

    
206
        // preprocess nomRef: separate citation, reference detail, publishing date
207
        if(!StringUtils.isEmpty(nomRefStr)){
208
            nomRefStr = nomRefStr.trim();
209

    
210
            // handle the special case which is hard to parse:
211
            //
212
            // Las variedades tradicionales de frutales de la Cuenca del Río Segura. Catálogo Etnobotánico (1): Frutos secos, oleaginosos, frutales de hueso, almendros y frutales de pepita: 154. 1997.
213
            if(nomRefStr.startsWith("Las variedades tradicionales de frutales ")){
214

    
215
                if(bookVariedadesTradicionales == null){
216
                    bookVariedadesTradicionales = ReferenceFactory.newBook();
217
                    bookVariedadesTradicionales.setTitle("Las variedades tradicionales de frutales de la Cuenca del Río Segura. Catálogo Etnobotánico (1): Frutos secos, oleaginosos, frutales de hueso, almendros y frutales de pepita");
218
                    bookVariedadesTradicionales.setDatePublished(TimePeriod.NewInstance(1997));
219
                    getReferenceService().save(bookVariedadesTradicionales);
220
                }
221
                nomRefStr = nomRefStr.replaceAll("^.*?\\:.*?\\:", "Las variedades tradicionales:");
222
                restoreOriginalReference = true;
223
            }
224

    
225
            Matcher m = nomRefTokenizeP.matcher(nomRefStr);
226
            if(m.matches()){
227
                nomRefTitle = m.group("title");
228
                nomRefDetail = m.group("detail");
229
                nomRefPupDate = m.group("date").trim();
230
                nomRefIssue = m.group("issue");
231

    
232
                pupDate = parseDate(regNumber, nomRefPupDate);
233
                if (pupDate != null) {
234
                    nomRefTitle = nomRefTitle + ": " + nomRefDetail + ". " + pupDate.toString(formatterYear) + ".";
235
                } else {
236
                    logger.warn(csvReportLine(regNumber, "Pub date", nomRefPupDate, "in", nomRefStr, "not parsable"));
237
                }
238
            } else {
239
                nomRefTitle = nomRefStr;
240
            }
241
        }
242

    
243
        BotanicalName taxonName = makeBotanicalName(state, regNumber, titleCacheStr, nameStr, authorStr, nomRefTitle);
244

    
245
        // always add the original strings of parsed data as annotation
246
        taxonName.addAnnotation(Annotation.NewInstance("imported and parsed data strings:" +
247
                        "\n -  '" + LITSTRING + "': "+ nomRefStr +
248
                        "\n -  '" + TYPE + "': " + typeStr +
249
                        "\n -  '" + REGISTRATION  + "': " + regStr
250
                , AnnotationType.TECHNICAL(), Language.DEFAULT()));
251

    
252
        if(restoreOriginalReference){
253
            taxonName.setNomenclaturalReference(bookVariedadesTradicionales);
254
        }
255
        if(pupDate != null) {
256
            taxonName.getNomenclaturalReference().setDatePublished(TimePeriod.NewInstance(pupDate));
257
        }
258
        if(nomRefIssue != null) {
259
            ((Reference)taxonName.getNomenclaturalReference()).setVolume(nomRefIssue);
260
        }
261

    
262

    
263
        if(!StringUtils.isEmpty(notesTxt)){
264
            notesTxt = notesTxt.replace("Notes: ", "").trim();
265
            taxonName.addAnnotation(Annotation.NewInstance(notesTxt, AnnotationType.EDITORIAL(), Language.DEFAULT()));
266
            nameIsValid = false;
267

    
268
        }
269
        if(!StringUtils.isEmpty(caveats)){
270
            caveats = caveats.replace("Caveats: ", "").trim();
271
            taxonName.addAnnotation(Annotation.NewInstance(caveats, annotationTypeCaveats(), Language.DEFAULT()));
272
            nameIsValid = false;
273
        }
274

    
275
        if(nameIsValid){
276
            // Status is always considered valid if no notes and cavets are set
277
            taxonName.addStatus(NomenclaturalStatus.NewInstance(NomenclaturalStatusType.VALID()));
278
        }
279

    
280
        getNameService().save(taxonName);
281

    
282
        // Namerelations
283
        if(!StringUtils.isEmpty(authorsSpelling)){
284
            authorsSpelling = authorsSpelling.replaceFirst("Author's spelling:", "").replaceAll("\"", "").trim();
285

    
286
            String[] authorSpellingTokens = StringUtils.split(authorsSpelling, " ");
287
            String[] nameStrTokens = StringUtils.split(nameStr, " ");
288

    
289
            ArrayUtils.reverse(authorSpellingTokens);
290
            ArrayUtils.reverse(nameStrTokens);
291

    
292
            for (int i = 0; i < nameStrTokens.length; i++){
293
                if(i < authorSpellingTokens.length){
294
                    nameStrTokens[i] = authorSpellingTokens[i];
295
                }
296
            }
297
            ArrayUtils.reverse(nameStrTokens);
298

    
299
            String misspelledNameStr = StringUtils.join (nameStrTokens, ' ');
300
            // build the fullnameString of the misspelled name
301
            misspelledNameStr = taxonName.getTitleCache().replace(nameStr, misspelledNameStr);
302

    
303
            TaxonNameBase misspelledName = (BotanicalName) nameParser.parseReferencedName(misspelledNameStr, NomenclaturalCode.ICNAFP, null);
304
            misspelledName.addRelationshipToName(taxonName, NameRelationshipType.MISSPELLING(), null);
305
            getNameService().save(misspelledName);
306
        }
307

    
308
        // Replaced Synonyms
309
        if(!StringUtils.isEmpty(fullSynSubstStr)){
310
            fullSynSubstStr = fullSynSubstStr.replace("Syn. subst.: ", "");
311
            BotanicalName replacedSynonymName = makeBotanicalName(state, regNumber, fullSynSubstStr, synSubstStr, null, null);
312
            replacedSynonymName.addReplacedSynonym(taxonName, null, null, null);
313
            getNameService().save(replacedSynonymName);
314
        }
315

    
316
        Reference sec = state.getConfig().getSecReference();
317
        Taxon taxon = Taxon.NewInstance(taxonName, sec);
318

    
319
        // Basionym
320
        if(fullBasionymStr != null){
321
            fullBasionymStr = fullBasionymStr.replaceAll("^\\w*:\\s", ""); // Strip off the leading 'Basionym: "
322
            basionymNameStr = basionymNameStr.replaceAll("^\\w*:\\s", ""); // Strip off the leading 'Basionym: "
323
            BotanicalName basionym = makeBotanicalName(state, regNumber, fullBasionymStr, basionymNameStr, null, null);
324
            getNameService().save(basionym);
325
            taxonName.addBasionym(basionym);
326

    
327
            Synonym syn = Synonym.NewInstance(basionym, sec);
328
            taxon.addSynonym(syn, SynonymRelationshipType.HOMOTYPIC_SYNONYM_OF());
329
            getTaxonService().save(syn);
330
        }
331

    
332
        // Markers
333
        if(isFossil){
334
            taxon.addMarker(Marker.NewInstance(markerTypeFossil(), true));
335
        }
336

    
337
        // Types
338
        if(!StringUtils.isEmpty(typeStr)){
339

    
340
            if(taxonName.getRank().isSpecies() || taxonName.getRank().isLower(Rank.SPECIES())) {
341
                makeSpecimenTypeData(typeStr, taxonName, regNumber, state, false);
342
            } else {
343
                makeNameTypeData(typeStr, taxonName, regNumber, state);
344
            }
345
        }
346

    
347
        getTaxonService().save(taxon);
348

    
349
        if(taxonName.getRank().equals(Rank.SPECIES()) || taxonName.getRank().isLower(Rank.SPECIES())){
350
            // try to find the genus, it should have been imported already, Genera are coming first in the import file
351
            Taxon genus = ((IAPTImportState)state).getGenusTaxonMap().get(taxonName.getGenusOrUninomial());
352
            if(genus != null){
353
                higherTaxonNode = genus.getTaxonNodes().iterator().next();
354
            } else {
355
                logger.info(csvReportLine(regNumber, "Parent genus not found for", nameStr));
356
            }
357
        }
358

    
359
        if(higherTaxonNode != null){
360
            higherTaxonNode.addChildTaxon(taxon, null, null);
361
            getTaxonNodeService().save(higherTaxonNode);
362
        }
363

    
364
        if(taxonName.getRank().isGenus()){
365
            ((IAPTImportState)state).getGenusTaxonMap().put(taxonName.getGenusOrUninomial(), taxon);
366
        }
367

    
368
        return taxon;
369
    }
370

    
371
    private void makeSpecimenTypeData(String typeStr, BotanicalName taxonName, String regNumber, SimpleExcelTaxonImportState<CONFIG> state, boolean isFossil) {
372

    
373
        Matcher m = typeSpecimenSplitPattern.matcher(typeStr);
374

    
375
        if(m.matches()){
376
            String fieldUnitStr = m.group(TypesName.fieldUnit.name());
377
            // boolean isFieldUnit = typeStr.matches(".*([°']|\\d+\\s?m\\s|\\d+\\s?km\\s).*"); // check for location or unit m, km // makes no sense!!!!
378
            FieldUnit fieldUnit = parseFieldUnit(fieldUnitStr, regNumber, state);
379
            if(fieldUnit == null) {
380
                // create a field unit with only a titleCache using the fieldUnitStr substring
381
                logger.warn(csvReportLine(regNumber, "Type: fieldUnitStr can not be parsed", fieldUnitStr));
382
                fieldUnit = FieldUnit.NewInstance();
383
                fieldUnit.setTitleCache(fieldUnitStr, true);
384
                getOccurrenceService().save(fieldUnit);
385
            }
386
            getOccurrenceService().save(fieldUnit);
387

    
388
            SpecimenOrObservationType specimenType;
389
            if(isFossil){
390
                specimenType = SpecimenOrObservationType.Fossil;
391
            } else {
392
                specimenType = SpecimenOrObservationType.PreservedSpecimen;
393
            }
394

    
395
            // all others ..
396
            addSpecimenTypes(taxonName, fieldUnit, m.group(TypesName.holotype.name()), TypesName.holotype, false, regNumber, specimenType);
397
            addSpecimenTypes(taxonName, fieldUnit, m.group(TypesName.isotype.name()), TypesName.isotype, true, regNumber, specimenType);
398

    
399
        } else {
400
            // create a field unit with only a titleCache using the full typeStr
401
            FieldUnit fieldUnit = FieldUnit.NewInstance();
402
            fieldUnit.setTitleCache(typeStr, true);
403
            getOccurrenceService().save(fieldUnit);
404
            logger.warn(csvReportLine(regNumber, "Type: field 'Type' can not be parsed", typeStr));
405
        }
406
        getNameService().save(taxonName);
407
    }
408

    
409
    private void makeNameTypeData(String typeStr, BotanicalName taxonName, String regNumber, SimpleExcelTaxonImportState<CONFIG> state) {
410

    
411
        String nameStr = typeStr.replaceAll("^Type\\s?\\:\\s?", "");
412
        if(nameStr.isEmpty()) {
413
            return;
414
        }
415

    
416
        String basionymNameStr = null;
417
        String noteStr = null;
418
        String agentStr = null;
419

    
420
        Matcher m;
421

    
422
        if(typeStr.startsWith("not to be indicated")){
423
            // Special case:
424
            // Type: not to be indicated (Art. H.9.1. Tokyo Code); stated parent genera: Hechtia Klotzsch; Deuterocohnia Mez
425
            // FIXME
426
            m = typeNameSpecialSplitPattern.matcher(nameStr);
427
            if(m.matches()){
428
                nameStr = m.group("name");
429
                noteStr = m.group("note");
430
                agentStr = m.group("agent");
431
                // TODO better import of agent?
432
                if(agentStr != null){
433
                    noteStr = noteStr + ": " + agentStr;
434
                }
435
            }
436
        } else {
437
            // Generic case
438
            m = typeNameBasionymPattern.matcher(nameStr);
439
            if (m.find()) {
440
                basionymNameStr = m.group("basionymName");
441
                if (basionymNameStr != null) {
442
                    nameStr = nameStr.replace(m.group(0), "");
443
                }
444
            }
445

    
446
            m = typeNameNotePattern.matcher(nameStr);
447
            if (m.find()) {
448
                noteStr = m.group(1);
449
                if (noteStr != null) {
450
                    nameStr = nameStr.replace(m.group(0), "");
451
                }
452
            }
453
        }
454

    
455
        BotanicalName typeName = (BotanicalName) nameParser.parseFullName(nameStr, NomenclaturalCode.ICNAFP, null);
456

    
457
        if(typeName.isProtectedTitleCache() || typeName.getNomenclaturalReference() != null && typeName.getNomenclaturalReference().isProtectedTitleCache()) {
458
            logger.warn(csvReportLine(regNumber, "NameType not parsable", typeStr, nameStr));
459
        }
460

    
461
        if(basionymNameStr != null){
462
            BotanicalName basionymName = (BotanicalName) nameParser.parseFullName(nameStr, NomenclaturalCode.ICNAFP, null);
463
            getNameService().save(basionymName);
464
            typeName.addBasionym(basionymName);
465
        }
466

    
467

    
468
        NameTypeDesignation nameTypeDesignation = NameTypeDesignation.NewInstance();
469
        nameTypeDesignation.setTypeName(typeName);
470
        getNameService().save(typeName);
471

    
472
        if(noteStr != null){
473
            nameTypeDesignation.addAnnotation(Annotation.NewInstance(noteStr, AnnotationType.EDITORIAL(), Language.UNKNOWN_LANGUAGE()));
474
        }
475
        taxonName.addNameTypeDesignation(typeName, null, null, null, null, false);
476

    
477
    }
478

    
479
    /**
480
     * Currently only parses the collector, fieldNumber and the collection date.
481
     *
482
     * @param fieldUnitStr
483
     * @param regNumber
484
     * @param state
485
     * @return null if the fieldUnitStr could not be parsed
486
     */
487
    protected FieldUnit parseFieldUnit(String fieldUnitStr, String regNumber, SimpleExcelTaxonImportState<CONFIG> state) {
488

    
489
        FieldUnit fieldUnit = null;
490

    
491
        Matcher m1 = collectorPattern.matcher(fieldUnitStr);
492
        if(m1.matches()){
493

    
494
            String collectorData = m1.group(2); // like ... (leg. Metzeltin, 30. 9. 1996)
495
            String removal = m1.group(1);
496
            if(collectorData == null){
497
                collectorData = m1.group(4); // like ... leg. Metzeltin, 30. 9. 1996
498
                removal = m1.group(3);
499
            }
500
            if(collectorData == null){
501
                collectorData = m1.group(6); // like ^leg. J. J. Halda 18.3.1997$
502
                removal = null;
503
            }
504
            if(collectorData == null){
505
                return null;
506
            }
507

    
508
            // the fieldUnitStr is parsable
509
            // remove all collectorData from the fieldUnitStr and use the rest as locality
510
            String locality = null;
511
            if(removal != null){
512
                locality = fieldUnitStr.replace(removal, "");
513
            }
514

    
515
            String collectorStr = null;
516
            String detailStr = null;
517
            Partial date = null;
518
            String fieldNumber = null;
519

    
520
            Matcher m2 = collectionDataPattern.matcher(collectorData);
521
            if(m2.matches()){
522
                collectorStr = m2.group("collector");
523
                detailStr = m2.group("detail");
524

    
525
                // Try to make sense of the detailStr
526
                if(detailStr != null){
527
                    detailStr = detailStr.trim();
528
                    // 1. try to parse as date
529
                    date = parseDate(regNumber, detailStr);
530
                    if(date == null){
531
                        // 2. try to parse as number
532
                        if(collectorsNumber.matcher(detailStr).matches()){
533
                            fieldNumber = detailStr;
534
                        }
535
                    }
536
                }
537
                if(date == null && fieldNumber == null){
538
                    // detailed parsing not possible, so need fo fallback
539
                    collectorStr = collectorData;
540
                }
541
            }
542

    
543
            if(collectorStr == null) {
544
                collectorStr = collectorData;
545
            }
546

    
547
            fieldUnit = FieldUnit.NewInstance();
548
            GatheringEvent ge = GatheringEvent.NewInstance();
549
            if(locality != null){
550
                ge.setLocality(LanguageString.NewInstance(locality, Language.UNKNOWN_LANGUAGE()));
551
            }
552

    
553
            TeamOrPersonBase agent =  state.getAgentBase(collectorStr);
554
            if(agent == null) {
555
                agent = Person.NewTitledInstance(collectorStr);
556
                getAgentService().save(agent);
557
                state.putAgentBase(collectorStr, agent);
558
            }
559
            ge.setCollector(agent);
560

    
561
            if(date != null){
562
                ge.setGatheringDate(date);
563
            }
564

    
565
            getEventBaseService().save(ge);
566
            fieldUnit.setGatheringEvent(ge);
567

    
568
            if(fieldNumber != null) {
569
                fieldUnit.setFieldNumber(fieldNumber);
570
            }
571
            getOccurrenceService().save(fieldUnit);
572

    
573
        }
574

    
575
        return fieldUnit;
576
    }
577

    
578
    protected Partial parseDate(String regNumber, String dateStr) {
579

    
580
        Partial pupDate = null;
581
        boolean parseError = false;
582

    
583
        String day = null;
584
        String month = null;
585
        String monthName = null;
586
        String year = null;
587

    
588
        for(Pattern p : datePatterns){
589
            Matcher m2 = p.matcher(dateStr);
590
            if(m2.matches()){
591
                try {
592
                    year = m2.group("year");
593
                } catch (IllegalArgumentException e){
594
                    // named capture group not found
595
                }
596
                try {
597
                    month = m2.group("month");
598
                } catch (IllegalArgumentException e){
599
                    // named capture group not found
600
                }
601

    
602
                try {
603
                    monthName = m2.group("monthName");
604
                    month = monthFromName(monthName, regNumber);
605
                    if(month == null){
606
                        parseError = true;
607
                    }
608
                } catch (IllegalArgumentException e){
609
                    // named capture group not found
610
                }
611
                try {
612
                    day = m2.group("day");
613
                } catch (IllegalArgumentException e){
614
                    // named capture group not found
615
                }
616

    
617
                if(year != null){
618
                    if (year.length() == 2) {
619
                        // it is an abbreviated year from the 19** years
620
                        year = "19" + year;
621
                    }
622
                    break;
623
                } else {
624
                    parseError = true;
625
                }
626
            }
627
        }
628
        if(year == null){
629
            parseError = true;
630
        }
631
        List<DateTimeFieldType> types = new ArrayList<>();
632
        List<Integer> values = new ArrayList<>();
633
        if(!parseError) {
634
            types.add(DateTimeFieldType.year());
635
            values.add(Integer.parseInt(year));
636
            if (month != null) {
637
                types.add(DateTimeFieldType.monthOfYear());
638
                values.add(Integer.parseInt(month));
639
            }
640
            if (day != null) {
641
                types.add(DateTimeFieldType.dayOfMonth());
642
                values.add(Integer.parseInt(day));
643
            }
644
            pupDate = new Partial(types.toArray(new DateTimeFieldType[types.size()]), ArrayUtils.toPrimitive(values.toArray(new Integer[values.size()])));
645
        }
646
        return pupDate;
647
    }
648

    
649
    private String monthFromName(String monthName, String regNumber) {
650

    
651
        Integer month = monthFromNameMap.get(monthName.toLowerCase());
652
        if(month == null){
653
            logger.warn(csvReportLine(regNumber, "Unknown month name", monthName));
654
            return null;
655
        } else {
656
            return month.toString();
657
        }
658
    }
659

    
660

    
661
    private void addSpecimenTypes(BotanicalName taxonName, FieldUnit fieldUnit, String typeStr, TypesName typeName, boolean multiple, String regNumber, SpecimenOrObservationType specimenType){
662

    
663
        if(StringUtils.isEmpty(typeStr)){
664
            return;
665
        }
666
        typeStr = typeStr.trim().replaceAll("\\.$", "");
667

    
668
        Collection collection = null;
669
        DerivedUnit specimen = null;
670

    
671
        List<DerivedUnit> specimens = new ArrayList<>();
672
        if(multiple){
673
            String[] tokens = typeStr.split("\\s?,\\s?");
674
            for (String t : tokens) {
675
                // command to  list all complex parsabel types:
676
                // csvcut -t -c RegistrationNo_Pk,Type iapt.csv | csvgrep -c Type -m "Holotype" | egrep -o 'Holotype:\s([A-Z]*\s)[^.]*?'
677
                // csvcut -t -c RegistrationNo_Pk,Type iapt.csv | csvgrep -c Type -m "Holotype" | egrep -o 'Isotype[^:]*:\s([A-Z]*\s)[^.]*?'
678

    
679
                if(!t.isEmpty()){
680
                    // trying to parse the string
681
                    specimen = parseSpecimenType(fieldUnit, typeName, collection, t, regNumber);
682
                    if(specimen != null){
683
                        specimens.add(specimen);
684
                    } else {
685
                        // parsing was not successful make simple specimen
686
                        specimens.add(makeSpecimenType(fieldUnit, t, specimenType));
687
                    }
688
                }
689
            }
690
        } else {
691
            specimen = parseSpecimenType(fieldUnit, typeName, collection, typeStr, regNumber);
692
            if(specimen != null) {
693
                specimens.add(specimen);
694
                // remember current collection
695
                collection = specimen.getCollection();
696
            } else {
697
                // parsing was not successful make simple specimen
698
                specimens.add(makeSpecimenType(fieldUnit, typeStr, SpecimenOrObservationType.PreservedSpecimen));
699
            }
700
        }
701

    
702
        for(DerivedUnit s : specimens){
703
            taxonName.addSpecimenTypeDesignation(s, typeName.status(), null, null, null, false, true);
704
       }
705
    }
706

    
707
    private DerivedUnit makeSpecimenType(FieldUnit fieldUnit, String titleCache, SpecimenOrObservationType specimenType) {
708
        DerivedUnit specimen;DerivedUnitFacade facade = DerivedUnitFacade.NewInstance(specimenType, fieldUnit);
709
        facade.setTitleCache(titleCache.trim(), true);
710
        specimen = facade.innerDerivedUnit();
711
        return specimen;
712
    }
713

    
714
    /**
715
     *
716
     * @param fieldUnit
717
     * @param typeName
718
     * @param collection
719
     * @param text
720
     * @param regNumber
721
     * @return
722
     */
723
    protected DerivedUnit parseSpecimenType(FieldUnit fieldUnit, TypesName typeName, Collection collection, String text, String regNumber) {
724

    
725
        DerivedUnit specimen = null;
726

    
727
        String collectionCode = null;
728
        String collectionTitle = null;
729
        String subCollectionStr = null;
730
        String instituteStr = null;
731
        String accessionNumber = null;
732

    
733
        boolean unusualAccessionNumber = false;
734

    
735
        text = text.trim();
736

    
737
        // 1.  For Isotypes often the accession number is noted alone if the
738
        //     preceeding entry has a collection code.
739
        if(typeName .equals(TypesName.isotype) && collection != null){
740
            Matcher m = accessionNumberOnlyPattern.matcher(text);
741
            if(m.matches()){
742
                try {
743
                    accessionNumber = m.group("accNumber");
744
                    specimen = makeSpecimenType(fieldUnit, collection, accessionNumber);
745
                } catch (IllegalArgumentException e){
746
                    // match group acc_number not found
747
                }
748
            }
749
        }
750

    
751
        //2. try it the 'normal' way
752
        if(specimen == null) {
753
            for (Pattern p : specimenTypePatterns) {
754
                Matcher m = p.matcher(text);
755
                if (m.matches()) {
756
                    // collection code or collectionTitle is mandatory
757
                    try {
758
                        collectionCode = m.group("colCode");
759
                    } catch (IllegalArgumentException e){
760
                        // match group colCode not found
761
                    }
762

    
763
                    try {
764
                        instituteStr = m.group("institute");
765
                    } catch (IllegalArgumentException e){
766
                        // match group col_name not found
767
                    }
768

    
769
                    try {
770
                        subCollectionStr = m.group("subCollection");
771
                    } catch (IllegalArgumentException e){
772
                        // match group subCollection not found
773
                    }
774
                    try {
775
                        accessionNumber = m.group("accNumber");
776

    
777
                        // try to improve the accessionNumber
778
                        if(accessionNumber!= null) {
779
                            accessionNumber = accessionNumber.trim();
780
                            Matcher m2 = accessionNumberOnlyPattern.matcher(accessionNumber);
781
                            String betterAccessionNumber = null;
782
                            if (m2.matches()) {
783
                                try {
784
                                    betterAccessionNumber = m.group("accNumber");
785
                                } catch (IllegalArgumentException e) {
786
                                    // match group acc_number not found
787
                                }
788
                            }
789
                            if (betterAccessionNumber != null) {
790
                                accessionNumber = betterAccessionNumber;
791
                            } else {
792
                                unusualAccessionNumber = true;
793
                            }
794
                        }
795

    
796
                    } catch (IllegalArgumentException e){
797
                        // match group acc_number not found
798
                    }
799

    
800
                    if(collectionCode == null && instituteStr == null){
801
                        logger.warn(csvReportLine(regNumber, "Type: neither 'collectionCode' nor 'institute' found in ", text));
802
                        continue;
803
                    }
804
                    collection = getCollection(collectionCode, instituteStr, subCollectionStr);
805
                    specimen = makeSpecimenType(fieldUnit, collection, accessionNumber);
806
                    break;
807
                }
808
            }
809
        }
810
        if(specimen == null) {
811
            logger.warn(csvReportLine(regNumber, "Type: Could not parse specimen", typeName.name().toString(), text));
812
        }
813
        if(unusualAccessionNumber){
814
            logger.warn(csvReportLine(regNumber, "Type: Unusual accession number", typeName.name().toString(), text, accessionNumber));
815
        }
816
        return specimen;
817
    }
818

    
819
    private DerivedUnit makeSpecimenType(FieldUnit fieldUnit, Collection collection, String accessionNumber) {
820

    
821
        DerivedUnitFacade facade = DerivedUnitFacade.NewInstance(SpecimenOrObservationType.PreservedSpecimen, fieldUnit);
822
        facade.setCollection(collection);
823
        if(accessionNumber != null){
824
            facade.setAccessionNumber(accessionNumber);
825
        }
826
        return facade.innerDerivedUnit();
827
    }
828

    
829
    private BotanicalName makeBotanicalName(SimpleExcelTaxonImportState<CONFIG> state, String regNumber, String titleCacheStr, String nameStr,
830
                                            String authorStr, String nomRefTitle) {
831

    
832
        BotanicalName taxonName;// cache field for the taxonName.titleCache
833
        String taxonNameTitleCache = null;
834
        Map<String, AnnotationType> nameAnnotations = new HashMap<>();
835

    
836
        // TitleCache preprocessing
837
        if(titleCacheStr.endsWith(ANNOTATION_MARKER_STRING) || (authorStr != null && authorStr.endsWith(ANNOTATION_MARKER_STRING))){
838
            nameAnnotations.put("Author abbreviation not checked.", AnnotationType.EDITORIAL());
839
            titleCacheStr = titleCacheStr.replace(ANNOTATION_MARKER_STRING, "").trim();
840
            if(authorStr != null) {
841
                authorStr = authorStr.replace(ANNOTATION_MARKER_STRING, "").trim();
842
            }
843
        }
844

    
845
        // parse the full taxon name
846
        if(!StringUtils.isEmpty(nomRefTitle)){
847
            String referenceSeparator = nomRefTitle.startsWith("in ") ? " " : ", ";
848
            String taxonFullNameStr = titleCacheStr + referenceSeparator + nomRefTitle;
849
            logger.debug(":::::" + taxonFullNameStr);
850
            taxonName = (BotanicalName) nameParser.parseReferencedName(taxonFullNameStr, NomenclaturalCode.ICNAFP, null);
851
        } else {
852
            taxonName = (BotanicalName) nameParser.parseFullName(titleCacheStr, NomenclaturalCode.ICNAFP, null);
853
        }
854

    
855
        taxonNameTitleCache = taxonName.getTitleCache().trim();
856
        if (taxonName.isProtectedTitleCache()) {
857
            logger.warn(csvReportLine(regNumber, "Name could not be parsed", titleCacheStr));
858
        } else {
859

    
860
            boolean doRestoreTitleCacheStr = false;
861

    
862
            // Check if titleCache and nameCache are plausible
863
            String titleCacheCompareStr = titleCacheStr;
864
            String nameCache = taxonName.getNameCache();
865
            String nameCompareStr = nameStr;
866
            if(taxonName.isBinomHybrid()){
867
                titleCacheCompareStr = titleCacheCompareStr.replace(" x ", " ×");
868
                nameCompareStr = nameCompareStr.replace(" x ", " ×");
869
            }
870
            if(taxonName.isMonomHybrid()){
871
                titleCacheCompareStr = titleCacheCompareStr.replaceAll("^X ", "× ");
872
                nameCompareStr = nameCompareStr.replace("^X ", "× ");
873
            }
874
            if(authorStr != null && authorStr.contains(" et ")){
875
                titleCacheCompareStr = titleCacheCompareStr.replaceAll(" et ", " & ");
876
            }
877
            if (!taxonNameTitleCache.equals(titleCacheCompareStr)) {
878
                logger.warn(csvReportLine(regNumber, "The generated titleCache differs from the imported string", taxonNameTitleCache, " != ", titleCacheStr, " ==> original titleCacheStr has been restored"));
879
                doRestoreTitleCacheStr = true;
880
            }
881
            if (!nameCache.trim().equals(nameCompareStr)) {
882
                logger.warn(csvReportLine(regNumber, "The parsed nameCache differs from field '" + NAMESTRING + "'", nameCache, " != ", nameCompareStr));
883
            }
884

    
885
            //  Author
886
            //nameParser.handleAuthors(taxonName, titleCacheStr, authorStr);
887
            //if (!titleCacheStr.equals(taxonName.getTitleCache())) {
888
            //    logger.warn(regNumber + ": titleCache has changed after setting authors, will restore original titleCacheStr");
889
            //    doRestoreTitleCacheStr = true;
890
            //}
891

    
892
            if(doRestoreTitleCacheStr){
893
                taxonName.setTitleCache(titleCacheStr, true);
894
            }
895

    
896
            // deduplicate
897
            replaceAuthorNamesAndNomRef(state, taxonName);
898
        }
899

    
900
        // Annotations
901
        if(!nameAnnotations.isEmpty()){
902
            for(String text : nameAnnotations.keySet()){
903
                taxonName.addAnnotation(Annotation.NewInstance(text, nameAnnotations.get(text), Language.DEFAULT()));
904
            }
905
        }
906

    
907
        taxonName.addSource(OriginalSourceType.Import, regNumber, null, state.getConfig().getSourceReference(), null);
908

    
909
        getNameService().save(taxonName);
910

    
911
        return taxonName;
912
    }
913

    
914
    /**
915
     * @param state
916
     * @return
917
     */
918
    private TaxonNode getClassificationRootNode(IAPTImportState state) {
919

    
920
     //   Classification classification = state.getClassification();
921
     //   if (classification == null){
922
     //       IAPTImportConfigurator config = state.getConfig();
923
     //       classification = Classification.NewInstance(state.getConfig().getClassificationName());
924
     //       classification.setUuid(config.getClassificationUuid());
925
     //       classification.setReference(config.getSecReference());
926
     //       classification = getClassificationService().find(state.getConfig().getClassificationUuid());
927
     //   }
928
        TaxonNode rootNode = state.getRootNode();
929
        if (rootNode == null){
930
            rootNode = getTaxonNodeService().find(ROOT_UUID);
931
        }
932
        if (rootNode == null){
933
            Classification classification = state.getClassification();
934
            if (classification == null){
935
                Reference sec = state.getSecReference();
936
                String classificationName = state.getConfig().getClassificationName();
937
                Language language = Language.DEFAULT();
938
                classification = Classification.NewInstance(classificationName, sec, language);
939
                state.setClassification(classification);
940
                classification.setUuid(state.getConfig().getClassificationUuid());
941
                classification.getRootNode().setUuid(ROOT_UUID);
942
                getClassificationService().save(classification);
943
            }
944
            rootNode = classification.getRootNode();
945
            state.setRootNode(rootNode);
946
        }
947
        return rootNode;
948
    }
949

    
950
    private Collection getCollection(String collectionCode, String instituteStr, String subCollectionStr){
951

    
952
        Collection superCollection = null;
953
        if(subCollectionStr != null){
954
            superCollection = getCollection(collectionCode, instituteStr, null);
955
            collectionCode = subCollectionStr;
956
            instituteStr = null;
957
        }
958

    
959
        final String key = collectionCode + "-#i:" + StringUtils.defaultString(instituteStr);
960

    
961
        Collection collection = collectionMap.get(key);
962

    
963
        if(collection == null) {
964
            collection = Collection.NewInstance();
965
            collection.setCode(collectionCode);
966
            if(instituteStr != null){
967
                collection.setInstitute(Institution.NewNamedInstance(instituteStr));
968
            }
969
            if(superCollection != null){
970
                collection.setSuperCollection(superCollection);
971
            }
972
            collectionMap.put(key, collection);
973
            if(!_testMode) {
974
                getCollectionService().save(collection);
975
            }
976
        }
977

    
978
        return collection;
979
    }
980

    
981

    
982
    /**
983
     * @param record
984
     * @param originalKey
985
     * @param doUnescapeHtmlEntities
986
     * @return
987
     */
988
    private String getValue(HashMap<String, String> record, String originalKey, boolean doUnescapeHtmlEntities) {
989
        String value = record.get(originalKey);
990

    
991
        value = fixCharacters(value);
992

    
993
        if (! StringUtils.isBlank(value)) {
994
        	if (logger.isDebugEnabled()) {
995
        	    logger.debug(originalKey + ": " + value);
996
        	}
997
        	value = CdmUtils.removeDuplicateWhitespace(value.trim()).toString();
998
            if(doUnescapeHtmlEntities){
999
                value = StringEscapeUtils.unescapeHtml(value);
1000
            }
1001
        	return value.trim();
1002
        }else{
1003
        	return null;
1004
        }
1005
    }
1006

    
1007
    /**
1008
     * Fixes broken characters.
1009
     * For details see
1010
     * http://dev.e-taxonomy.eu/redmine/issues/6035
1011
     *
1012
     * @param value
1013
     * @return
1014
     */
1015
    private String fixCharacters(String value) {
1016

    
1017
        value = StringUtils.replace(value, "s$K", "š");
1018
        value = StringUtils.replace(value, "n$K", "ň");
1019
        value = StringUtils.replace(value, "e$K", "ě");
1020
        value = StringUtils.replace(value, "r$K", "ř");
1021
        value = StringUtils.replace(value, "c$K", "č");
1022
        value = StringUtils.replace(value, "z$K", "ž");
1023
        value = StringUtils.replace(value, "S>U$K", "Š");
1024
        value = StringUtils.replace(value, "C>U$K", "Č");
1025
        value = StringUtils.replace(value, "R>U$K", "Ř");
1026
        value = StringUtils.replace(value, "Z>U$K", "Ž");
1027
        value = StringUtils.replace(value, "g$K", "ǧ");
1028
        value = StringUtils.replace(value, "s$A", "ś");
1029
        value = StringUtils.replace(value, "n$A", "ń");
1030
        value = StringUtils.replace(value, "c$A", "ć");
1031
        value = StringUtils.replace(value, "e$E", "ę");
1032
        value = StringUtils.replace(value, "o$H", "õ");
1033
        value = StringUtils.replace(value, "s$C", "ş");
1034
        value = StringUtils.replace(value, "t$C", "ț");
1035
        value = StringUtils.replace(value, "S>U$C", "Ş");
1036
        value = StringUtils.replace(value, "a$O", "å");
1037
        value = StringUtils.replace(value, "A>U$O", "Å");
1038
        value = StringUtils.replace(value, "u$O", "ů");
1039
        value = StringUtils.replace(value, "g$B", "ğ");
1040
        value = StringUtils.replace(value, "g$B", "ĕ");
1041
        value = StringUtils.replace(value, "a$B", "ă");
1042
        value = StringUtils.replace(value, "l$/", "ł");
1043
        value = StringUtils.replace(value, ">i", "ı");
1044
        value = StringUtils.replace(value, "i$U", "ï");
1045
        // Special-cases
1046
        value = StringUtils.replace(value, "&yacute", "ý");
1047
        value = StringUtils.replace(value, ">L", "Ł"); // corrected rule
1048
        value = StringUtils.replace(value, "E>U$D", "З");
1049
        value = StringUtils.replace(value, "S>U$E", "Ş");
1050
        value = StringUtils.replace(value, "s$E", "ş");
1051

    
1052
        value = StringUtils.replace(value, "c$k", "č");
1053
        value = StringUtils.replace(value, " U$K", " Š");
1054

    
1055
        value = StringUtils.replace(value, "O>U>!", "Ø");
1056
        value = StringUtils.replace(value, "o>!", "ø");
1057
        value = StringUtils.replace(value, "S$K", "Ŝ");
1058
        value = StringUtils.replace(value, ">l", "ğ");
1059

    
1060
        value = StringUtils.replace(value, "§B>i", "ł");
1061

    
1062

    
1063

    
1064
        return value;
1065
    }
1066

    
1067

    
1068
    /**
1069
	 *  Stores taxa records in DB
1070
	 */
1071
	@Override
1072
    protected void firstPass(SimpleExcelTaxonImportState<CONFIG> state) {
1073

    
1074
        String lineNumber = "L#" + state.getCurrentLine() + ": ";
1075
        logger.setLevel(Level.DEBUG);
1076
        HashMap<String, String> record = state.getOriginalRecord();
1077
        logger.debug(lineNumber + record.toString());
1078

    
1079
        Set<String> keys = record.keySet();
1080
        for (String key: keys) {
1081
            if (! expectedKeys.contains(key)){
1082
                logger.warn(lineNumber + "Unexpected Key: " + key);
1083
            }
1084
        }
1085

    
1086
        String reg_id = record.get(REGISTRATIONNO_PK);
1087

    
1088
        //higherTaxon
1089
        String higherTaxaString = record.get(HIGHERTAXON);
1090
        boolean isFossil = false;
1091
        if(higherTaxaString.startsWith("FOSSIL ")){
1092
            higherTaxaString = higherTaxaString.replace("FOSSIL ", "");
1093
            isFossil = true;
1094
        }
1095
        TaxonNode higherTaxon = getHigherTaxon(higherTaxaString, (IAPTImportState)state);
1096

    
1097
       //Taxon
1098
        Taxon taxon = makeTaxon(record, state, higherTaxon, isFossil);
1099
        if (taxon == null){
1100
            logger.warn(lineNumber + "taxon could not be created and is null");
1101
            return;
1102
        }
1103
        ((IAPTImportState)state).setCurrentTaxon(taxon);
1104

    
1105

    
1106
        logger.info("#of imported Genera: " + ((IAPTImportState) state).getGenusTaxonMap().size());
1107
		return;
1108
    }
1109

    
1110
    private TaxonNode getHigherTaxon(String higherTaxaString, IAPTImportState state) {
1111
        String[] higherTaxaNames = higherTaxaString.toLowerCase().replaceAll("[\\[\\]]", "").split(":");
1112
        TaxonNode higherTaxonNode = null;
1113

    
1114
        ITaxonTreeNode rootNode = getClassificationRootNode(state);
1115
        for (String htn :  higherTaxaNames) {
1116
            htn = StringUtils.capitalize(htn.trim());
1117
            Taxon higherTaxon = state.getHigherTaxon(htn);
1118
            if (higherTaxon != null){
1119
                higherTaxonNode = higherTaxon.getTaxonNodes().iterator().next();
1120
            }else{
1121
                BotanicalName name = makeHigherTaxonName(state, htn);
1122
                Reference sec = state.getSecReference();
1123
                higherTaxon = Taxon.NewInstance(name, sec);
1124
                getTaxonService().save(higherTaxon);
1125
                higherTaxonNode = rootNode.addChildTaxon(higherTaxon, sec, null);
1126
                state.putHigherTaxon(htn, higherTaxon);
1127
                getClassificationService().saveTreeNode(higherTaxonNode);
1128
            }
1129
            rootNode = higherTaxonNode;
1130
        }
1131
        return higherTaxonNode;
1132
    }
1133

    
1134
    private BotanicalName makeHigherTaxonName(IAPTImportState state, String name) {
1135

    
1136
        Rank rank = guessRank(name);
1137

    
1138
        BotanicalName taxonName = BotanicalName.NewInstance(rank);
1139
        taxonName.addSource(makeOriginalSource(state));
1140
        taxonName.setGenusOrUninomial(StringUtils.capitalize(name));
1141
        return taxonName;
1142
    }
1143

    
1144
    private Rank guessRank(String name) {
1145

    
1146
        // normalize
1147
        name = name.replaceAll("\\(.*\\)", "").trim();
1148

    
1149
        if(name.matches("^Plantae$|^Fungi$")){
1150
           return Rank.KINGDOM();
1151
        } else if(name.matches("^Incertae sedis$|^No group assigned$")){
1152
           return rankFamilyIncertisSedis();
1153
        } else if(name.matches(".*phyta$|.*mycota$")){
1154
           return Rank.PHYLUM();
1155
        } else if(name.matches(".*phytina$|.*mycotina$")){
1156
           return Rank.SUBPHYLUM();
1157
        } else if(name.matches("Gymnospermae$|.*ones$")){ // Monocotyledones, Dicotyledones
1158
            return rankUnrankedSupraGeneric();
1159
        } else if(name.matches(".*opsida$|.*phyceae$|.*mycetes$|.*ones$|^Musci$|^Hepaticae$")){
1160
           return Rank.CLASS();
1161
        } else if(name.matches(".*idae$|.*phycidae$|.*mycetidae$")){
1162
           return Rank.SUBCLASS();
1163
        } else if(name.matches(".*ales$")){
1164
           return Rank.ORDER();
1165
        } else if(name.matches(".*ineae$")){
1166
           return Rank.SUBORDER();
1167
        } else if(name.matches(".*aceae$")){
1168
            return Rank.FAMILY();
1169
        } else if(name.matches(".*oideae$")){
1170
           return Rank.SUBFAMILY();
1171
        } else
1172
        //    if(name.matches(".*eae$")){
1173
        //    return Rank.TRIBE();
1174
        // } else
1175
            if(name.matches(".*inae$")){
1176
           return Rank.SUBTRIBE();
1177
        } else if(name.matches(".*ae$")){
1178
           return Rank.FAMILY();
1179
        }
1180
        return Rank.UNKNOWN_RANK();
1181
    }
1182

    
1183
    private Rank rankUnrankedSupraGeneric() {
1184

    
1185
        if(rankUnrankedSupraGeneric == null){
1186
            rankUnrankedSupraGeneric = Rank.NewInstance(RankClass.Suprageneric, "Unranked supra generic", " ", " ");
1187
            getTermService().save(rankUnrankedSupraGeneric);
1188
        }
1189
        return rankUnrankedSupraGeneric;
1190
    }
1191

    
1192
    private Rank rankFamilyIncertisSedis() {
1193

    
1194
        if(familyIncertisSedis == null){
1195
            familyIncertisSedis = Rank.NewInstance(RankClass.Suprageneric, "Family incertis sedis", " ", " ");
1196
            getTermService().save(familyIncertisSedis);
1197
        }
1198
        return familyIncertisSedis;
1199
    }
1200

    
1201
    private AnnotationType annotationTypeCaveats(){
1202
        if(annotationTypeCaveats == null){
1203
            annotationTypeCaveats = AnnotationType.NewInstance("Caveats", "Caveats", "");
1204
            getTermService().save(annotationTypeCaveats);
1205
        }
1206
        return annotationTypeCaveats;
1207
    }
1208

    
1209

    
1210
    /**
1211
     * @param state
1212
     * @return
1213
     */
1214
    private IdentifiableSource makeOriginalSource(IAPTImportState state) {
1215
        return IdentifiableSource.NewDataImportInstance("line: " + state.getCurrentLine(), null, state.getConfig().getSourceReference());
1216
    }
1217

    
1218

    
1219
    private Reference makeReference(IAPTImportState state, UUID uuidRef) {
1220
        Reference ref = state.getReference(uuidRef);
1221
        if (ref == null){
1222
            ref = getReferenceService().find(uuidRef);
1223
            state.putReference(uuidRef, ref);
1224
        }
1225
        return ref;
1226
    }
1227

    
1228
    private MarkerType markerTypeFossil(){
1229
        if(this.markerTypeFossil == null){
1230
            markerTypeFossil = MarkerType.NewInstance("isFossilTaxon", "isFossil", null);
1231
            getTermService().save(this.markerTypeFossil);
1232
        }
1233
        return markerTypeFossil;
1234
    }
1235

    
1236
    private String csvReportLine(String regId, String message, String ... fields){
1237
        StringBuilder out = new StringBuilder("regID#");
1238
        out.append(regId).append(",\"").append(message).append('"');
1239

    
1240
        for(String f : fields){
1241
            out.append(",\"").append(f).append('"');
1242
        }
1243
        return out.toString();
1244
    }
1245

    
1246

    
1247
}
(1-1/4)