Project

General

Profile

Download (57.8 KB) Statistics
| Branch: | Revision:
1
/**
2
 * Copyright (C) 2007 EDIT
3
 * European Distributed Institute of Taxonomy
4
 * http://www.e-taxonomy.eu
5
 *
6
 * The contents of this file are subject to the Mozilla Public License Version 1.1
7
 * See LICENSE.TXT at the top of this package for the full license terms.
8
 */
9

    
10
package eu.etaxonomy.cdm.io.iapt;
11

    
12
import com.fasterxml.jackson.core.JsonProcessingException;
13
import com.fasterxml.jackson.databind.ObjectMapper;
14
import eu.etaxonomy.cdm.api.facade.DerivedUnitFacade;
15
import eu.etaxonomy.cdm.common.CdmUtils;
16
import eu.etaxonomy.cdm.io.mexico.SimpleExcelTaxonImport;
17
import eu.etaxonomy.cdm.io.mexico.SimpleExcelTaxonImportState;
18
import eu.etaxonomy.cdm.model.agent.Institution;
19
import eu.etaxonomy.cdm.model.agent.Person;
20
import eu.etaxonomy.cdm.model.agent.TeamOrPersonBase;
21
import eu.etaxonomy.cdm.model.common.*;
22
import eu.etaxonomy.cdm.model.name.*;
23
import eu.etaxonomy.cdm.model.occurrence.*;
24
import eu.etaxonomy.cdm.model.occurrence.Collection;
25
import eu.etaxonomy.cdm.model.reference.Reference;
26
import eu.etaxonomy.cdm.model.reference.ReferenceFactory;
27
import eu.etaxonomy.cdm.model.taxon.*;
28
import eu.etaxonomy.cdm.strategy.parser.NonViralNameParserImpl;
29
import org.apache.commons.lang.ArrayUtils;
30
import org.apache.commons.lang.StringEscapeUtils;
31
import org.apache.commons.lang.StringUtils;
32
import org.apache.log4j.Level;
33
import org.apache.log4j.Logger;
34
import org.joda.time.DateTimeFieldType;
35
import org.joda.time.Partial;
36
import org.joda.time.format.DateTimeFormat;
37
import org.joda.time.format.DateTimeFormatter;
38
import org.springframework.stereotype.Component;
39

    
40
import java.util.*;
41
import java.util.regex.Matcher;
42
import java.util.regex.Pattern;
43

    
44
/**
45
 * @author a.mueller
46
 * @created 05.01.2016
47
 */
48

    
49
@Component("iAPTExcelImport")
50
public class IAPTExcelImport<CONFIG extends IAPTImportConfigurator> extends SimpleExcelTaxonImport<CONFIG> {
51
    private static final long serialVersionUID = -747486709409732371L;
52
    private static final Logger logger = Logger.getLogger(IAPTExcelImport.class);
53
    public static final String ANNOTATION_MARKER_STRING = "[*]";
54

    
55

    
56
    private static UUID ROOT_UUID = UUID.fromString("4137fd2a-20f6-4e70-80b9-f296daf51d82");
57

    
58
    private static NonViralNameParserImpl nameParser = NonViralNameParserImpl.NewInstance();
59

    
60
    private final static String REGISTRATIONNO_PK= "RegistrationNo_Pk";
61
    private final static String HIGHERTAXON= "HigherTaxon";
62
    private final static String FULLNAME= "FullName";
63
    private final static String AUTHORSSPELLING= "AuthorsSpelling";
64
    private final static String LITSTRING= "LitString";
65
    private final static String REGISTRATION= "Registration";
66
    private final static String TYPE= "Type";
67
    private final static String CAVEATS= "Caveats";
68
    private final static String FULLBASIONYM= "FullBasionym";
69
    private final static String FULLSYNSUBST= "FullSynSubst";
70
    private final static String NOTESTXT= "NotesTxt";
71
    private final static String REGDATE= "RegDate";
72
    private final static String NAMESTRING= "NameString";
73
    private final static String BASIONYMSTRING= "BasionymString";
74
    private final static String SYNSUBSTSTR= "SynSubstStr";
75
    private final static String AUTHORSTRING= "AuthorString";
76

    
77
    private  static List<String> expectedKeys= Arrays.asList(new String[]{
78
            REGISTRATIONNO_PK, HIGHERTAXON, FULLNAME, AUTHORSSPELLING, LITSTRING, REGISTRATION, TYPE, CAVEATS, FULLBASIONYM, FULLSYNSUBST, NOTESTXT, REGDATE, NAMESTRING, BASIONYMSTRING, SYNSUBSTSTR, AUTHORSTRING});
79

    
80
    // Splits a nomenclatural reference string into the named groups 'title' (everything before the last ": " that
    // still lets the rest match), 'detail' (text without '.' or ':' up to the next '.'), 'date' and the optional
    // 'issue' given in parentheses at the end; a trailing '.' is tolerated.
    private static final Pattern nomRefTokenizeP = Pattern.compile("^(?<title>.*):\\s(?<detail>[^\\.:]+)\\.(?<date>.*?)(?:\\s\\((?<issue>[^\\)]*)\\)\\s*)?\\.?$");
81
    private static final Pattern[] datePatterns = new Pattern[]{
82
            // NOTE:
83
            // The order of the patterns is extremely important!!!
84
            //
85
            // all patterns cover the years 1700 - 1999
86
            Pattern.compile("^(?<year>1[7,8,9][0-9]{2})$"), // only year, like '1969'
87
            Pattern.compile("^(?<monthName>\\p{L}+\\.?)\\s(?<day>[0-9]{1,2})(?:st|rd|th)?\\.?,?\\s(?<year>(?:1[7,8,9])?[0-9]{2})$"), // full date like April 12, 1969 or april 12th 1999
88
            Pattern.compile("^(?<monthName>\\p{L}+\\.?),?\\s?(?<year>(?:1[7,8,9])?[0-9]{2})$"), // April 99 or April, 1999 or Apr. 12
89
            Pattern.compile("^(?<day>[0-9]{1,2})([\\.\\-/])(\\s?)(?<month>[0-1]?[0-9])\\2\\3(?<year>(?:1[7,8,9])?[0-9]{2})$"), // full date like 12.04.1969 or 12. 04. 1969 or 12/04/1969 or 12-04-1969
90
            Pattern.compile("^(?<day>[0-9]{1,2})([\\.\\-/])(?<monthName>[IVX]{1,2})\\2(?<year>(?:1[7,8,9])?[0-9]{2})$"), // full date like 12-VI-1969
91
            Pattern.compile("^(?:(?<day>[0-9]{1,2})(?:\\sde)?\\s)?(?<monthName>\\p{L}+)(?:\\sde)?\\s(?<year>(?:1[7,8,9])?[0-9]{2})$"), // full and partial date like 12 de Enero de 1999 or Enero de 1999
92
            Pattern.compile("^(?<month>[0-1]?[0-9])([\\.\\-/])(?<year>(?:1[7,8,9])?[0-9]{2})$"), // partial date like 04.1969 or 04/1969 or 04-1969
93
            Pattern.compile("^(?<year>(?:1[7,8,9])?[0-9]{2})([\\.\\-/])(?<month>[0-1]?[0-9])$"),//  partial date like 1999-04
94
            Pattern.compile("^(?<monthName>[IVX]{1,2})([\\.\\-/])(?<year>(?:1[7,8,9])?[0-9]{2})$"), // partial date like VI-1969
95
            Pattern.compile("^(?<day>[0-9]{1,2})(?:[\\./]|th|rd|st)?\\s(?<monthName>\\p{L}+\\.?),?\\s?(?<year>(?:1[7,8,9])?[0-9]{2})$"), // full date like 12. April 1969 or april 1999 or 22 Dec.1999
96
        };
97
    protected static final Pattern typeSpecimenSplitPattern =  Pattern.compile("^(?:\"*[Tt]ype: (?<fieldUnit>.*?))(?:[Hh]olotype:(?<holotype>.*?)\\.?)?(?:[Ii]sotype[^:]*:(?<isotype>.*)\\.?)?\\.?$");
98

    
99
    private static final Pattern typeNameBasionymPattern =  Pattern.compile("\\([Bb]asionym\\s?\\:\\s?(?<basionymName>[^\\)]*).*$");
100
    private static final Pattern typeNameNotePattern =  Pattern.compile("\\[([^\\[]*)"); // matches the inner of '[...]'
101
    // Splits the special 'not to be indicated ...' type strings into the named groups 'note', 'agent' and 'name'.
    // The last group was formerly written as (<name>.*), i.e. an unnamed group matching the literal text "<name>",
    // so that reading the group by its name would have failed with an IllegalArgumentException.
    // NOTE(review): the 'agent' group is empty, so the pattern still requires ':' to be directly followed by ';'
    // - confirm the intended content of that group.
    private static final Pattern typeNameSpecialSplitPattern =  Pattern.compile("(?<note>.*\\;.*?)\\:(?<agent>)\\;(?<name>.*)");
102

    
103
    protected static final Pattern collectorPattern =  Pattern.compile(".*?(?<fullStr1>\\([Ll]eg\\.\\s+(?<data1>[^\\)]*)\\)).*$|.*?(?<fullStr2>\\s[Ll]eg\\.\\:?\\s+(?<data2>.*?)\\.?)$|^(?<fullStr3>[Ll]eg\\.\\:?\\s+(?<data3>.*?)\\.?)");
104
    private static final Pattern collectionDataPattern =  Pattern.compile("^(?<collector>[^,]*),\\s?(?<detail>.*?)\\.?$");
105
    private static final Pattern collectorsNumber =  Pattern.compile("^([nN]o\\.\\s.*)$");
106

    
107
    // AccessionNumbers: , #.*, n°:?, 96/3293, No..*, -?\w{1,3}-[0-9\-/]*
108
    private static final Pattern accessionNumberOnlyPattern = Pattern.compile("^(?<accNumber>(?:n°\\:?\\s?|#|No\\.?\\s?)?[\\d\\w\\-/]*)$");
109

    
110
    private static final Pattern[] specimenTypePatterns = new Pattern[]{
111
            Pattern.compile("^(?<colCode>[A-Z]+|CPC Micropaleontology Lab\\.?)\\s+(?:\\((?<institute>.*[^\\)])\\))(?<accNumber>.*)?$"), // like: GAUF (Gansu Agricultural University) No. 1207-1222
112
            Pattern.compile("^(?<colCode>[A-Z]+|CPC Micropaleontology Lab\\.?)\\s+(?:Coll\\.\\s(?<subCollection>[^\\.,;]*)(.))(?<accNumber>.*)?$"), // like KASSEL Coll. Krasske, Praep. DII 78
113
            Pattern.compile("^(?:in\\s)?(?<institute>[Cc]oll\\.\\s.*?)(?:\\s+(?<accNumber>(Praep\\.|slide|No\\.|Inv\\. Nr\\.|Nr\\.).*))?$"), // like Coll. Lange-Bertalot, Bot. Inst., Univ. Frankfurt/Main, Germany Praep. Neukaledonien OTL 62
114
            Pattern.compile("^(?<institute>Inst\\.\\s.*?)\\s+(?<accNumber>N\\s.*)?$"), // like Inst. Geological Sciences, Acad. Sci. Belarus, Minsk N 212 A
115
            Pattern.compile("^(?<colCode>[A-Z]+)(?:\\s+(?<accNumber>.*))?$"), // identifies the Collection code and takes the rest as accessionNumber if any
116
    };
117

    
118

    
119
    // Tokenizes the registration string into the named groups 'regdate', 'regid', 'office' and the optional 'formNo'.
    // The 'formNo' group formerly was written as d+ (the letter 'd') instead of \d+ and therefore never matched a number.
    private static final Pattern registrationPattern = Pattern.compile("^Registration date\\:\\s(?<regdate>\\d\\d\\.\\d\\d\\.\\d\\d); no\\.\\:\\s(?<regid>\\d+);\\soffice\\:\\s(?<office>.*?)\\.(?:\\s\\[Form no\\.\\:\\s(?<formNo>\\d+)\\])?$"); // Registration date: 29.06.98; no.: 2922; office: Berlin.
120

    
121
    private static Map<String, Integer> monthFromNameMap = new HashMap<>();
122

    
123
    static {
124
        String[] ck = new String[]{"leden", "únor", "březen", "duben", "květen", "červen", "červenec ", "srpen", "září", "říjen", "listopad", "prosinec"};
125
        String[] fr = new String[]{"janvier", "février", "mars", "avril", "mai", "juin", "juillet", "août", "septembre", "octobre", "novembre", "décembre"};
126
        String[] de = new String[]{"januar", "februar", "märz", "april", "mai", "juni", "juli", "august", "september", "oktober", "november", "dezember"};
127
        String[] en = new String[]{"january", "february", "march", "april", "may", "june", "july", "august", "september", "october", "november", "december"};
128
        String[] it = new String[]{"gennaio", "febbraio", "marzo", "aprile", "maggio", "giugno", "luglio", "agosto", "settembre", "ottobre", "novembre", "dicembre"};
129
        String[] sp = new String[]{"enero", "febrero", "marzo", "abril", "mayo", "junio", "julio", "agosto", "septiembre", "octubre", "noviembre", "diciembre"};
130
        String[] de_abbrev = new String[]{"jan.", "feb.", "märz", "apr.", "mai", "jun.", "jul.", "aug.", "sept.", "okt.", "nov.", "dez."};
131
        String[] en_abbrev = new String[]{"jan.", "feb.", "mar.", "apr.", "may", "jun.", "jul.", "aug.", "sep.", "oct.", "nov.", "dec."};
132
        String[] port = new String[]{"Janeiro", "Fevereiro", "Março", "Abril", "Maio", "Junho", "Julho", "Agosto", "Setembro", "Outubro", "Novembro", "Dezembro"};
133
        String[] rom_num = new String[]{"i", "ii", "iii", "iv", "v", "vi", "vii", "viii", "ix", "x", "xi", "xii"};
134

    
135
        String[][] perLang =  new String[][]{ck, de, fr, en, it, sp, port, de_abbrev, en_abbrev, rom_num};
136

    
137
        for (String[] months: perLang) {
138
            for(int m = 1; m < 13; m++){
139
                monthFromNameMap.put(months[m - 1].toLowerCase(), m);
140
            }
141
        }
142

    
143
        // special cases
144
        monthFromNameMap.put("mar", 3);
145
        monthFromNameMap.put("dec", 12);
146
        monthFromNameMap.put("februari", 2);
147
        monthFromNameMap.put("març", 3);
148
    }
149

    
150

    
151
    DateTimeFormatter formatterYear = DateTimeFormat.forPattern("yyyy");
152

    
153
    private Map<String, Collection> collectionMap = new HashMap<>();
154

    
155
    private ExtensionType extensionTypeIAPTRegData = null;
156

    
157
    private Set<String> nameSet = new HashSet<>();
158
    private DefinedTermBase duplicateRegistration = null;
159

    
160
    enum TypesName {
161
        fieldUnit, holotype, isotype;
162

    
163
        public SpecimenTypeDesignationStatus status(){
164
            switch (this) {
165
                case holotype:
166
                    return SpecimenTypeDesignationStatus.HOLOTYPE();
167
                case isotype:
168
                    return SpecimenTypeDesignationStatus.ISOTYPE();
169
                default:
170
                    return null;
171
            }
172
        }
173
    }
174

    
175
    private MarkerType markerTypeFossil = null;
176
    private Rank rankUnrankedSupraGeneric = null;
177
    private Rank familyIncertisSedis = null;
178
    private AnnotationType annotationTypeCaveats = null;
179

    
180
    private Reference bookVariedadesTradicionales = null;
181

    
182
    /**
183
     * HACK for unit simple testing
184
     */
185
    boolean _testMode = System.getProperty("TEST_MODE") != null;
186

    
187
    private Taxon makeTaxon(HashMap<String, String> record, SimpleExcelTaxonImportState<CONFIG> state,
188
                            TaxonNode higherTaxonNode, boolean isFossil) {
189

    
190
        String regNumber = getValue(record, REGISTRATIONNO_PK, false);
191
        String regStr = getValue(record, REGISTRATION, true);
192
        String titleCacheStr = getValue(record, FULLNAME, true);
193
        String nameStr = getValue(record, NAMESTRING, true);
194
        String authorStr = getValue(record, AUTHORSTRING, true);
195
        String nomRefStr = getValue(record, LITSTRING, true);
196
        String authorsSpelling = getValue(record, AUTHORSSPELLING, true);
197
        String notesTxt = getValue(record, NOTESTXT, true);
198
        String caveats = getValue(record, CAVEATS, true);
199
        String fullSynSubstStr = getValue(record, FULLSYNSUBST, true);
200
        String fullBasionymStr = getValue(record, FULLBASIONYM, true);
201
        String basionymNameStr = getValue(record, FULLBASIONYM, true);
202
        String synSubstStr = getValue(record, SYNSUBSTSTR, true);
203
        String typeStr = getValue(record, TYPE, true);
204

    
205

    
206
        String nomRefTitle = null;
207
        String nomRefDetail;
208
        String nomRefPupDate = null;
209
        String nomRefIssue = null;
210
        Partial pupDate = null;
211

    
212
        boolean restoreOriginalReference = false;
213
        boolean nameIsValid = true;
214

    
215
        // preprocess nomRef: separate citation, reference detail, publishing date
216
        if(!StringUtils.isEmpty(nomRefStr)){
217
            nomRefStr = nomRefStr.trim();
218

    
219
            // handle the special case which is hard to parse:
220
            //
221
            // Las variedades tradicionales de frutales de la Cuenca del Río Segura. Catálogo Etnobotánico (1): Frutos secos, oleaginosos, frutales de hueso, almendros y frutales de pepita: 154. 1997.
222
            if(nomRefStr.startsWith("Las variedades tradicionales de frutales ")){
223

    
224
                if(bookVariedadesTradicionales == null){
225
                    bookVariedadesTradicionales = ReferenceFactory.newBook();
226
                    bookVariedadesTradicionales.setTitle("Las variedades tradicionales de frutales de la Cuenca del Río Segura. Catálogo Etnobotánico (1): Frutos secos, oleaginosos, frutales de hueso, almendros y frutales de pepita");
227
                    bookVariedadesTradicionales.setDatePublished(TimePeriod.NewInstance(1997));
228
                    getReferenceService().save(bookVariedadesTradicionales);
229
                }
230
                nomRefStr = nomRefStr.replaceAll("^.*?\\:.*?\\:", "Las variedades tradicionales:");
231
                restoreOriginalReference = true;
232
            }
233

    
234
            Matcher m = nomRefTokenizeP.matcher(nomRefStr);
235
            if(m.matches()){
236
                nomRefTitle = m.group("title");
237
                nomRefDetail = m.group("detail");
238
                nomRefPupDate = m.group("date").trim();
239
                nomRefIssue = m.group("issue");
240

    
241
                pupDate = parseDate(regNumber, nomRefPupDate);
242
                if (pupDate != null) {
243
                    nomRefTitle = nomRefTitle + ": " + nomRefDetail + ". " + pupDate.toString(formatterYear) + ".";
244
                } else {
245
                    logger.warn(csvReportLine(regNumber, "Pub date", nomRefPupDate, "in", nomRefStr, "not parsable"));
246
                }
247
            } else {
248
                nomRefTitle = nomRefStr;
249
            }
250
        }
251

    
252
        BotanicalName taxonName = makeBotanicalName(state, regNumber, titleCacheStr, nameStr, authorStr, nomRefTitle);
253

    
254
        // always add the original strings of parsed data as annotation
255
        taxonName.addAnnotation(Annotation.NewInstance("imported and parsed data strings:" +
256
                        "\n -  '" + LITSTRING + "': "+ nomRefStr +
257
                        "\n -  '" + TYPE + "': " + typeStr +
258
                        "\n -  '" + REGISTRATION  + "': " + regStr
259
                , AnnotationType.TECHNICAL(), Language.DEFAULT()));
260

    
261
        if(restoreOriginalReference){
262
            taxonName.setNomenclaturalReference(bookVariedadesTradicionales);
263
        }
264

    
265
        if(taxonName.getNomenclaturalReference() != null){
266
            if(pupDate != null) {
267
                taxonName.getNomenclaturalReference().setDatePublished(TimePeriod.NewInstance(pupDate));
268
            }
269
            if(nomRefIssue != null) {
270
                ((Reference)taxonName.getNomenclaturalReference()).setVolume(nomRefIssue);
271
            }
272
        }
273

    
274

    
275
        if(!StringUtils.isEmpty(notesTxt)){
276
            notesTxt = notesTxt.replace("Notes: ", "").trim();
277
            taxonName.addAnnotation(Annotation.NewInstance(notesTxt, AnnotationType.EDITORIAL(), Language.DEFAULT()));
278
            nameIsValid = false;
279

    
280
        }
281
        if(!StringUtils.isEmpty(caveats)){
282
            caveats = caveats.replace("Caveats: ", "").trim();
283
            taxonName.addAnnotation(Annotation.NewInstance(caveats, annotationTypeCaveats(), Language.DEFAULT()));
284
            nameIsValid = false;
285
        }
286

    
287
        if(nameIsValid){
288
            // Status is always considered valid if no notes and cavets are set
289
            taxonName.addStatus(NomenclaturalStatus.NewInstance(NomenclaturalStatusType.VALID()));
290
        }
291

    
292
        getNameService().save(taxonName);
293

    
294
        // Namerelations
295
        if(!StringUtils.isEmpty(authorsSpelling)){
296
            authorsSpelling = authorsSpelling.replaceFirst("Author's spelling:", "").replaceAll("\"", "").trim();
297

    
298
            String[] authorSpellingTokens = StringUtils.split(authorsSpelling, " ");
299
            String[] nameStrTokens = StringUtils.split(nameStr, " ");
300

    
301
            ArrayUtils.reverse(authorSpellingTokens);
302
            ArrayUtils.reverse(nameStrTokens);
303

    
304
            for (int i = 0; i < nameStrTokens.length; i++){
305
                if(i < authorSpellingTokens.length){
306
                    nameStrTokens[i] = authorSpellingTokens[i];
307
                }
308
            }
309
            ArrayUtils.reverse(nameStrTokens);
310

    
311
            String misspelledNameStr = StringUtils.join (nameStrTokens, ' ');
312
            // build the fullnameString of the misspelled name
313
            misspelledNameStr = taxonName.getTitleCache().replace(nameStr, misspelledNameStr);
314

    
315
            TaxonNameBase misspelledName = (BotanicalName) nameParser.parseReferencedName(misspelledNameStr, NomenclaturalCode.ICNAFP, null);
316
            misspelledName.addRelationshipToName(taxonName, NameRelationshipType.MISSPELLING(), null);
317
            getNameService().save(misspelledName);
318
        }
319

    
320
        // Replaced Synonyms
321
        if(!StringUtils.isEmpty(fullSynSubstStr)){
322
            fullSynSubstStr = fullSynSubstStr.replace("Syn. subst.: ", "");
323
            BotanicalName replacedSynonymName = makeBotanicalName(state, regNumber, fullSynSubstStr, synSubstStr, null, null);
324
            replacedSynonymName.addReplacedSynonym(taxonName, null, null, null);
325
            getNameService().save(replacedSynonymName);
326
        }
327

    
328
        Reference sec = state.getConfig().getSecReference();
329
        Taxon taxon = Taxon.NewInstance(taxonName, sec);
330

    
331
        // Basionym
332
        if(fullBasionymStr != null){
333
            fullBasionymStr = fullBasionymStr.replaceAll("^\\w*:\\s", ""); // Strip off the leading 'Basionym: "
334
            basionymNameStr = basionymNameStr.replaceAll("^\\w*:\\s", ""); // Strip off the leading 'Basionym: "
335
            BotanicalName basionym = makeBotanicalName(state, regNumber, fullBasionymStr, basionymNameStr, null, null);
336
            getNameService().save(basionym);
337
            taxonName.addBasionym(basionym);
338

    
339
            Synonym syn = Synonym.NewInstance(basionym, sec);
340
            taxon.addSynonym(syn, SynonymRelationshipType.HOMOTYPIC_SYNONYM_OF());
341
            getTaxonService().save(syn);
342
        }
343

    
344
        // Markers
345
        if(isFossil){
346
            taxon.addMarker(Marker.NewInstance(markerTypeFossil(), true));
347
        }
348
        if(!nameSet.add(titleCacheStr)){
349
            taxonName.addMarker(Marker.NewInstance(markerDuplicateRegistration(), true));
350
            logger.warn(csvReportLine(regNumber, "Duplicate registration of", titleCacheStr));
351
        }
352

    
353

    
354
        // Types
355
        if(!StringUtils.isEmpty(typeStr)){
356

    
357
            if(taxonName.getRank().isSpecies() || taxonName.getRank().isLower(Rank.SPECIES())) {
358
                makeSpecimenTypeData(typeStr, taxonName, regNumber, state, false);
359
            } else {
360
                makeNameTypeData(typeStr, taxonName, regNumber, state);
361
            }
362
        }
363

    
364
        getTaxonService().save(taxon);
365

    
366
        if(taxonName.getRank().equals(Rank.SPECIES()) || taxonName.getRank().isLower(Rank.SPECIES())){
367
            // try to find the genus, it should have been imported already, Genera are coming first in the import file
368
            Taxon genus = ((IAPTImportState)state).getGenusTaxonMap().get(taxonName.getGenusOrUninomial());
369
            if(genus != null){
370
                higherTaxonNode = genus.getTaxonNodes().iterator().next();
371
            } else {
372
                logger.info(csvReportLine(regNumber, "Parent genus not found for", nameStr));
373
            }
374
        }
375

    
376
        if(higherTaxonNode != null){
377
            higherTaxonNode.addChildTaxon(taxon, null, null);
378
            getTaxonNodeService().save(higherTaxonNode);
379
        }
380

    
381
        if(taxonName.getRank().isGenus()){
382
            ((IAPTImportState)state).getGenusTaxonMap().put(taxonName.getGenusOrUninomial(), taxon);
383
        }
384

    
385
        return taxon;
386
    }
387

    
388
    private void makeSpecimenTypeData(String typeStr, BotanicalName taxonName, String regNumber, SimpleExcelTaxonImportState<CONFIG> state, boolean isFossil) {
389

    
390
        Matcher m = typeSpecimenSplitPattern.matcher(typeStr);
391

    
392
        if(m.matches()){
393
            String fieldUnitStr = m.group(TypesName.fieldUnit.name());
394
            // boolean isFieldUnit = typeStr.matches(".*([°']|\\d+\\s?m\\s|\\d+\\s?km\\s).*"); // check for location or unit m, km // makes no sense!!!!
395
            FieldUnit fieldUnit = parseFieldUnit(fieldUnitStr, regNumber, state);
396
            if(fieldUnit == null) {
397
                // create a field unit with only a titleCache using the fieldUnitStr substring
398
                logger.warn(csvReportLine(regNumber, "Type: fieldUnitStr can not be parsed", fieldUnitStr));
399
                fieldUnit = FieldUnit.NewInstance();
400
                fieldUnit.setTitleCache(fieldUnitStr, true);
401
                getOccurrenceService().save(fieldUnit);
402
            }
403
            getOccurrenceService().save(fieldUnit);
404

    
405
            SpecimenOrObservationType specimenType;
406
            if(isFossil){
407
                specimenType = SpecimenOrObservationType.Fossil;
408
            } else {
409
                specimenType = SpecimenOrObservationType.PreservedSpecimen;
410
            }
411

    
412
            // all others ..
413
            addSpecimenTypes(taxonName, fieldUnit, m.group(TypesName.holotype.name()), TypesName.holotype, false, regNumber, specimenType);
414
            addSpecimenTypes(taxonName, fieldUnit, m.group(TypesName.isotype.name()), TypesName.isotype, true, regNumber, specimenType);
415

    
416
        } else {
417
            // create a field unit with only a titleCache using the full typeStr
418
            FieldUnit fieldUnit = FieldUnit.NewInstance();
419
            fieldUnit.setTitleCache(typeStr, true);
420
            getOccurrenceService().save(fieldUnit);
421
            logger.warn(csvReportLine(regNumber, "Type: field 'Type' can not be parsed", typeStr));
422
        }
423
        getNameService().save(taxonName);
424
    }
425

    
426
    private void makeNameTypeData(String typeStr, BotanicalName taxonName, String regNumber, SimpleExcelTaxonImportState<CONFIG> state) {
427

    
428
        String nameStr = typeStr.replaceAll("^Type\\s?\\:\\s?", "");
429
        if(nameStr.isEmpty()) {
430
            return;
431
        }
432

    
433
        String basionymNameStr = null;
434
        String noteStr = null;
435
        String agentStr = null;
436

    
437
        Matcher m;
438

    
439
        if(typeStr.startsWith("not to be indicated")){
440
            // Special case:
441
            // Type: not to be indicated (Art. H.9.1. Tokyo Code); stated parent genera: Hechtia Klotzsch; Deuterocohnia Mez
442
            // FIXME
443
            m = typeNameSpecialSplitPattern.matcher(nameStr);
444
            if(m.matches()){
445
                nameStr = m.group("name");
446
                noteStr = m.group("note");
447
                agentStr = m.group("agent");
448
                // TODO better import of agent?
449
                if(agentStr != null){
450
                    noteStr = noteStr + ": " + agentStr;
451
                }
452
            }
453
        } else {
454
            // Generic case
455
            m = typeNameBasionymPattern.matcher(nameStr);
456
            if (m.find()) {
457
                basionymNameStr = m.group("basionymName");
458
                if (basionymNameStr != null) {
459
                    nameStr = nameStr.replace(m.group(0), "");
460
                }
461
            }
462

    
463
            m = typeNameNotePattern.matcher(nameStr);
464
            if (m.find()) {
465
                noteStr = m.group(1);
466
                if (noteStr != null) {
467
                    nameStr = nameStr.replace(m.group(0), "");
468
                }
469
            }
470
        }
471

    
472
        BotanicalName typeName = (BotanicalName) nameParser.parseFullName(nameStr, NomenclaturalCode.ICNAFP, null);
473

    
474
        if(typeName.isProtectedTitleCache() || typeName.getNomenclaturalReference() != null && typeName.getNomenclaturalReference().isProtectedTitleCache()) {
475
            logger.warn(csvReportLine(regNumber, "NameType not parsable", typeStr, nameStr));
476
        }
477

    
478
        if(basionymNameStr != null){
479
            BotanicalName basionymName = (BotanicalName) nameParser.parseFullName(nameStr, NomenclaturalCode.ICNAFP, null);
480
            getNameService().save(basionymName);
481
            typeName.addBasionym(basionymName);
482
        }
483

    
484

    
485
        NameTypeDesignation nameTypeDesignation = NameTypeDesignation.NewInstance();
486
        nameTypeDesignation.setTypeName(typeName);
487
        getNameService().save(typeName);
488

    
489
        if(noteStr != null){
490
            nameTypeDesignation.addAnnotation(Annotation.NewInstance(noteStr, AnnotationType.EDITORIAL(), Language.UNKNOWN_LANGUAGE()));
491
        }
492
        taxonName.addNameTypeDesignation(typeName, null, null, null, null, false);
493

    
494
    }
495

    
496
    /**
497
     * Currently only parses the collector, fieldNumber and the collection date.
498
     *
499
     * @param fieldUnitStr
500
     * @param regNumber
501
     * @param state
502
     * @return null if the fieldUnitStr could not be parsed
503
     */
504
    protected FieldUnit parseFieldUnit(String fieldUnitStr, String regNumber, SimpleExcelTaxonImportState<CONFIG> state) {
505

    
506
        FieldUnit fieldUnit = null;
507

    
508
        Matcher m1 = collectorPattern.matcher(fieldUnitStr);
509
        if(m1.matches()){
510

    
511
            String collectorData = m1.group(2); // like ... (leg. Metzeltin, 30. 9. 1996)
512
            String removal = m1.group(1);
513
            if(collectorData == null){
514
                collectorData = m1.group(4); // like ... leg. Metzeltin, 30. 9. 1996
515
                removal = m1.group(3);
516
            }
517
            if(collectorData == null){
518
                collectorData = m1.group(6); // like ^leg. J. J. Halda 18.3.1997$
519
                removal = null;
520
            }
521
            if(collectorData == null){
522
                return null;
523
            }
524

    
525
            // the fieldUnitStr is parsable
526
            // remove all collectorData from the fieldUnitStr and use the rest as locality
527
            String locality = null;
528
            if(removal != null){
529
                locality = fieldUnitStr.replace(removal, "");
530
            }
531

    
532
            String collectorStr = null;
533
            String detailStr = null;
534
            Partial date = null;
535
            String fieldNumber = null;
536

    
537
            Matcher m2 = collectionDataPattern.matcher(collectorData);
538
            if(m2.matches()){
539
                collectorStr = m2.group("collector");
540
                detailStr = m2.group("detail");
541

    
542
                // Try to make sense of the detailStr
543
                if(detailStr != null){
544
                    detailStr = detailStr.trim();
545
                    // 1. try to parse as date
546
                    date = parseDate(regNumber, detailStr);
547
                    if(date == null){
548
                        // 2. try to parse as number
549
                        if(collectorsNumber.matcher(detailStr).matches()){
550
                            fieldNumber = detailStr;
551
                        }
552
                    }
553
                }
554
                if(date == null && fieldNumber == null){
555
                    // detailed parsing not possible, so need fo fallback
556
                    collectorStr = collectorData;
557
                }
558
            }
559

    
560
            if(collectorStr == null) {
561
                collectorStr = collectorData;
562
            }
563

    
564
            fieldUnit = FieldUnit.NewInstance();
565
            GatheringEvent ge = GatheringEvent.NewInstance();
566
            if(locality != null){
567
                ge.setLocality(LanguageString.NewInstance(locality, Language.UNKNOWN_LANGUAGE()));
568
            }
569

    
570
            TeamOrPersonBase agent =  state.getAgentBase(collectorStr);
571
            if(agent == null) {
572
                agent = Person.NewTitledInstance(collectorStr);
573
                getAgentService().save(agent);
574
                state.putAgentBase(collectorStr, agent);
575
            }
576
            ge.setCollector(agent);
577

    
578
            if(date != null){
579
                ge.setGatheringDate(date);
580
            }
581

    
582
            getEventBaseService().save(ge);
583
            fieldUnit.setGatheringEvent(ge);
584

    
585
            if(fieldNumber != null) {
586
                fieldUnit.setFieldNumber(fieldNumber);
587
            }
588
            getOccurrenceService().save(fieldUnit);
589

    
590
        }
591

    
592
        return fieldUnit;
593
    }
594

    
595
    protected Partial parseDate(String regNumber, String dateStr) {
596

    
597
        Partial pupDate = null;
598
        boolean parseError = false;
599

    
600
        String day = null;
601
        String month = null;
602
        String monthName = null;
603
        String year = null;
604

    
605
        for(Pattern p : datePatterns){
606
            Matcher m2 = p.matcher(dateStr);
607
            if(m2.matches()){
608
                try {
609
                    year = m2.group("year");
610
                } catch (IllegalArgumentException e){
611
                    // named capture group not found
612
                }
613
                try {
614
                    month = m2.group("month");
615
                } catch (IllegalArgumentException e){
616
                    // named capture group not found
617
                }
618

    
619
                try {
620
                    monthName = m2.group("monthName");
621
                    month = monthFromName(monthName, regNumber);
622
                    if(month == null){
623
                        parseError = true;
624
                    }
625
                } catch (IllegalArgumentException e){
626
                    // named capture group not found
627
                }
628
                try {
629
                    day = m2.group("day");
630
                } catch (IllegalArgumentException e){
631
                    // named capture group not found
632
                }
633

    
634
                if(year != null){
635
                    if (year.length() == 2) {
636
                        // it is an abbreviated year from the 19** years
637
                        year = "19" + year;
638
                    }
639
                    break;
640
                } else {
641
                    parseError = true;
642
                }
643
            }
644
        }
645
        if(year == null){
646
            parseError = true;
647
        }
648
        List<DateTimeFieldType> types = new ArrayList<>();
649
        List<Integer> values = new ArrayList<>();
650
        if(!parseError) {
651
            types.add(DateTimeFieldType.year());
652
            values.add(Integer.parseInt(year));
653
            if (month != null) {
654
                types.add(DateTimeFieldType.monthOfYear());
655
                values.add(Integer.parseInt(month));
656
            }
657
            if (day != null) {
658
                types.add(DateTimeFieldType.dayOfMonth());
659
                values.add(Integer.parseInt(day));
660
            }
661
            pupDate = new Partial(types.toArray(new DateTimeFieldType[types.size()]), ArrayUtils.toPrimitive(values.toArray(new Integer[values.size()])));
662
        }
663
        return pupDate;
664
    }
665

    
666
    private String monthFromName(String monthName, String regNumber) {
667

    
668
        Integer month = monthFromNameMap.get(monthName.toLowerCase());
669
        if(month == null){
670
            logger.warn(csvReportLine(regNumber, "Unknown month name", monthName));
671
            return null;
672
        } else {
673
            return month.toString();
674
        }
675
    }
676

    
677

    
678
    private void addSpecimenTypes(BotanicalName taxonName, FieldUnit fieldUnit, String typeStr, TypesName typeName, boolean multiple, String regNumber, SpecimenOrObservationType specimenType){
679

    
680
        if(StringUtils.isEmpty(typeStr)){
681
            return;
682
        }
683
        typeStr = typeStr.trim().replaceAll("\\.$", "");
684

    
685
        Collection collection = null;
686
        DerivedUnit specimen = null;
687

    
688
        List<DerivedUnit> specimens = new ArrayList<>();
689
        if(multiple){
690
            String[] tokens = typeStr.split("\\s?,\\s?");
691
            for (String t : tokens) {
692
                // command to  list all complex parsabel types:
693
                // csvcut -t -c RegistrationNo_Pk,Type iapt.csv | csvgrep -c Type -m "Holotype" | egrep -o 'Holotype:\s([A-Z]*\s)[^.]*?'
694
                // csvcut -t -c RegistrationNo_Pk,Type iapt.csv | csvgrep -c Type -m "Holotype" | egrep -o 'Isotype[^:]*:\s([A-Z]*\s)[^.]*?'
695

    
696
                if(!t.isEmpty()){
697
                    // trying to parse the string
698
                    specimen = parseSpecimenType(fieldUnit, typeName, collection, t, regNumber);
699
                    if(specimen != null){
700
                        specimens.add(specimen);
701
                    } else {
702
                        // parsing was not successful make simple specimen
703
                        specimens.add(makeSpecimenType(fieldUnit, t, specimenType));
704
                    }
705
                }
706
            }
707
        } else {
708
            specimen = parseSpecimenType(fieldUnit, typeName, collection, typeStr, regNumber);
709
            if(specimen != null) {
710
                specimens.add(specimen);
711
                // remember current collection
712
                collection = specimen.getCollection();
713
            } else {
714
                // parsing was not successful make simple specimen
715
                specimens.add(makeSpecimenType(fieldUnit, typeStr, SpecimenOrObservationType.PreservedSpecimen));
716
            }
717
        }
718

    
719
        for(DerivedUnit s : specimens){
720
            taxonName.addSpecimenTypeDesignation(s, typeName.status(), null, null, null, false, true);
721
       }
722
    }
723

    
724
    private DerivedUnit makeSpecimenType(FieldUnit fieldUnit, String titleCache, SpecimenOrObservationType specimenType) {
725
        DerivedUnit specimen;DerivedUnitFacade facade = DerivedUnitFacade.NewInstance(specimenType, fieldUnit);
726
        facade.setTitleCache(titleCache.trim(), true);
727
        specimen = facade.innerDerivedUnit();
728
        return specimen;
729
    }
730

    
731
    /**
732
     *
733
     * @param fieldUnit
734
     * @param typeName
735
     * @param collection
736
     * @param text
737
     * @param regNumber
738
     * @return
739
     */
740
    protected DerivedUnit parseSpecimenType(FieldUnit fieldUnit, TypesName typeName, Collection collection, String text, String regNumber) {
741

    
742
        DerivedUnit specimen = null;
743

    
744
        String collectionCode = null;
745
        String collectionTitle = null;
746
        String subCollectionStr = null;
747
        String instituteStr = null;
748
        String accessionNumber = null;
749

    
750
        boolean unusualAccessionNumber = false;
751

    
752
        text = text.trim();
753

    
754
        // 1.  For Isotypes often the accession number is noted alone if the
755
        //     preceeding entry has a collection code.
756
        if(typeName .equals(TypesName.isotype) && collection != null){
757
            Matcher m = accessionNumberOnlyPattern.matcher(text);
758
            if(m.matches()){
759
                try {
760
                    accessionNumber = m.group("accNumber");
761
                    specimen = makeSpecimenType(fieldUnit, collection, accessionNumber);
762
                } catch (IllegalArgumentException e){
763
                    // match group acc_number not found
764
                }
765
            }
766
        }
767

    
768
        //2. try it the 'normal' way
769
        if(specimen == null) {
770
            for (Pattern p : specimenTypePatterns) {
771
                Matcher m = p.matcher(text);
772
                if (m.matches()) {
773
                    // collection code or collectionTitle is mandatory
774
                    try {
775
                        collectionCode = m.group("colCode");
776
                    } catch (IllegalArgumentException e){
777
                        // match group colCode not found
778
                    }
779

    
780
                    try {
781
                        instituteStr = m.group("institute");
782
                    } catch (IllegalArgumentException e){
783
                        // match group col_name not found
784
                    }
785

    
786
                    try {
787
                        subCollectionStr = m.group("subCollection");
788
                    } catch (IllegalArgumentException e){
789
                        // match group subCollection not found
790
                    }
791
                    try {
792
                        accessionNumber = m.group("accNumber");
793

    
794
                        // try to improve the accessionNumber
795
                        if(accessionNumber!= null) {
796
                            accessionNumber = accessionNumber.trim();
797
                            Matcher m2 = accessionNumberOnlyPattern.matcher(accessionNumber);
798
                            String betterAccessionNumber = null;
799
                            if (m2.matches()) {
800
                                try {
801
                                    betterAccessionNumber = m.group("accNumber");
802
                                } catch (IllegalArgumentException e) {
803
                                    // match group acc_number not found
804
                                }
805
                            }
806
                            if (betterAccessionNumber != null) {
807
                                accessionNumber = betterAccessionNumber;
808
                            } else {
809
                                unusualAccessionNumber = true;
810
                            }
811
                        }
812

    
813
                    } catch (IllegalArgumentException e){
814
                        // match group acc_number not found
815
                    }
816

    
817
                    if(collectionCode == null && instituteStr == null){
818
                        logger.warn(csvReportLine(regNumber, "Type: neither 'collectionCode' nor 'institute' found in ", text));
819
                        continue;
820
                    }
821
                    collection = getCollection(collectionCode, instituteStr, subCollectionStr);
822
                    specimen = makeSpecimenType(fieldUnit, collection, accessionNumber);
823
                    break;
824
                }
825
            }
826
        }
827
        if(specimen == null) {
828
            logger.warn(csvReportLine(regNumber, "Type: Could not parse specimen", typeName.name().toString(), text));
829
        }
830
        if(unusualAccessionNumber){
831
            logger.warn(csvReportLine(regNumber, "Type: Unusual accession number", typeName.name().toString(), text, accessionNumber));
832
        }
833
        return specimen;
834
    }
835

    
836
    private DerivedUnit makeSpecimenType(FieldUnit fieldUnit, Collection collection, String accessionNumber) {
837

    
838
        DerivedUnitFacade facade = DerivedUnitFacade.NewInstance(SpecimenOrObservationType.PreservedSpecimen, fieldUnit);
839
        facade.setCollection(collection);
840
        if(accessionNumber != null){
841
            facade.setAccessionNumber(accessionNumber);
842
        }
843
        return facade.innerDerivedUnit();
844
    }
845

    
846
    private BotanicalName makeBotanicalName(SimpleExcelTaxonImportState<CONFIG> state, String regNumber, String titleCacheStr, String nameStr,
847
                                            String authorStr, String nomRefTitle) {
848

    
849
        BotanicalName taxonName;// cache field for the taxonName.titleCache
850
        String taxonNameTitleCache = null;
851
        Map<String, AnnotationType> nameAnnotations = new HashMap<>();
852

    
853
        // TitleCache preprocessing
854
        if(titleCacheStr.endsWith(ANNOTATION_MARKER_STRING) || (authorStr != null && authorStr.endsWith(ANNOTATION_MARKER_STRING))){
855
            nameAnnotations.put("Author abbreviation not checked.", AnnotationType.EDITORIAL());
856
            titleCacheStr = titleCacheStr.replace(ANNOTATION_MARKER_STRING, "").trim();
857
            if(authorStr != null) {
858
                authorStr = authorStr.replace(ANNOTATION_MARKER_STRING, "").trim();
859
            }
860
        }
861

    
862
        // parse the full taxon name
863
        if(!StringUtils.isEmpty(nomRefTitle)){
864
            String referenceSeparator = nomRefTitle.startsWith("in ") ? " " : ", ";
865
            String taxonFullNameStr = titleCacheStr + referenceSeparator + nomRefTitle;
866
            logger.debug(":::::" + taxonFullNameStr);
867
            taxonName = (BotanicalName) nameParser.parseReferencedName(taxonFullNameStr, NomenclaturalCode.ICNAFP, null);
868
        } else {
869
            taxonName = (BotanicalName) nameParser.parseFullName(titleCacheStr, NomenclaturalCode.ICNAFP, null);
870
        }
871

    
872
        taxonNameTitleCache = taxonName.getTitleCache().trim();
873
        if (taxonName.isProtectedTitleCache()) {
874
            logger.warn(csvReportLine(regNumber, "Name could not be parsed", titleCacheStr));
875
        } else {
876

    
877
            boolean doRestoreTitleCacheStr = false;
878

    
879
            // Check if titleCache and nameCache are plausible
880
            String titleCacheCompareStr = titleCacheStr;
881
            String nameCache = taxonName.getNameCache();
882
            String nameCompareStr = nameStr;
883
            if(taxonName.isBinomHybrid()){
884
                titleCacheCompareStr = titleCacheCompareStr.replace(" x ", " ×");
885
                nameCompareStr = nameCompareStr.replace(" x ", " ×");
886
            }
887
            if(taxonName.isMonomHybrid()){
888
                titleCacheCompareStr = titleCacheCompareStr.replaceAll("^X ", "× ");
889
                nameCompareStr = nameCompareStr.replace("^X ", "× ");
890
            }
891
            if(authorStr != null && authorStr.contains(" et ")){
892
                titleCacheCompareStr = titleCacheCompareStr.replaceAll(" et ", " & ");
893
            }
894
            if (!taxonNameTitleCache.equals(titleCacheCompareStr)) {
895
                logger.warn(csvReportLine(regNumber, "The generated titleCache differs from the imported string", taxonNameTitleCache, " != ", titleCacheStr, " ==> original titleCacheStr has been restored"));
896
                doRestoreTitleCacheStr = true;
897
            }
898
            if (!nameCache.trim().equals(nameCompareStr)) {
899
                logger.warn(csvReportLine(regNumber, "The parsed nameCache differs from field '" + NAMESTRING + "'", nameCache, " != ", nameCompareStr));
900
            }
901

    
902
            //  Author
903
            //nameParser.handleAuthors(taxonName, titleCacheStr, authorStr);
904
            //if (!titleCacheStr.equals(taxonName.getTitleCache())) {
905
            //    logger.warn(regNumber + ": titleCache has changed after setting authors, will restore original titleCacheStr");
906
            //    doRestoreTitleCacheStr = true;
907
            //}
908

    
909
            if(doRestoreTitleCacheStr){
910
                taxonName.setTitleCache(titleCacheStr, true);
911
            }
912

    
913
            // deduplicate
914
            replaceAuthorNamesAndNomRef(state, taxonName);
915
        }
916

    
917
        // Annotations
918
        if(!nameAnnotations.isEmpty()){
919
            for(String text : nameAnnotations.keySet()){
920
                taxonName.addAnnotation(Annotation.NewInstance(text, nameAnnotations.get(text), Language.DEFAULT()));
921
            }
922
        }
923

    
924
        taxonName.addSource(OriginalSourceType.Import, regNumber, null, state.getConfig().getSourceReference(), null);
925

    
926
        getNameService().save(taxonName);
927

    
928
        return taxonName;
929
    }
930

    
931
    /**
932
     * @param state
933
     * @return
934
     */
935
    private TaxonNode getClassificationRootNode(IAPTImportState state) {
936

    
937
     //   Classification classification = state.getClassification();
938
     //   if (classification == null){
939
     //       IAPTImportConfigurator config = state.getConfig();
940
     //       classification = Classification.NewInstance(state.getConfig().getClassificationName());
941
     //       classification.setUuid(config.getClassificationUuid());
942
     //       classification.setReference(config.getSecReference());
943
     //       classification = getClassificationService().find(state.getConfig().getClassificationUuid());
944
     //   }
945
        TaxonNode rootNode = state.getRootNode();
946
        if (rootNode == null){
947
            rootNode = getTaxonNodeService().find(ROOT_UUID);
948
        }
949
        if (rootNode == null){
950
            Classification classification = state.getClassification();
951
            if (classification == null){
952
                Reference sec = state.getSecReference();
953
                String classificationName = state.getConfig().getClassificationName();
954
                Language language = Language.DEFAULT();
955
                classification = Classification.NewInstance(classificationName, sec, language);
956
                state.setClassification(classification);
957
                classification.setUuid(state.getConfig().getClassificationUuid());
958
                classification.getRootNode().setUuid(ROOT_UUID);
959
                getClassificationService().save(classification);
960
            }
961
            rootNode = classification.getRootNode();
962
            state.setRootNode(rootNode);
963
        }
964
        return rootNode;
965
    }
966

    
967
    private Collection getCollection(String collectionCode, String instituteStr, String subCollectionStr){
968

    
969
        Collection superCollection = null;
970
        if(subCollectionStr != null){
971
            superCollection = getCollection(collectionCode, instituteStr, null);
972
            collectionCode = subCollectionStr;
973
            instituteStr = null;
974
        }
975

    
976
        final String key = collectionCode + "-#i:" + StringUtils.defaultString(instituteStr);
977

    
978
        Collection collection = collectionMap.get(key);
979

    
980
        if(collection == null) {
981
            collection = Collection.NewInstance();
982
            collection.setCode(collectionCode);
983
            if(instituteStr != null){
984
                collection.setInstitute(Institution.NewNamedInstance(instituteStr));
985
            }
986
            if(superCollection != null){
987
                collection.setSuperCollection(superCollection);
988
            }
989
            collectionMap.put(key, collection);
990
            if(!_testMode) {
991
                getCollectionService().save(collection);
992
            }
993
        }
994

    
995
        return collection;
996
    }
997

    
998

    
999
    /**
1000
     * @param record
1001
     * @param originalKey
1002
     * @param doUnescapeHtmlEntities
1003
     * @return
1004
     */
1005
    private String getValue(HashMap<String, String> record, String originalKey, boolean doUnescapeHtmlEntities) {
1006
        String value = record.get(originalKey);
1007

    
1008
        value = fixCharacters(value);
1009

    
1010
        if (! StringUtils.isBlank(value)) {
1011
        	if (logger.isDebugEnabled()) {
1012
        	    logger.debug(originalKey + ": " + value);
1013
        	}
1014
        	value = CdmUtils.removeDuplicateWhitespace(value.trim()).toString();
1015
            if(doUnescapeHtmlEntities){
1016
                value = StringEscapeUtils.unescapeHtml(value);
1017
            }
1018
        	return value.trim();
1019
        }else{
1020
        	return null;
1021
        }
1022
    }
1023

    
1024
    /**
1025
     * Fixes broken characters.
1026
     * For details see
1027
     * http://dev.e-taxonomy.eu/redmine/issues/6035
1028
     *
1029
     * @param value
1030
     * @return
1031
     */
1032
    private String fixCharacters(String value) {
1033

    
1034
        value = StringUtils.replace(value, "s$K", "š");
1035
        value = StringUtils.replace(value, "n$K", "ň");
1036
        value = StringUtils.replace(value, "e$K", "ě");
1037
        value = StringUtils.replace(value, "r$K", "ř");
1038
        value = StringUtils.replace(value, "c$K", "č");
1039
        value = StringUtils.replace(value, "z$K", "ž");
1040
        value = StringUtils.replace(value, "S>U$K", "Š");
1041
        value = StringUtils.replace(value, "C>U$K", "Č");
1042
        value = StringUtils.replace(value, "R>U$K", "Ř");
1043
        value = StringUtils.replace(value, "Z>U$K", "Ž");
1044
        value = StringUtils.replace(value, "g$K", "ǧ");
1045
        value = StringUtils.replace(value, "s$A", "ś");
1046
        value = StringUtils.replace(value, "n$A", "ń");
1047
        value = StringUtils.replace(value, "c$A", "ć");
1048
        value = StringUtils.replace(value, "e$E", "ę");
1049
        value = StringUtils.replace(value, "o$H", "õ");
1050
        value = StringUtils.replace(value, "s$C", "ş");
1051
        value = StringUtils.replace(value, "t$C", "ț");
1052
        value = StringUtils.replace(value, "S>U$C", "Ş");
1053
        value = StringUtils.replace(value, "a$O", "å");
1054
        value = StringUtils.replace(value, "A>U$O", "Å");
1055
        value = StringUtils.replace(value, "u$O", "ů");
1056
        value = StringUtils.replace(value, "g$B", "ğ");
1057
        value = StringUtils.replace(value, "g$B", "ĕ");
1058
        value = StringUtils.replace(value, "a$B", "ă");
1059
        value = StringUtils.replace(value, "l$/", "ł");
1060
        value = StringUtils.replace(value, ">i", "ı");
1061
        value = StringUtils.replace(value, "i$U", "ï");
1062
        // Special-cases
1063
        value = StringUtils.replace(value, "&yacute", "ý");
1064
        value = StringUtils.replace(value, ">L", "Ł"); // corrected rule
1065
        value = StringUtils.replace(value, "E>U$D", "З");
1066
        value = StringUtils.replace(value, "S>U$E", "Ş");
1067
        value = StringUtils.replace(value, "s$E", "ş");
1068

    
1069
        value = StringUtils.replace(value, "c$k", "č");
1070
        value = StringUtils.replace(value, " U$K", " Š");
1071

    
1072
        value = StringUtils.replace(value, "O>U>!", "Ø");
1073
        value = StringUtils.replace(value, "o>!", "ø");
1074
        value = StringUtils.replace(value, "S$K", "Ŝ");
1075
        value = StringUtils.replace(value, ">l", "ğ");
1076

    
1077
        value = StringUtils.replace(value, "§B>i", "ł");
1078

    
1079

    
1080

    
1081
        return value;
1082
    }
1083

    
1084

    
1085
    /**
1086
	 *  Stores taxa records in DB
1087
	 */
1088
	@Override
1089
    protected void firstPass(SimpleExcelTaxonImportState<CONFIG> state) {
1090

    
1091
        if(excludeFromImport(state)){
1092
            return;
1093
        }
1094

    
1095
        String lineNumber = "L#" + state.getCurrentLine() + ": ";
1096
        logger.setLevel(Level.DEBUG);
1097
        HashMap<String, String> record = state.getOriginalRecord();
1098
        logger.debug(lineNumber + record.toString());
1099

    
1100
        Set<String> keys = record.keySet();
1101
        for (String key: keys) {
1102
            if (! expectedKeys.contains(key)){
1103
                logger.warn(lineNumber + "Unexpected Key: " + key);
1104
            }
1105
        }
1106

    
1107
        String reg_id = record.get(REGISTRATIONNO_PK);
1108

    
1109
        //higherTaxon
1110
        String higherTaxaString = record.get(HIGHERTAXON);
1111
        boolean isFossil = false;
1112
        if(higherTaxaString.startsWith("FOSSIL ")){
1113
            higherTaxaString = higherTaxaString.replace("FOSSIL ", "");
1114
            isFossil = true;
1115
        }
1116
        TaxonNode higherTaxon = getHigherTaxon(higherTaxaString, (IAPTImportState)state);
1117

    
1118
       //Taxon
1119
        Taxon taxon = makeTaxon(record, state, higherTaxon, isFossil);
1120
        if (taxon == null){
1121
            logger.warn(lineNumber + "taxon could not be created and is null");
1122
            return;
1123
        }
1124
        ((IAPTImportState)state).setCurrentTaxon(taxon);
1125

    
1126
        // Registration
1127
        IAPTRegData regData = makeIAPTRegData(state);
1128
        ObjectMapper mapper = new ObjectMapper();
1129
        try {
1130
            String regdataJson = mapper.writeValueAsString(regData);
1131
            Extension.NewInstance(taxon.getName(), regdataJson, getExtensionTypeIAPTRegData());
1132
            getNameService().save(taxon.getName());
1133
        } catch (JsonProcessingException e) {
1134
            logger.error("Error on converting IAPTRegData", e);
1135
        }
1136

    
1137
        logger.info("#of imported Genera: " + ((IAPTImportState) state).getGenusTaxonMap().size());
1138
		return;
1139
    }
1140

    
1141
    private boolean excludeFromImport(SimpleExcelTaxonImportState<CONFIG> state) {
1142
        boolean include = false;
1143
        if(state.getConfig().isDoAlgeaeOnly()){
1144
            String higherTaxon = getValue(state.getOriginalRecord(), HIGHERTAXON, true);
1145
            String fullNameStr = getValue(state.getOriginalRecord(), FULLNAME, true);
1146
            include |= higherTaxon.matches(".*?PHYCEAE(?:$|\\s+)");
1147
            for(String test : new String[]{
1148
                    "Bolidophyceae ",
1149
                    "Phaeothamniophyceae ",
1150
                    "Bolidomonadales ",
1151
                    "Bolidomonadaceae ",
1152
                    "Aureoumbra ",
1153
                    "Bolidomonas ",
1154
                    "Seagriefia ",
1155
                    "Navicula "
1156
                })
1157
            include |= fullNameStr.startsWith(test);
1158
        }
1159

    
1160
        return !include;
1161
    }
1162

    
1163
    private ExtensionType getExtensionTypeIAPTRegData() {
1164
        if(extensionTypeIAPTRegData == null){
1165
            extensionTypeIAPTRegData = ExtensionType.NewInstance("IAPTRegData.json", "IAPTRegData.json", "");
1166
            getTermService().save(extensionTypeIAPTRegData);
1167
        }
1168
        return extensionTypeIAPTRegData;
1169
    }
1170

    
1171
    private IAPTRegData makeIAPTRegData(SimpleExcelTaxonImportState<CONFIG> state) {
1172

    
1173
        HashMap<String, String> record = state.getOriginalRecord();
1174
        String registrationStr = getValue(record, REGISTRATION);
1175
        String regDateStr = getValue(record, REGDATE);
1176
        String regStr = getValue(record, REGISTRATION, true);
1177

    
1178
        String dateStr = null;
1179
        String office = null;
1180
        Integer regID = null;
1181
        Integer formNo = null;
1182

    
1183
        Matcher m = registrationPattern.matcher(registrationStr);
1184
        if(m.matches()){
1185
            dateStr = m.group("regdate");
1186
            if(parseDate( regStr, dateStr) == null){
1187
                // check for valid dates
1188
                logger.warn(csvReportLine(regStr, REGISTRATION + ": could not parse date", dateStr, " in ", registrationStr));
1189
            };
1190
            office = m.group("office");
1191
            regID = Integer.valueOf(m.group("regid"));
1192
            try {
1193
                formNo = Integer.valueOf(m.group("formNo"));
1194
            } catch(IllegalArgumentException e){
1195
                // ignore
1196
            }
1197
        } else {
1198
            logger.warn(csvReportLine(regStr, REGISTRATION + ": could not be parsed", registrationStr));
1199
        }
1200
        IAPTRegData regData = new IAPTRegData(dateStr, office, regID, formNo);
1201
        return regData;
1202
    }
1203

    
1204
    private TaxonNode getHigherTaxon(String higherTaxaString, IAPTImportState state) {
1205
        String[] higherTaxaNames = higherTaxaString.toLowerCase().replaceAll("[\\[\\]]", "").split(":");
1206
        TaxonNode higherTaxonNode = null;
1207

    
1208
        ITaxonTreeNode rootNode = getClassificationRootNode(state);
1209
        for (String htn :  higherTaxaNames) {
1210
            htn = StringUtils.capitalize(htn.trim());
1211
            Taxon higherTaxon = state.getHigherTaxon(htn);
1212
            if (higherTaxon != null){
1213
                higherTaxonNode = higherTaxon.getTaxonNodes().iterator().next();
1214
            }else{
1215
                BotanicalName name = makeHigherTaxonName(state, htn);
1216
                Reference sec = state.getSecReference();
1217
                higherTaxon = Taxon.NewInstance(name, sec);
1218
                getTaxonService().save(higherTaxon);
1219
                higherTaxonNode = rootNode.addChildTaxon(higherTaxon, sec, null);
1220
                state.putHigherTaxon(htn, higherTaxon);
1221
                getClassificationService().saveTreeNode(higherTaxonNode);
1222
            }
1223
            rootNode = higherTaxonNode;
1224
        }
1225
        return higherTaxonNode;
1226
    }
1227

    
1228
    private BotanicalName makeHigherTaxonName(IAPTImportState state, String name) {
1229

    
1230
        Rank rank = guessRank(name);
1231

    
1232
        BotanicalName taxonName = BotanicalName.NewInstance(rank);
1233
        taxonName.addSource(makeOriginalSource(state));
1234
        taxonName.setGenusOrUninomial(StringUtils.capitalize(name));
1235
        return taxonName;
1236
    }
1237

    
1238
    private Rank guessRank(String name) {
1239

    
1240
        // normalize
1241
        name = name.replaceAll("\\(.*\\)", "").trim();
1242

    
1243
        if(name.matches("^Plantae$|^Fungi$")){
1244
           return Rank.KINGDOM();
1245
        } else if(name.matches("^Incertae sedis$|^No group assigned$")){
1246
           return rankFamilyIncertisSedis();
1247
        } else if(name.matches(".*phyta$|.*mycota$")){
1248
           return Rank.PHYLUM();
1249
        } else if(name.matches(".*phytina$|.*mycotina$")){
1250
           return Rank.SUBPHYLUM();
1251
        } else if(name.matches("Gymnospermae$|.*ones$")){ // Monocotyledones, Dicotyledones
1252
            return rankUnrankedSupraGeneric();
1253
        } else if(name.matches(".*opsida$|.*phyceae$|.*mycetes$|.*ones$|^Musci$|^Hepaticae$")){
1254
           return Rank.CLASS();
1255
        } else if(name.matches(".*idae$|.*phycidae$|.*mycetidae$")){
1256
           return Rank.SUBCLASS();
1257
        } else if(name.matches(".*ales$")){
1258
           return Rank.ORDER();
1259
        } else if(name.matches(".*ineae$")){
1260
           return Rank.SUBORDER();
1261
        } else if(name.matches(".*aceae$")){
1262
            return Rank.FAMILY();
1263
        } else if(name.matches(".*oideae$")){
1264
           return Rank.SUBFAMILY();
1265
        } else
1266
        //    if(name.matches(".*eae$")){
1267
        //    return Rank.TRIBE();
1268
        // } else
1269
            if(name.matches(".*inae$")){
1270
           return Rank.SUBTRIBE();
1271
        } else if(name.matches(".*ae$")){
1272
           return Rank.FAMILY();
1273
        }
1274
        return Rank.UNKNOWN_RANK();
1275
    }
1276

    
1277
    private Rank rankUnrankedSupraGeneric() {
1278

    
1279
        if(rankUnrankedSupraGeneric == null){
1280
            rankUnrankedSupraGeneric = Rank.NewInstance(RankClass.Suprageneric, "Unranked supra generic", " ", " ");
1281
            getTermService().save(rankUnrankedSupraGeneric);
1282
        }
1283
        return rankUnrankedSupraGeneric;
1284
    }
1285

    
1286
    private Rank rankFamilyIncertisSedis() {
1287

    
1288
        if(familyIncertisSedis == null){
1289
            familyIncertisSedis = Rank.NewInstance(RankClass.Suprageneric, "Family incertis sedis", " ", " ");
1290
            getTermService().save(familyIncertisSedis);
1291
        }
1292
        return familyIncertisSedis;
1293
    }
1294

    
1295
    private AnnotationType annotationTypeCaveats(){
1296
        if(annotationTypeCaveats == null){
1297
            annotationTypeCaveats = AnnotationType.NewInstance("Caveats", "Caveats", "");
1298
            getTermService().save(annotationTypeCaveats);
1299
        }
1300
        return annotationTypeCaveats;
1301
    }
1302

    
1303

    
1304
    /**
1305
     * @param state
1306
     * @return
1307
     */
1308
    private IdentifiableSource makeOriginalSource(IAPTImportState state) {
1309
        return IdentifiableSource.NewDataImportInstance("line: " + state.getCurrentLine(), null, state.getConfig().getSourceReference());
1310
    }
1311

    
1312

    
1313
    private Reference makeReference(IAPTImportState state, UUID uuidRef) {
1314
        Reference ref = state.getReference(uuidRef);
1315
        if (ref == null){
1316
            ref = getReferenceService().find(uuidRef);
1317
            state.putReference(uuidRef, ref);
1318
        }
1319
        return ref;
1320
    }
1321

    
1322
    private MarkerType markerTypeFossil(){
1323
        if(this.markerTypeFossil == null){
1324
            markerTypeFossil = MarkerType.NewInstance("isFossilTaxon", "isFossil", null);
1325
            getTermService().save(this.markerTypeFossil);
1326
        }
1327
        return markerTypeFossil;
1328
    }
1329

    
1330
    private MarkerType markerDuplicateRegistration(){
1331
        if(this.duplicateRegistration == null){
1332
            duplicateRegistration = MarkerType.NewInstance("duplicateRegistration", "duplicateRegistration", null);
1333
            getTermService().save(this.duplicateRegistration);
1334
        }
1335
        return markerTypeFossil;
1336
    }
1337

    
1338
    private String csvReportLine(String regId, String message, String ... fields){
1339
        StringBuilder out = new StringBuilder("regID#");
1340
        out.append(regId).append(",\"").append(message).append('"');
1341

    
1342
        for(String f : fields){
1343
            out.append(",\"").append(f).append('"');
1344
        }
1345
        return out.toString();
1346
    }
1347

    
1348

    
1349
}
(1-1/5)