Project

General

Profile

Download (31.3 KB) Statistics
| Branch: | Revision:
1
/**
2
 * Copyright (C) 2007 EDIT
3
 * European Distributed Institute of Taxonomy
4
 * http://www.e-taxonomy.eu
5
 *
6
 * The contents of this file are subject to the Mozilla Public License Version 1.1
7
 * See LICENSE.TXT at the top of this package for the full license terms.
8
 */
9

    
10
package eu.etaxonomy.cdm.io.iapt;
11

    
12
import eu.etaxonomy.cdm.api.facade.DerivedUnitFacade;
13
import eu.etaxonomy.cdm.common.CdmUtils;
14
import eu.etaxonomy.cdm.io.mexico.SimpleExcelTaxonImport;
15
import eu.etaxonomy.cdm.io.mexico.SimpleExcelTaxonImportState;
16
import eu.etaxonomy.cdm.model.common.*;
17
import eu.etaxonomy.cdm.model.name.*;
18
import eu.etaxonomy.cdm.model.occurrence.DerivedUnit;
19
import eu.etaxonomy.cdm.model.occurrence.FieldUnit;
20
import eu.etaxonomy.cdm.model.occurrence.SpecimenOrObservationType;
21
import eu.etaxonomy.cdm.model.reference.Reference;
22
import eu.etaxonomy.cdm.model.taxon.*;
23
import eu.etaxonomy.cdm.strategy.parser.NonViralNameParserImpl;
24
import org.apache.commons.lang.ArrayUtils;
25
import org.apache.commons.lang.StringEscapeUtils;
26
import org.apache.commons.lang.StringUtils;
27
import org.apache.log4j.Level;
28
import org.apache.log4j.Logger;
29
import org.joda.time.DateTimeFieldType;
30
import org.joda.time.Partial;
31
import org.springframework.stereotype.Component;
32

    
33
import java.util.*;
34
import java.util.regex.Matcher;
35
import java.util.regex.Pattern;
36

    
37
/**
38
 * @author a.mueller
39
 * @created 05.01.2016
40
 */
41

    
42
@Component("iAPTExcelImport")
43
public class IAPTExcelImport<CONFIG extends IAPTImportConfigurator> extends SimpleExcelTaxonImport<CONFIG> {
44
    private static final long serialVersionUID = -747486709409732371L;
45
    private static final Logger logger = Logger.getLogger(IAPTExcelImport.class);
46
    public static final String ANNOTATION_MARKER_STRING = "[*]";
47

    
48

    
49
    private static UUID ROOT_UUID = UUID.fromString("4137fd2a-20f6-4e70-80b9-f296daf51d82");
50

    
51
    private static NonViralNameParserImpl nameParser = NonViralNameParserImpl.NewInstance();
52

    
53
    private final static String REGISTRATIONNO_PK= "RegistrationNo_Pk";
54
    private final static String HIGHERTAXON= "HigherTaxon";
55
    private final static String FULLNAME= "FullName";
56
    private final static String AUTHORSSPELLING= "AuthorsSpelling";
57
    private final static String LITSTRING= "LitString";
58
    private final static String REGISTRATION= "Registration";
59
    private final static String TYPE= "Type";
60
    private final static String CAVEATS= "Caveats";
61
    private final static String FULLBASIONYM= "FullBasionym";
62
    private final static String FULLSYNSUBST= "FullSynSubst";
63
    private final static String NOTESTXT= "NotesTxt";
64
    private final static String REGDATE= "RegDate";
65
    private final static String NAMESTRING= "NameString";
66
    private final static String BASIONYMSTRING= "BasionymString";
67
    private final static String SYNSUBSTSTR= "SynSubstStr";
68
    private final static String AUTHORSTRING= "AuthorString";
69

    
70
    private  static List<String> expectedKeys= Arrays.asList(new String[]{
71
            REGISTRATIONNO_PK, HIGHERTAXON, FULLNAME, AUTHORSSPELLING, LITSTRING, REGISTRATION, TYPE, CAVEATS, FULLBASIONYM, FULLSYNSUBST, NOTESTXT, REGDATE, NAMESTRING, BASIONYMSTRING, SYNSUBSTSTR, AUTHORSTRING});
72

    
73
    private static final Pattern nomRefTokenizeP = Pattern.compile("^(.*):\\s([^\\.:]+)\\.(.*?)\\.?$");
74
    private static final Pattern[] nomRefPubDatePs = new Pattern[]{
75
            // NOTE:
76
            // The order of the patterns is extremely important!!!
77
            //
78
            // all patterns cover the years 1700 - 1999
79
            Pattern.compile("^(?<year>1[7,8,9][0-9]{2})$"), // only year, like '1969'
80
            Pattern.compile("^(?<monthName>\\p{L}+\\.?)\\s(?<day>[0-9]{1,2})(?:st|rd|th)?\\.?,?\\s(?<year>(?:1[7,8,9])?[0-9]{2})$"), // full date like April 12, 1969 or april 12th 1999
81
            Pattern.compile("^(?<monthName>\\p{L}+\\.?),?\\s(?<year>(?:1[7,8,9])?[0-9]{2})$"), // April 99 or April, 1999 or Apr. 12
82
            Pattern.compile("^(?<day>[0-9]{1,2})([\\.\\-/])(?<month>[0-1]?[0-9])\\2(?<year>(?:1[7,8,9])?[0-9]{2})$"), // full date like 12.04.1969 or 12/04/1969 or 12-04-1969
83
            Pattern.compile("^(?<month>[0-1]?[0-9])([\\.\\-/])(?<year>(?:1[7,8,9])?[0-9]{2})$"), // partial date like 04.1969 or 04/1969 or 04-1969
84
            Pattern.compile("^(?<day>[0-9]{1,2})(?:[\\./]|th|rd)?\\s(?<monthName>\\p{L}+\\.?),?\\s(?<year>(?:1[7,8,9])?[0-9]{2})$"), // full date like 12. April 1969 or april 1999 or 22 Dec.1999
85
        };
86
    private static final Pattern typeSplitPattern =  Pattern.compile("^(?:\"*[Tt]ype: (?<type>.*?))(?:[Hh]olotype:(?<holotype>.*?))?(?:[Ii]sotype[^:]*:(?<isotype>.*))?$");
87

    
88
    /**
     * Maps month names, abbreviations and roman numerals to the month number (1 - 12).
     * All keys are stored trimmed and in lower case, so look-ups must use the lower-cased name.
     */
    private static Map<String, Integer> monthFromNameMap = new HashMap<>();
    static {
        String[] ck = new String[]{"leden", "únor", "březen", "duben", "květen", "červen", "červenec", "srpen", "září", "říjen", "listopad", "prosinec"};
        String[] fr = new String[]{"janvier", "février", "mars", "avril", "mai", "juin", "juillet", "août", "septembre", "octobre", "novembre", "décembre"};
        String[] de = new String[]{"januar", "februar", "märz", "april", "mai", "juni", "juli", "august", "september", "oktober", "november", "dezember"};
        String[] en = new String[]{"january", "february", "march", "april", "may", "june", "july", "august", "september", "october", "november", "december"};
        String[] it = new String[]{"gennaio", "febbraio", "marzo", "aprile", "maggio", "giugno", "luglio", "agosto", "settembre", "ottobre", "novembre", "dicembre"};
        String[] sp = new String[]{"enero", "febrero", "marzo", "abril", "mayo", "junio", "julio", "agosto", "septiembre", "octubre", "noviembre", "diciembre"};
        String[] de_abbrev = new String[]{"jan.", "feb.", "märz", "apr.", "mai", "jun.", "jul.", "aug.", "sept.", "okt.", "nov.", "dez."};
        String[] en_abbrev = new String[]{"jan.", "feb.", "mar.", "apr.", "may", "jun.", "jul.", "aug.", "sep.", "oct.", "nov.", "dec."};
        String[] port = new String[]{"Janeiro", "Fevereiro", "Março", "Abril", "Maio", "Junho", "Julho", "Agosto", "Setembro", "Outubro", "Novembro", "Dezembro"};
        String[] rom_num = new String[]{"i", "ii", "iii", "iv", "v", "vi", "vii", "viii", "ix", "x", "xi", "xii"};

        String[][] perLang =  new String[][]{ck, de, fr, en, it, sp, port, de_abbrev, en_abbrev, rom_num};

        for (String[] months: perLang) {
            for(int m = 0; m < months.length; m++){
                // normalise the key: a key with stray whitespace or upper case letters
                // could never be hit by a lower-cased look-up
                monthFromNameMap.put(months[m].trim().toLowerCase(Locale.ROOT), m + 1);
            }
        }

        // special cases
        monthFromNameMap.put("mar", 3);
        monthFromNameMap.put("dec", 12);
        monthFromNameMap.put("februari", 2);
    }
114

    
115
    enum TypesName {
116
        type, holotype, isotype;
117

    
118
        public SpecimenTypeDesignationStatus status(){
119
            switch (this) {
120
                case holotype:
121
                    return SpecimenTypeDesignationStatus.HOLOTYPE();
122
                case isotype:
123
                    return SpecimenTypeDesignationStatus.ISOTYPE();
124
                default:
125
                    return null;
126
            }
127
        }
128
    }
129

    
130
    private MarkerType markerTypeFossil = null;
131
    private Rank rankUnrankedSupraGeneric = null;
132
    private Rank familyIncertisSedis = null;
133
    private AnnotationType annotationTypeCaveats = null;
134

    
135
    private Taxon makeTaxon(HashMap<String, String> record, SimpleExcelTaxonImportState<CONFIG> state,
136
                            TaxonNode higherTaxonNode, boolean isFossil) {
137

    
138
        String line = state.getCurrentLine() + ": ";
139

    
140
        String regNumber = getValue(record, REGISTRATIONNO_PK, false);
141
        String titleCacheStr = getValue(record, FULLNAME, true);
142
        String nameStr = getValue(record, NAMESTRING, true);
143
        String authorStr = getValue(record, AUTHORSTRING, true);
144
        String nomRefStr = getValue(record, LITSTRING, true);
145
        String authorsSpelling = getValue(record, AUTHORSSPELLING, true);
146
        String notesTxt = getValue(record, NOTESTXT, true);
147
        String caveats = getValue(record, CAVEATS, true);
148
        String fullSynSubstStr = getValue(record, FULLSYNSUBST, true);
149
        String synSubstStr = getValue(record, SYNSUBSTSTR, true);
150
        String typeStr = getValue(record, TYPE, true);
151

    
152
        String nomRefTitle = null;
153
        String nomRefDetail = null;
154
        String nomRefPupDate = null;
155
        String nomRefPupDay = null;
156
        String nomRefPupMonth = null;
157
        String nomRefPupMonthName = null;
158
        String nomRefPupYear = null;
159

    
160
        // preprocess nomRef: separate citation, reference detail, publishing date
161
        if(!StringUtils.isEmpty(nomRefStr)){
162
            nomRefStr = nomRefStr.trim();
163
            Matcher m = nomRefTokenizeP.matcher(nomRefStr);
164
            if(m.matches()){
165
                nomRefTitle = m.group(1);
166
                nomRefDetail = m.group(2);
167
                nomRefPupDate = m.group(3).trim();
168

    
169
                // nomRefDetail.replaceAll("[\\:\\.\\s]", ""); // TODO integrate into nomRefTokenizeP
170
                for(Pattern p : nomRefPubDatePs){
171
                    Matcher m2 = p.matcher(nomRefPupDate);
172
                    if(m2.matches()){
173
                        try {
174
                            nomRefPupYear = m2.group("year");
175
                        } catch (IllegalArgumentException e){
176
                            // named capture group not found
177
                        }
178
                        try {
179
                            nomRefPupMonth = m2.group("month");
180
                        } catch (IllegalArgumentException e){
181
                            // named capture group not found
182
                        }
183
                        try {
184
                            nomRefPupMonthName = m2.group("monthName");
185
                            nomRefPupMonth = monthFromName(nomRefPupMonthName, regNumber);
186
                        } catch (IllegalArgumentException e){
187
                            // named capture group not found
188
                        }
189
                        try {
190
                            nomRefPupDay = m2.group("day");
191
                        } catch (IllegalArgumentException e){
192
                            // named capture group not found
193
                        }
194

    
195
                        if(nomRefPupYear == null){
196
                            logger.error("nomRefPupYear in " + nomRefStr + " is  NULL" );
197
                        }
198
                        if(nomRefPupYear.length() == 2 ){
199
                            // it is an abbreviated year from the 19** years
200
                            nomRefPupYear = "19" + nomRefPupYear;
201
                        }
202
                        nomRefTitle = nomRefTitle + ": " + nomRefDetail + ". " + nomRefPupYear + ".";
203
                        break;
204
                    }
205
                }
206
                if(nomRefPupYear == null){
207
                    logger.warn("Pub date not found in [" + regNumber + "]: " + nomRefPupDate + " from " + nomRefStr );
208
                }
209
                List<DateTimeFieldType> types = new ArrayList<>();
210
                List<Integer> values = new ArrayList<>();
211
                if(nomRefPupYear != null){
212
                    types.add(DateTimeFieldType.year());
213
                    values.add(Integer.parseInt(nomRefPupYear));
214
                }
215
                if(nomRefPupMonth != null){
216
                    types.add(DateTimeFieldType.monthOfYear());
217
                    values.add(Integer.parseInt(nomRefPupMonth));
218
                }
219
                if(nomRefPupDay != null){
220
                    types.add(DateTimeFieldType.dayOfMonth());
221
                    values.add(Integer.parseInt(nomRefPupDay));
222
                }
223
                Partial pupDate = new Partial(types.toArray(new DateTimeFieldType[types.size()]), ArrayUtils.toPrimitive(values.toArray(new Integer[values.size()])));
224

    
225
            } else {
226
                nomRefTitle = nomRefStr;
227
            }
228
        }
229

    
230
        BotanicalName taxonName = makeBotanicalName(state, titleCacheStr, nameStr, authorStr, nomRefTitle);
231

    
232
        if(!StringUtils.isEmpty(notesTxt)){
233
            notesTxt = notesTxt.replace("Notes: ", "").trim();
234
            taxonName.addAnnotation(Annotation.NewInstance(notesTxt, AnnotationType.EDITORIAL(), Language.DEFAULT()));
235
        }
236
        if(!StringUtils.isEmpty(caveats)){
237
            caveats = caveats.replace("Caveats: ", "").trim();
238
            taxonName.addAnnotation(Annotation.NewInstance(caveats, annotationTypeCaveats(), Language.DEFAULT()));
239
        }
240
        //
241

    
242
        // Namerelations
243
        if(!StringUtils.isEmpty(authorsSpelling)){
244
            authorsSpelling = authorsSpelling.replaceFirst("Author's spelling:", "").replaceAll("\"", "").trim();
245

    
246
            String[] authorSpellingTokens = StringUtils.split(authorsSpelling, " ");
247
            String[] nameStrTokens = StringUtils.split(nameStr, " ");
248

    
249
            ArrayUtils.reverse(authorSpellingTokens);
250
            ArrayUtils.reverse(nameStrTokens);
251

    
252
            for (int i = 0; i < nameStrTokens.length; i++){
253
                if(i < authorSpellingTokens.length){
254
                    nameStrTokens[i] = authorSpellingTokens[i];
255
                }
256
            }
257
            ArrayUtils.reverse(nameStrTokens);
258

    
259
            String misspelledNameStr = StringUtils.join (nameStrTokens, ' ');
260
            // build the fullnameString of the misspelled name
261
            misspelledNameStr = taxonName.getTitleCache().replace(nameStr, misspelledNameStr);
262

    
263
            TaxonNameBase misspelledName = (BotanicalName) nameParser.parseReferencedName(misspelledNameStr, NomenclaturalCode.ICNAFP, null);
264
            misspelledName.addRelationshipToName(taxonName, NameRelationshipType.MISSPELLING(), null);
265
            getNameService().save(misspelledName);
266
        }
267

    
268
        // Replaced Synonyms
269
        if(!StringUtils.isEmpty(fullSynSubstStr)){
270
            fullSynSubstStr = fullSynSubstStr.replace("Syn. subst.: ", "");
271
            BotanicalName replacedSynonymName = makeBotanicalName(state, fullSynSubstStr, synSubstStr, null, null);
272
            replacedSynonymName.addReplacedSynonym(taxonName, null, null, null);
273
            getNameService().save(replacedSynonymName);
274
        }
275

    
276
        Reference sec = state.getConfig().getSecReference();
277
        Taxon taxon = Taxon.NewInstance(taxonName, sec);
278

    
279
        // Markers
280
        if(isFossil){
281
            taxon.addMarker(Marker.NewInstance(markerTypeFossil(), true));
282
        }
283

    
284
        // Types
285
        if(!StringUtils.isEmpty(typeStr)){
286
            Matcher m = typeSplitPattern.matcher(typeStr);
287

    
288
            if(m.matches()){
289
                String typeString = m.group(TypesName.type.name());
290
                boolean isFieldUnit = typeStr.matches(".*([°']|\\d+\\s?m\\s|\\d+\\s?km\\s).*"); // check for location or unit m, km
291

    
292
                if(isFieldUnit) {
293
                    // type as fieldUnit
294
                    FieldUnit fu = FieldUnit.NewInstance();
295
                    fu.setTitleCache(typeString, true);
296
                    getOccurrenceService().save(fu);
297

    
298
                    // all others ..
299
                    addSpecimenTypes(taxonName, fu, m.group(TypesName.holotype.name()), TypesName.holotype, false);
300
                    addSpecimenTypes(taxonName, fu, m.group(TypesName.isotype.name()), TypesName.isotype, true);
301
                } else {
302
                    TaxonNameBase typeName = nameParser.parseFullName(typeString);
303
                    taxonName.addNameTypeDesignation(typeName, null, null, null, NameTypeDesignationStatus.AUTOMATIC(), true, true, true, true);
304
                }
305
            }
306
            getNameService().save(taxonName);
307

    
308
        }
309

    
310
        getTaxonService().save(taxon);
311
        if(higherTaxonNode != null){
312
            higherTaxonNode.addChildTaxon(taxon, null, null);
313
            getTaxonNodeService().save(higherTaxonNode);
314
        }
315

    
316
        return taxon;
317

    
318
    }
319

    
320
    private String monthFromName(String monthName, String regNumber) {
321

    
322
        Integer month = monthFromNameMap.get(monthName.toLowerCase());
323
        if(month == null){
324
            logger.warn("Unknown month [" + regNumber + "]: " + monthName + " (" + monthName.toLowerCase() + ")");
325
            return null;
326
        } else {
327
            return month.toString();
328
        }
329
    }
330

    
331

    
332
    private void addSpecimenTypes(BotanicalName taxonName, FieldUnit fieldUnit, String typeStr, TypesName typeName, boolean multiple){
333
        if(StringUtils.isEmpty(typeStr)){
334
            return;
335
        }
336
        typeStr = typeStr.trim().replaceAll("\\.$", "");
337

    
338
        List<String> typeData = new ArrayList<>();
339
        if(multiple){
340
            String[] tokens = typeStr.split("\\s?,\\s?");
341
            for (String t : tokens) {
342
                if(!t.isEmpty()){
343
                    typeData.add(t.trim());
344
                }
345
            }
346
        } else {
347
            typeData.add(typeStr.trim());
348
        }
349

    
350
        for(String type : typeData){
351
            DerivedUnitFacade facade = DerivedUnitFacade.NewInstance(SpecimenOrObservationType.OtherSpecimen, fieldUnit);
352
            facade.setTitleCache(type, true);
353
            DerivedUnit specimen = facade.innerDerivedUnit();
354
            taxonName.addSpecimenTypeDesignation(specimen, typeName.status(), null, null, null, false, true);
355
       }
356
    }
357

    
358
    private BotanicalName makeBotanicalName(SimpleExcelTaxonImportState<CONFIG> state, String titleCacheStr, String nameStr,
359
                                            String authorStr, String nomRefTitle) {
360

    
361
        BotanicalName taxonName;// cache field for the taxonName.titleCache
362
        String taxonNameTitleCache = null;
363
        Map<String, AnnotationType> nameAnnotations = new HashMap<>();
364

    
365
        String line = state.getCurrentLine() + ": ";
366

    
367
        // TitleCache preprocessing
368
        if(titleCacheStr.endsWith(ANNOTATION_MARKER_STRING) || (authorStr != null && authorStr.endsWith(ANNOTATION_MARKER_STRING))){
369
            nameAnnotations.put("Author abbreviation not checked.", AnnotationType.EDITORIAL());
370
            titleCacheStr = titleCacheStr.replace(ANNOTATION_MARKER_STRING, "").trim();
371
            authorStr = authorStr.replace(ANNOTATION_MARKER_STRING, "").trim();
372
        }
373

    
374
        // parse the full taxon name
375
        if(!StringUtils.isEmpty(nomRefTitle)){
376
            String referenceSeparator = nomRefTitle.startsWith("in ") ? " " : ", ";
377
            String taxonFullNameStr = titleCacheStr + referenceSeparator + nomRefTitle;
378
            logger.debug(":::::" + taxonFullNameStr);
379
            taxonName = (BotanicalName) nameParser.parseReferencedName(taxonFullNameStr, NomenclaturalCode.ICNAFP, null);
380
        } else {
381
            taxonName = (BotanicalName) nameParser.parseFullName(titleCacheStr, NomenclaturalCode.ICNAFP, null);
382
        }
383

    
384
        taxonNameTitleCache = taxonName.getTitleCache().trim();
385
        if (taxonName.isProtectedTitleCache()) {
386
            logger.warn(line + "Name could not be parsed: " + titleCacheStr);
387
        } else {
388

    
389
            boolean doRestoreTitleCacheStr = false;
390

    
391
            // Check if titleCache and nameCache are plausible
392
            String titleCacheCompareStr = titleCacheStr;
393
            String nameCache = taxonName.getNameCache();
394
            String nameCompareStr = nameStr;
395
            if(taxonName.isBinomHybrid()){
396
                titleCacheCompareStr = titleCacheCompareStr.replace(" x ", " ×");
397
                nameCompareStr = nameCompareStr.replace(" x ", " ×");
398
            }
399
            if(taxonName.isMonomHybrid()){
400
                titleCacheCompareStr = titleCacheCompareStr.replaceAll("^X ", "× ");
401
                nameCompareStr = nameCompareStr.replace("^X ", "× ");
402
            }
403
            if(authorStr != null && authorStr.contains(" et ")){
404
                titleCacheCompareStr = titleCacheCompareStr.replaceAll(" et ", " & ");
405
            }
406
            if (!taxonNameTitleCache.equals(titleCacheCompareStr)) {
407
                logger.warn(line + "The generated titleCache differs from the imported string : " + taxonNameTitleCache + " <> " + titleCacheStr + " will restore original titleCacheStr");
408
                doRestoreTitleCacheStr = true;
409
            }
410
            if (!nameCache.trim().equals(nameCompareStr)) {
411
                logger.warn(line + "The parsed nameCache differs from " + NAMESTRING + " : " + nameCache + " <> " + nameCompareStr);
412
            }
413

    
414
            //  Author
415
            //nameParser.handleAuthors(taxonName, titleCacheStr, authorStr);
416
            //if (!titleCacheStr.equals(taxonName.getTitleCache())) {
417
            //    logger.warn(line + "titleCache has changed after setting authors, will restore original titleCacheStr");
418
            //    doRestoreTitleCacheStr = true;
419
            //}
420

    
421
            if(doRestoreTitleCacheStr){
422
                taxonName.setTitleCache(titleCacheStr, true);
423
            }
424

    
425
            // deduplicate
426
            replaceAuthorNamesAndNomRef(state, taxonName);
427
        }
428

    
429
        // Annotations
430
        if(!nameAnnotations.isEmpty()){
431
            for(String text : nameAnnotations.keySet()){
432
                taxonName.addAnnotation(Annotation.NewInstance(text, nameAnnotations.get(text), Language.DEFAULT()));
433
            }
434
            getNameService().save(taxonName);
435
        }
436
        return taxonName;
437
    }
438

    
439
    /**
440
     * @param state
441
     * @return
442
     */
443
    private TaxonNode getClassificationRootNode(IAPTImportState state) {
444

    
445
     //   Classification classification = state.getClassification();
446
     //   if (classification == null){
447
     //       IAPTImportConfigurator config = state.getConfig();
448
     //       classification = Classification.NewInstance(state.getConfig().getClassificationName());
449
     //       classification.setUuid(config.getClassificationUuid());
450
     //       classification.setReference(config.getSecReference());
451
     //       classification = getClassificationService().find(state.getConfig().getClassificationUuid());
452
     //   }
453
        TaxonNode rootNode = state.getRootNode();
454
        if (rootNode == null){
455
            rootNode = getTaxonNodeService().find(ROOT_UUID);
456
        }
457
        if (rootNode == null){
458
            Classification classification = state.getClassification();
459
            if (classification == null){
460
                Reference sec = state.getSecReference();
461
                String classificationName = state.getConfig().getClassificationName();
462
                Language language = Language.DEFAULT();
463
                classification = Classification.NewInstance(classificationName, sec, language);
464
                state.setClassification(classification);
465
                classification.setUuid(state.getConfig().getClassificationUuid());
466
                classification.getRootNode().setUuid(ROOT_UUID);
467
                getClassificationService().save(classification);
468
            }
469
            rootNode = classification.getRootNode();
470
            state.setRootNode(rootNode);
471
        }
472
        return rootNode;
473
    }
474

    
475

    
476
    /**
477
     * @param record
478
     * @param originalKey
479
     * @param doUnescapeHtmlEntities
480
     * @return
481
     */
482
    private String getValue(HashMap<String, String> record, String originalKey, boolean doUnescapeHtmlEntities) {
483
        String value = record.get(originalKey);
484

    
485
        value = fixCharacters(value);
486

    
487
        if (! StringUtils.isBlank(value)) {
488
        	if (logger.isDebugEnabled()) {
489
        	    logger.debug(originalKey + ": " + value);
490
        	}
491
        	value = CdmUtils.removeDuplicateWhitespace(value.trim()).toString();
492
            if(doUnescapeHtmlEntities){
493
                value = StringEscapeUtils.unescapeHtml(value);
494
            }
495
        	return value.trim();
496
        }else{
497
        	return null;
498
        }
499
    }
500

    
501
    /**
502
     * Fixes broken characters.
503
     * For details see
504
     * http://dev.e-taxonomy.eu/redmine/issues/6035
505
     *
506
     * @param value
507
     * @return
508
     */
509
    private String fixCharacters(String value) {
510

    
511
        value = StringUtils.replace(value, "s$K", "š");
512
        value = StringUtils.replace(value, "n$K", "ň");
513
        value = StringUtils.replace(value, "e$K", "ě");
514
        value = StringUtils.replace(value, "r$K", "ř");
515
        value = StringUtils.replace(value, "c$K", "č");
516
        value = StringUtils.replace(value, "z$K", "ž");
517
        value = StringUtils.replace(value, "S>U$K", "Š");
518
        value = StringUtils.replace(value, "C>U$K", "Č");
519
        value = StringUtils.replace(value, "R>U$K", "Ř");
520
        value = StringUtils.replace(value, "Z>U$K", "Ž");
521
        value = StringUtils.replace(value, "g$K", "ǧ");
522
        value = StringUtils.replace(value, "s$A", "ś");
523
        value = StringUtils.replace(value, "n$A", "ń");
524
        value = StringUtils.replace(value, "c$A", "ć");
525
        value = StringUtils.replace(value, "e$E", "ę");
526
        value = StringUtils.replace(value, "o$H", "õ");
527
        value = StringUtils.replace(value, "s$C", "ş");
528
        value = StringUtils.replace(value, "t$C", "ț");
529
        value = StringUtils.replace(value, "S>U$C", "Ş");
530
        value = StringUtils.replace(value, "a$O", "å");
531
        value = StringUtils.replace(value, "A>U$O", "Å");
532
        value = StringUtils.replace(value, "u$O", "ů");
533
        value = StringUtils.replace(value, "g$B", "ğ");
534
        value = StringUtils.replace(value, "g$B", "ĕ");
535
        value = StringUtils.replace(value, "a$B", "ă");
536
        value = StringUtils.replace(value, "l$/", "ł");
537
        value = StringUtils.replace(value, ">i", "ı");
538
        value = StringUtils.replace(value, "i$U", "ï");
539
        // Special-cases
540
        value = StringUtils.replace(value, "&yacute", "ý");
541
        value = StringUtils.replace(value, ">L", "Ł"); // corrected rule
542
        value = StringUtils.replace(value, "E>U$D", "З");
543
        value = StringUtils.replace(value, "S>U$E", "Ş");
544
        value = StringUtils.replace(value, "s$E", "ş");
545

    
546
        value = StringUtils.replace(value, "c$k", "č");
547
        value = StringUtils.replace(value, " U$K", " Š");
548

    
549
        return value;
550
    }
551

    
552

    
553
    /**
554
	 *  Stores taxa records in DB
555
	 */
556
	@Override
557
    protected void firstPass(SimpleExcelTaxonImportState<CONFIG> state) {
558

    
559
        String lineNumber = state.getCurrentLine() + ": ";
560
        logger.setLevel(Level.DEBUG);
561
        HashMap<String, String> record = state.getOriginalRecord();
562
        logger.debug(lineNumber + record.toString());
563

    
564
        Set<String> keys = record.keySet();
565
        for (String key: keys) {
566
            if (! expectedKeys.contains(key)){
567
                logger.warn(lineNumber + "Unexpected Key: " + key);
568
            }
569
        }
570

    
571
        String reg_id = record.get(REGISTRATIONNO_PK);
572

    
573
        //higherTaxon
574
        String higherTaxaString = record.get(HIGHERTAXON);
575
        boolean isFossil = false;
576
        if(higherTaxaString.startsWith("FOSSIL ")){
577
            higherTaxaString = higherTaxaString.replace("FOSSIL ", "");
578
            isFossil = true;
579
        }
580
        TaxonNode higherTaxon = getHigherTaxon(higherTaxaString, (IAPTImportState)state);
581

    
582
       //Taxon
583
        Taxon taxon = makeTaxon(record, state, higherTaxon, isFossil);
584
        if (taxon == null){
585
            logger.warn(lineNumber + "taxon could not be created and is null");
586
            return;
587
        }
588
        ((IAPTImportState)state).setCurrentTaxon(taxon);
589

    
590

    
591
		return;
592
    }
593

    
594
    private TaxonNode getHigherTaxon(String higherTaxaString, IAPTImportState state) {
595
        String[] higherTaxaNames = higherTaxaString.toLowerCase().replaceAll("[\\[\\]]", "").split(":");
596
        TaxonNode higherTaxonNode = null;
597

    
598
        ITaxonTreeNode rootNode = getClassificationRootNode(state);
599
        for (String htn :  higherTaxaNames) {
600
            htn = StringUtils.capitalize(htn.trim());
601
            Taxon higherTaxon = state.getHigherTaxon(htn);
602
            if (higherTaxon != null){
603
                higherTaxonNode = higherTaxon.getTaxonNodes().iterator().next();
604
            }else{
605
                BotanicalName name = makeHigherTaxonName(state, htn);
606
                Reference sec = state.getSecReference();
607
                higherTaxon = Taxon.NewInstance(name, sec);
608
                getTaxonService().save(higherTaxon);
609
                higherTaxonNode = rootNode.addChildTaxon(higherTaxon, sec, null);
610
                state.putHigherTaxon(htn, higherTaxon);
611
                getClassificationService().saveTreeNode(higherTaxonNode);
612
            }
613
            rootNode = higherTaxonNode;
614
        }
615
        return higherTaxonNode;
616
    }
617

    
618
    private BotanicalName makeHigherTaxonName(IAPTImportState state, String name) {
619

    
620
        Rank rank = guessRank(name);
621

    
622
        BotanicalName taxonName = BotanicalName.NewInstance(rank);
623
        taxonName.addSource(makeOriginalSource(state));
624
        taxonName.setGenusOrUninomial(StringUtils.capitalize(name));
625
        return taxonName;
626
    }
627

    
628
    private Rank guessRank(String name) {
629

    
630
        // normalize
631
        name = name.replaceAll("\\(.*\\)", "").trim();
632

    
633
        if(name.matches("^Plantae$|^Fungi$")){
634
           return Rank.KINGDOM();
635
        } else if(name.matches("^Incertae sedis$|^No group assigned$")){
636
           return rankFamilyIncertisSedis();
637
        } else if(name.matches(".*phyta$|.*mycota$")){
638
           return Rank.SECTION_BOTANY();
639
        } else if(name.matches(".*phytina$|.*mycotina$")){
640
           return Rank.SUBSECTION_BOTANY();
641
        } else if(name.matches("Gymnospermae$|.*ones$")){ // Monocotyledones, Dicotyledones
642
            return rankUnrankedSupraGeneric();
643
        } else if(name.matches(".*opsida$|.*phyceae$|.*mycetes$|.*ones$|^Musci$|^Hepaticae$")){
644
           return Rank.CLASS();
645
        } else if(name.matches(".*idae$|.*phycidae$|.*mycetidae$")){
646
           return Rank.SUBCLASS();
647
        } else if(name.matches(".*ales$")){
648
           return Rank.ORDER();
649
        } else if(name.matches(".*ineae$")){
650
           return Rank.SUBORDER();
651
        } else if(name.matches(".*aceae$")){
652
            return Rank.FAMILY();
653
        } else if(name.matches(".*oideae$")){
654
           return Rank.SUBFAMILY();
655
        } else
656
        //    if(name.matches(".*eae$")){
657
        //    return Rank.TRIBE();
658
        // } else
659
            if(name.matches(".*inae$")){
660
           return Rank.SUBTRIBE();
661
        } else if(name.matches(".*ae$")){
662
           return Rank.FAMILY();
663
        }
664
        return Rank.UNKNOWN_RANK();
665
    }
666

    
667
    private Rank rankUnrankedSupraGeneric() {
668

    
669
        if(rankUnrankedSupraGeneric == null){
670
            rankUnrankedSupraGeneric = Rank.NewInstance(RankClass.Suprageneric, "Unranked supra generic", " ", " ");
671
            getTermService().save(rankUnrankedSupraGeneric);
672
        }
673
        return rankUnrankedSupraGeneric;
674
    }
675

    
676
    private Rank rankFamilyIncertisSedis() {
677

    
678
        if(familyIncertisSedis == null){
679
            familyIncertisSedis = Rank.NewInstance(RankClass.Suprageneric, "Family incertis sedis", " ", " ");
680
            getTermService().save(familyIncertisSedis);
681
        }
682
        return familyIncertisSedis;
683
    }
684

    
685
    private AnnotationType annotationTypeCaveats(){
686
        if(annotationTypeCaveats == null){
687
            annotationTypeCaveats = AnnotationType.NewInstance("Caveats", "Caveats", "");
688
            getTermService().save(annotationTypeCaveats);
689
        }
690
        return annotationTypeCaveats;
691
    }
692

    
693

    
694
    /**
695
     * @param state
696
     * @return
697
     */
698
    private IdentifiableSource makeOriginalSource(IAPTImportState state) {
699
        return IdentifiableSource.NewDataImportInstance("line: " + state.getCurrentLine(), null, state.getConfig().getSourceReference());
700
    }
701

    
702

    
703
    private Reference makeReference(IAPTImportState state, UUID uuidRef) {
704
        Reference ref = state.getReference(uuidRef);
705
        if (ref == null){
706
            ref = getReferenceService().find(uuidRef);
707
            state.putReference(uuidRef, ref);
708
        }
709
        return ref;
710
    }
711

    
712
    private MarkerType markerTypeFossil(){
713
        if(this.markerTypeFossil == null){
714
            markerTypeFossil = MarkerType.NewInstance("isFossilTaxon", "isFossil", null);
715
            getTermService().save(this.markerTypeFossil);
716
        }
717
        return markerTypeFossil;
718
    }
719

    
720

    
721
}
(1-1/4)