Project

General

Profile

Download (20 KB) Statistics
| Branch: | Revision:
1
/**
2
 * Copyright (C) 2007 EDIT
3
 * European Distributed Institute of Taxonomy
4
 * http://www.e-taxonomy.eu
5
 *
6
 * The contents of this file are subject to the Mozilla Public License Version 1.1
7
 * See LICENSE.TXT at the top of this package for the full license terms.
8
 */
9

    
10
package eu.etaxonomy.cdm.io.iapt;
11

    
12
import eu.etaxonomy.cdm.common.CdmUtils;
13
import eu.etaxonomy.cdm.io.mexico.SimpleExcelTaxonImport;
14
import eu.etaxonomy.cdm.io.mexico.SimpleExcelTaxonImportState;
15
import eu.etaxonomy.cdm.model.common.*;
16
import eu.etaxonomy.cdm.model.name.*;
17
import eu.etaxonomy.cdm.model.reference.Reference;
18
import eu.etaxonomy.cdm.model.taxon.*;
19
import eu.etaxonomy.cdm.strategy.parser.NonViralNameParserImpl;
20
import org.apache.commons.lang.ArrayUtils;
21
import org.apache.commons.lang.StringEscapeUtils;
22
import org.apache.commons.lang.StringUtils;
23
import org.apache.log4j.Level;
24
import org.apache.log4j.Logger;
25
import org.springframework.stereotype.Component;
26

    
27
import java.util.*;
28
import java.util.regex.Matcher;
29
import java.util.regex.Pattern;
30

    
31
/**
32
 * @author a.mueller
33
 * @created 05.01.2016
34
 */
35

    
36
@Component("iAPTExcelImport")
37
public class IAPTExcelImport<CONFIG extends IAPTImportConfigurator> extends SimpleExcelTaxonImport<CONFIG> {
38
    private static final long serialVersionUID = -747486709409732371L;
39
    private static final Logger logger = Logger.getLogger(IAPTExcelImport.class);
40
    public static final String ANNOTATION_MARKER_STRING = "[*]";
41

    
42

    
43
    private static UUID ROOT_UUID = UUID.fromString("4137fd2a-20f6-4e70-80b9-f296daf51d82");
44

    
45
    private static NonViralNameParserImpl nameParser = NonViralNameParserImpl.NewInstance();
46

    
47
    private final static String REGISTRATIONNO_PK= "RegistrationNo_Pk";
48
    private final static String HIGHERTAXON= "HigherTaxon";
49
    private final static String FULLNAME= "FullName";
50
    private final static String AUTHORSSPELLING= "AuthorsSpelling";
51
    private final static String LITSTRING= "LitString";
52
    private final static String REGISTRATION= "Registration";
53
    private final static String TYPE= "Type";
54
    private final static String CAVEATS= "Caveats";
55
    private final static String FULLBASIONYM= "FullBasionym";
56
    private final static String FULLSYNSUBST= "FullSynSubst";
57
    private final static String NOTESTXT= "NotesTxt";
58
    private final static String REGDATE= "RegDate";
59
    private final static String NAMESTRING= "NameString";
60
    private final static String BASIONYMSTRING= "BasionymString";
61
    private final static String SYNSUBSTSTR= "SynSubstStr";
62
    private final static String AUTHORSTRING= "AuthorString";
63

    
64
    private  static List<String> expectedKeys= Arrays.asList(new String[]{
65
            REGISTRATIONNO_PK, HIGHERTAXON, FULLNAME, AUTHORSSPELLING, LITSTRING, REGISTRATION, TYPE, CAVEATS, FULLBASIONYM, FULLSYNSUBST, NOTESTXT, REGDATE, NAMESTRING, BASIONYMSTRING, SYNSUBSTSTR, AUTHORSTRING});
66

    
67
    private static final Pattern nomRefTokenizeP = Pattern.compile("^(.*):\\s([^\\.:]+)\\.(.*)$");
68
    private static final Pattern nomRefPubYearExtractP = Pattern.compile("(.*?)(1[7,8,9][0-9]{2}).*$|^.*?[0-9]{1,2}([\\./])[0-1]?[0-9]\\3([0-9]{2})\\.$"); // 1700 - 1999
69

    
70
    private MarkerType markerTypeFossil = null;
71
    private Rank rankUnrankedSupraGeneric = null;
72
    private Rank familyIncertisSedis = null;
73
    private AnnotationType annotationTypeCaveats = null;
74

    
75
    private Taxon makeTaxon(HashMap<String, String> record, SimpleExcelTaxonImportState<CONFIG> state,
76
                            TaxonNode higherTaxonNode, boolean isSynonym, boolean isFossil) {
77

    
78
        String line = state.getCurrentLine() + ": ";
79

    
80
        String titleCacheStr = getValue(record, FULLNAME, true);
81
        String nameStr = getValue(record, NAMESTRING, true);
82
        String authorStr = getValue(record, AUTHORSTRING, true);
83
        String nomRefStr = getValue(record, LITSTRING, true);
84
        String authorsSpelling = getValue(record, AUTHORSSPELLING, true);
85
        String notesTxt = getValue(record, NOTESTXT, true);
86
        String caveats = getValue(record, CAVEATS, true);
87

    
88
        String nomRefTitle = null;
89
        String nomRefDetail = null;
90
        String nomRefPupDate = null;
91
        String nomRefPupYear = null;
92

    
93
        // preprocess nomRef: separate citation, reference detail, publishing date
94
        if(!StringUtils.isEmpty(nomRefStr)){
95
            nomRefStr = nomRefStr.trim();
96
            Matcher m = nomRefTokenizeP.matcher(nomRefStr);
97
            if(m.matches()){
98
                nomRefTitle = m.group(1);
99
                nomRefDetail = m.group(2);
100
                nomRefPupDate = m.group(3);
101

    
102
                // nomRefDetail.replaceAll("[\\:\\.\\s]", ""); // TODO integrate into nomRefTokenizeP
103
                Matcher m2 = nomRefPubYearExtractP.matcher(nomRefPupDate);
104
                if(m2.matches()){
105
                    nomRefPupYear = m2.group(2);
106
                    if(nomRefPupYear == null){
107
                        nomRefPupYear = m2.group(4);
108
                    }
109
                    if(nomRefPupYear == null){
110
                        logger.error("nomRefPupYear in " + nomRefStr + " is  NULL" );
111
                    }
112
                    if(nomRefPupYear.length() == 2 ){
113
                        // it is an abbreviated year from the 19** years
114
                        nomRefPupYear = "19" + nomRefPupYear;
115
                    }
116
                    nomRefTitle = nomRefTitle + ": " + nomRefDetail + ". " + nomRefPupYear + ".";
117
                } else {
118
                    logger.warn("Pub year not found in " + nomRefStr );
119
                    // FIXME in in J. Eur. Orchideen 30: 128. 30.09.97 (Vorabdr.).
120

    
121
                }
122

    
123
            } else {
124
                nomRefTitle = nomRefStr;
125
            }
126
        }
127

    
128
        BotanicalName taxonName;
129
        // cache field for the taxonName.titleCache
130
        String taxonNameTitleCache = null;
131
        Map<String, AnnotationType> nameAnnotations = new HashMap<>();
132

    
133
        // TitleCache preprocessing
134
        if(titleCacheStr.endsWith(ANNOTATION_MARKER_STRING) || (authorStr != null && authorStr.endsWith(ANNOTATION_MARKER_STRING))){
135
            nameAnnotations.put("Author abbreviation not checked.", AnnotationType.EDITORIAL());
136
            titleCacheStr = titleCacheStr.replace(ANNOTATION_MARKER_STRING, "").trim();
137
            authorStr = authorStr.replace(ANNOTATION_MARKER_STRING, "").trim();
138
        }
139

    
140
        // parse the full taxon name
141
        if(!StringUtils.isEmpty(nomRefTitle)){
142
            String referenceSeparator = nomRefTitle.startsWith("in ") ? " " : ", ";
143
            String taxonFullNameStr = titleCacheStr + referenceSeparator + nomRefTitle;
144
            logger.debug(":::::" + taxonFullNameStr);
145
            taxonName = (BotanicalName) nameParser.parseReferencedName(taxonFullNameStr, NomenclaturalCode.ICNAFP, null);
146
        } else {
147
            taxonName = (BotanicalName) nameParser.parseFullName(titleCacheStr, NomenclaturalCode.ICNAFP, null);
148
        }
149

    
150
        taxonNameTitleCache = taxonName.getTitleCache().trim();
151
        if (taxonName.isProtectedTitleCache()) {
152
            logger.warn(line + "Name could not be parsed: " + titleCacheStr);
153
        } else {
154

    
155
            boolean doRestoreTitleCacheStr = false;
156

    
157
            // Check if titleCache and nameCache are plausible
158
            String titleCacheCompareStr = titleCacheStr;
159
            String nameCache = taxonName.getNameCache();
160
            String nameCompareStr = nameStr;
161
            if(taxonName.isBinomHybrid()){
162
                titleCacheCompareStr = titleCacheCompareStr.replace(" x ", " ×");
163
                nameCompareStr = nameCompareStr.replace(" x ", " ×");
164
            }
165
            if(taxonName.isMonomHybrid()){
166
                titleCacheCompareStr = titleCacheCompareStr.replaceAll("^X ", "× ");
167
                nameCompareStr = nameCompareStr.replace("^X ", "× ");
168
            }
169
            if(authorStr.contains(" et ")){
170
                titleCacheCompareStr = titleCacheCompareStr.replaceAll(" et ", " & ");
171
            }
172
            if (!taxonNameTitleCache.equals(titleCacheCompareStr)) {
173
                logger.warn(line + "The generated titleCache differs from the imported string : " + taxonNameTitleCache + " <> " + titleCacheStr + " will restore original titleCacheStr");
174
                doRestoreTitleCacheStr = true;
175
            }
176
            if (!nameCache.trim().equals(nameCompareStr)) {
177
                logger.warn(line + "The parsed nameCache differs from " + NAMESTRING + " : " + nameCache + " <> " + nameCompareStr);
178
            }
179

    
180
            //  Author
181
            //nameParser.handleAuthors(taxonName, titleCacheStr, authorStr);
182
            //if (!titleCacheStr.equals(taxonName.getTitleCache())) {
183
            //    logger.warn(line + "titleCache has changed after setting authors, will restore original titleCacheStr");
184
            //    doRestoreTitleCacheStr = true;
185
            //}
186

    
187
            if(doRestoreTitleCacheStr){
188
                taxonName.setTitleCache(titleCacheStr, true);
189
            }
190

    
191
            // deduplicate
192
            replaceAuthorNamesAndNomRef(state, taxonName);
193
        }
194

    
195
        // Annotations
196
        if(!nameAnnotations.isEmpty()){
197
            for(String text : nameAnnotations.keySet()){
198
                taxonName.addAnnotation(Annotation.NewInstance(text, nameAnnotations.get(text), Language.DEFAULT()));
199
            }
200
            getNameService().save(taxonName);
201
        }
202
        if(!StringUtils.isEmpty(notesTxt)){
203
            notesTxt = notesTxt.replace("Notes: ", "").trim();
204
            taxonName.addAnnotation(Annotation.NewInstance(notesTxt, AnnotationType.EDITORIAL(), Language.DEFAULT()));
205
        }
206
        if(!StringUtils.isEmpty(caveats)){
207
            caveats = caveats.replace("Caveats: ", "").trim();
208
            taxonName.addAnnotation(Annotation.NewInstance(caveats, annotationTypeCaveats(), Language.DEFAULT()));
209
        }
210
        //
211

    
212
        // Namerelations
213
        if(!StringUtils.isEmpty(authorsSpelling)){
214
            authorsSpelling = authorsSpelling.replaceFirst("Author's spelling:", "").replaceAll("\"", "").trim();
215

    
216
            String[] authorSpellingTokens = StringUtils.split(authorsSpelling, " ");
217
            String[] nameStrTokens = StringUtils.split(nameStr, " ");
218

    
219
            ArrayUtils.reverse(authorSpellingTokens);
220
            ArrayUtils.reverse(nameStrTokens);
221

    
222
            for (int i = 0; i < nameStrTokens.length; i++){
223
                if(i < authorSpellingTokens.length){
224
                    nameStrTokens[i] = authorSpellingTokens[i];
225
                }
226
            }
227
            ArrayUtils.reverse(nameStrTokens);
228

    
229
            String misspelledNameStr = StringUtils.join (nameStrTokens, ' ');
230
            // build the fullnameString of the misspelled name
231
            misspelledNameStr = taxonNameTitleCache.replace(nameStr, misspelledNameStr);
232

    
233
            TaxonNameBase misspelledName = (BotanicalName) nameParser.parseReferencedName(misspelledNameStr, NomenclaturalCode.ICNAFP, null);
234
            misspelledName.addRelationshipToName(taxonName, NameRelationshipType.MISSPELLING(), null);
235
            getNameService().save(misspelledName);
236
        }
237

    
238
        Reference sec = state.getConfig().getSecReference();
239
        Taxon taxon = Taxon.NewInstance(taxonName, sec);
240

    
241
        // Markers
242
        if(isFossil){
243
            taxon.addMarker(Marker.NewInstance(markerTypeFossil(), true));
244
        }
245

    
246
        getTaxonService().save(taxon);
247
        if(higherTaxonNode != null){
248
            higherTaxonNode.addChildTaxon(taxon, null, null);
249
            getTaxonNodeService().save(higherTaxonNode);
250
        }
251

    
252
        return taxon;
253

    
254
    }
255

    
256
    /**
257
     * @param state
258
     * @return
259
     */
260
    private TaxonNode getClassificationRootNode(IAPTImportState state) {
261

    
262
     //   Classification classification = state.getClassification();
263
     //   if (classification == null){
264
     //       IAPTImportConfigurator config = state.getConfig();
265
     //       classification = Classification.NewInstance(state.getConfig().getClassificationName());
266
     //       classification.setUuid(config.getClassificationUuid());
267
     //       classification.setReference(config.getSecReference());
268
     //       classification = getClassificationService().find(state.getConfig().getClassificationUuid());
269
     //   }
270
        TaxonNode rootNode = state.getRootNode();
271
        if (rootNode == null){
272
            rootNode = getTaxonNodeService().find(ROOT_UUID);
273
        }
274
        if (rootNode == null){
275
            Classification classification = state.getClassification();
276
            if (classification == null){
277
                Reference sec = state.getSecReference();
278
                String classificationName = state.getConfig().getClassificationName();
279
                Language language = Language.DEFAULT();
280
                classification = Classification.NewInstance(classificationName, sec, language);
281
                state.setClassification(classification);
282
                classification.setUuid(state.getConfig().getClassificationUuid());
283
                classification.getRootNode().setUuid(ROOT_UUID);
284
                getClassificationService().save(classification);
285
            }
286
            rootNode = classification.getRootNode();
287
            state.setRootNode(rootNode);
288
        }
289
        return rootNode;
290
    }
291

    
292

    
293
    /**
294
     * @param record
295
     * @param originalKey
296
     * @param doUnescapeHtmlEntities
297
     * @return
298
     */
299
    private String getValue(HashMap<String, String> record, String originalKey, boolean doUnescapeHtmlEntities) {
300
        String value = record.get(originalKey);
301
        if (! StringUtils.isBlank(value)) {
302
        	if (logger.isDebugEnabled()) {
303
        	    logger.debug(originalKey + ": " + value);
304
        	}
305
        	value = CdmUtils.removeDuplicateWhitespace(value.trim()).toString();
306
            if(doUnescapeHtmlEntities){
307
                value = StringEscapeUtils.unescapeHtml(value);
308
            }
309
        	return value.trim();
310
        }else{
311
        	return null;
312
        }
313
    }
314

    
315

    
316

    
317
	/**
318
	 *  Stores taxa records in DB
319
	 */
320
	@Override
321
    protected void firstPass(SimpleExcelTaxonImportState<CONFIG> state) {
322

    
323
	    boolean isSynonymOnly = false;
324

    
325
        String lineNumber = state.getCurrentLine() + ": ";
326
        logger.setLevel(Level.DEBUG);
327
        HashMap<String, String> record = state.getOriginalRecord();
328
        logger.debug(lineNumber + record.toString());
329

    
330
        Set<String> keys = record.keySet();
331
        for (String key: keys) {
332
            if (! expectedKeys.contains(key)){
333
                logger.warn(lineNumber + "Unexpected Key: " + key);
334
            }
335
        }
336

    
337
        String reg_id = record.get(REGISTRATIONNO_PK);
338

    
339
        //higherTaxon
340
        String higherTaxaString = record.get(HIGHERTAXON);
341
        boolean isFossil = false;
342
        if(higherTaxaString.startsWith("FOSSIL ")){
343
            higherTaxaString = higherTaxaString.replace("FOSSIL ", "");
344
            isFossil = true;
345
        }
346
        TaxonNode higherTaxon = getHigherTaxon(higherTaxaString, (IAPTImportState)state);
347

    
348
       //Taxon
349
        Taxon taxon = makeTaxon(record, state, higherTaxon, isSynonymOnly, isFossil);
350
        if (taxon == null && ! isSynonymOnly){
351
            logger.warn(lineNumber + "taxon could not be created and is null");
352
            return;
353
        }
354
        ((IAPTImportState)state).setCurrentTaxon(taxon);
355

    
356
        //Syn.
357
        //makeSynonyms(record, state, !isSynonymOnly);
358

    
359

    
360
		return;
361
    }
362

    
363
    private TaxonNode getHigherTaxon(String higherTaxaString, IAPTImportState state) {
364

    
365
        // higherTaxaString is like
366
        // - DICOTYLEDONES: LEGUMINOSAE: MIMOSOIDEAE
367
        // - FOSSIL DICOTYLEDONES: PROTEACEAE
368
        // - [fungi]
369
        // - [no group assigned]
370
        if(higherTaxaString.equals("[no group assigned]")){
371
            return null;
372
        }
373
        String[] higherTaxaNames = higherTaxaString.toLowerCase().replaceAll("[\\[\\]]", "").split(":");
374
        TaxonNode higherTaxonNode = null;
375

    
376
        ITaxonTreeNode rootNode = getClassificationRootNode(state);
377
        for (String htn :  higherTaxaNames) {
378
            htn = StringUtils.capitalize(htn.trim());
379
            Taxon higherTaxon = state.getHigherTaxon(htn);
380
            if (higherTaxon != null){
381
                higherTaxonNode = higherTaxon.getTaxonNodes().iterator().next();
382
            }else{
383
                BotanicalName name = makeHigherTaxonName(state, htn);
384
                Reference sec = state.getSecReference();
385
                higherTaxon = Taxon.NewInstance(name, sec);
386
                getTaxonService().save(higherTaxon);
387
                higherTaxonNode = rootNode.addChildTaxon(higherTaxon, sec, null);
388
                state.putHigherTaxon(htn, higherTaxon);
389
                getClassificationService().saveTreeNode(higherTaxonNode);
390
            }
391
            rootNode = higherTaxonNode;
392
        }
393
        return higherTaxonNode;
394
    }
395

    
396
    private BotanicalName makeHigherTaxonName(IAPTImportState state, String name) {
397

    
398
        Rank rank = guessRank(name);
399

    
400
        BotanicalName taxonName = BotanicalName.NewInstance(rank);
401
        taxonName.addSource(makeOriginalSource(state));
402
        taxonName.setGenusOrUninomial(StringUtils.capitalize(name));
403
        return taxonName;
404
    }
405

    
406
    private Rank guessRank(String name) {
407

    
408
        // normalize
409
        name = name.replaceAll("\\(.*\\)", "").trim();
410

    
411
        if(name.matches("^Plantae$|^Fungi$")){
412
           return Rank.KINGDOM();
413
        } else if(name.matches("^Incertae sedis$|^No group assigned$")){
414
           return rankFamilyIncertisSedis();
415
        } else if(name.matches(".*phyta$|.*mycota$")){
416
           return Rank.SECTION_BOTANY();
417
        } else if(name.matches(".*phytina$|.*mycotina$")){
418
           return Rank.SUBSECTION_BOTANY();
419
        } else if(name.matches("Gymnospermae$|.*ones$")){ // Monocotyledones, Dicotyledones
420
            return rankUnrankedSupraGeneric();
421
        } else if(name.matches(".*opsida$|.*phyceae$|.*mycetes$|.*ones$|^Musci$|^Hepaticae$")){
422
           return Rank.CLASS();
423
        } else if(name.matches(".*idae$|.*phycidae$|.*mycetidae$")){
424
           return Rank.SUBCLASS();
425
        } else if(name.matches(".*ales$")){
426
           return Rank.ORDER();
427
        } else if(name.matches(".*ineae$")){
428
           return Rank.SUBORDER();
429
        } else if(name.matches(".*aceae$")){
430
            return Rank.FAMILY();
431
        } else if(name.matches(".*oideae$")){
432
           return Rank.SUBFAMILY();
433
        } else
434
        //    if(name.matches(".*eae$")){
435
        //    return Rank.TRIBE();
436
        // } else
437
            if(name.matches(".*inae$")){
438
           return Rank.SUBTRIBE();
439
        } else if(name.matches(".*ae$")){
440
           return Rank.FAMILY();
441
        }
442
        return Rank.UNKNOWN_RANK();
443
    }
444

    
445
    private Rank rankUnrankedSupraGeneric() {
446

    
447
        if(rankUnrankedSupraGeneric == null){
448
            rankUnrankedSupraGeneric = Rank.NewInstance(RankClass.Suprageneric, "Unranked supra generic", " ", " ");
449
            getTermService().save(rankUnrankedSupraGeneric);
450
        }
451
        return rankUnrankedSupraGeneric;
452
    }
453

    
454
    private Rank rankFamilyIncertisSedis() {
455

    
456
        if(familyIncertisSedis == null){
457
            familyIncertisSedis = Rank.NewInstance(RankClass.Suprageneric, "Family incertis sedis", " ", " ");
458
            getTermService().save(familyIncertisSedis);
459
        }
460
        return familyIncertisSedis;
461
    }
462

    
463
    private AnnotationType annotationTypeCaveats(){
464
        if(annotationTypeCaveats == null){
465
            annotationTypeCaveats = AnnotationType.NewInstance("Caveats", "Caveats", "");
466
            getTermService().save(annotationTypeCaveats);
467
        }
468
        return annotationTypeCaveats;
469
    }
470

    
471

    
472
    /**
473
     * @param state
474
     * @return
475
     */
476
    private IdentifiableSource makeOriginalSource(IAPTImportState state) {
477
        return IdentifiableSource.NewDataImportInstance("line: " + state.getCurrentLine(), null, state.getConfig().getSourceReference());
478
    }
479

    
480

    
481
    private Reference makeReference(IAPTImportState state, UUID uuidRef) {
482
        Reference ref = state.getReference(uuidRef);
483
        if (ref == null){
484
            ref = getReferenceService().find(uuidRef);
485
            state.putReference(uuidRef, ref);
486
        }
487
        return ref;
488
    }
489

    
490
    private MarkerType markerTypeFossil(){
491
        if(this.markerTypeFossil == null){
492
            markerTypeFossil = MarkerType.NewInstance("isFossilTaxon", "isFossil", null);
493
            getTermService().save(this.markerTypeFossil);
494
        }
495
        return markerTypeFossil;
496
    }
497

    
498

    
499

    
500
}
(1-1/4)