Project

General

Profile

Download (14.8 KB) Statistics
| Branch: | Revision:
1
/**
2
 * Copyright (C) 2007 EDIT
3
 * European Distributed Institute of Taxonomy
4
 * http://www.e-taxonomy.eu
5
 *
6
 * The contents of this file are subject to the Mozilla Public License Version 1.1
7
 * See LICENSE.TXT at the top of this package for the full license terms.
8
 */
9

    
10
package eu.etaxonomy.cdm.io.iapt;
11

    
12
import eu.etaxonomy.cdm.common.CdmUtils;
13
import eu.etaxonomy.cdm.io.mexico.SimpleExcelTaxonImport;
14
import eu.etaxonomy.cdm.io.mexico.SimpleExcelTaxonImportState;
15
import eu.etaxonomy.cdm.model.common.*;
16
import eu.etaxonomy.cdm.model.name.*;
17
import eu.etaxonomy.cdm.model.reference.Reference;
18
import eu.etaxonomy.cdm.model.taxon.*;
19
import eu.etaxonomy.cdm.strategy.parser.NonViralNameParserImpl;
20
import org.apache.commons.lang.StringEscapeUtils;
21
import org.apache.commons.lang.StringUtils;
22
import org.apache.log4j.Level;
23
import org.apache.log4j.Logger;
24
import org.springframework.stereotype.Component;
25

    
26
import java.util.*;
27
import java.util.regex.Matcher;
28
import java.util.regex.Pattern;
29

    
30
/**
31
 * @author a.mueller
32
 * @created 05.01.2016
33
 */
34

    
35
@Component("iAPTExcelImport")
36
public class IAPTExcelImport<CONFIG extends IAPTImportConfigurator> extends SimpleExcelTaxonImport<CONFIG> {
37
    private static final long serialVersionUID = -747486709409732371L;
38
    private static final Logger logger = Logger.getLogger(IAPTExcelImport.class);
39
    public static final String ANNOTATION_MARKER_STRING = "[*]";
40

    
41

    
42
    private static UUID ROOT_UUID = UUID.fromString("4137fd2a-20f6-4e70-80b9-f296daf51d82");
43

    
44
    private static NonViralNameParserImpl nameParser = NonViralNameParserImpl.NewInstance();
45

    
46
    private final static String REGISTRATIONNO_PK= "RegistrationNo_Pk";
47
    private final static String HIGHERTAXON= "HigherTaxon";
48
    private final static String FULLNAME= "FullName";
49
    private final static String AUTHORSSPELLING= "AuthorsSpelling";
50
    private final static String LITSTRING= "LitString";
51
    private final static String REGISTRATION= "Registration";
52
    private final static String TYPE= "Type";
53
    private final static String CAVEATS= "Caveats";
54
    private final static String FULLBASIONYM= "FullBasionym";
55
    private final static String FULLSYNSUBST= "FullSynSubst";
56
    private final static String NOTESTXT= "NotesTxt";
57
    private final static String REGDATE= "RegDate";
58
    private final static String NAMESTRING= "NameString";
59
    private final static String BASIONYMSTRING= "BasionymString";
60
    private final static String SYNSUBSTSTR= "SynSubstStr";
61
    private final static String AUTHORSTRING= "AuthorString";
62

    
63
    private  static List<String> expectedKeys= Arrays.asList(new String[]{
64
            REGISTRATIONNO_PK, HIGHERTAXON, FULLNAME, AUTHORSSPELLING, LITSTRING, REGISTRATION, TYPE, CAVEATS, FULLBASIONYM, FULLSYNSUBST, NOTESTXT, REGDATE, NAMESTRING, BASIONYMSTRING, SYNSUBSTSTR, AUTHORSTRING});
65

    
66
    private static final Pattern nomRefTokenizeP = Pattern.compile("^(.*):\\s([^\\.:]+)\\.(.*)$");
67
    private static final Pattern nomRefPubYearExtractP = Pattern.compile("(.*?)(1[7,8,9][0-9]{2}).*$|^.*?[0-9]{1,2}([\\./])[0-1]?[0-9]\\3([0-9]{2})\\.$"); // 1700 - 1999
68

    
69
    private Taxon makeTaxon(HashMap<String, String> record, SimpleExcelTaxonImportState<CONFIG> state,
70
                            TaxonNode higherTaxonNode, boolean isSynonym) {
71

    
72
        String line = state.getCurrentLine() + ": ";
73

    
74
        String titleCacheStr = getValue(record, FULLNAME, true);
75
        String nameStr = getValue(record, NAMESTRING, true);
76
        String authorStr = getValue(record, AUTHORSTRING, true);
77
        String nomRefStr = getValue(record, LITSTRING, true);
78

    
79
        String nomRefTitle = null;
80
        String nomRefDetail = null;
81
        String nomRefPupDate = null;
82
        String nomRefPupYear = null;
83

    
84
        // preprocess nomRef: separate citation, reference detail, publishing date
85
        if(!StringUtils.isEmpty(nomRefStr)){
86
            nomRefStr = nomRefStr.trim();
87
            Matcher m = nomRefTokenizeP.matcher(nomRefStr);
88
            if(m.matches()){
89
                nomRefTitle = m.group(1);
90
                nomRefDetail = m.group(2);
91
                nomRefPupDate = m.group(3);
92

    
93
                // nomRefDetail.replaceAll("[\\:\\.\\s]", ""); // TODO integrate into nomRefTokenizeP
94
                Matcher m2 = nomRefPubYearExtractP.matcher(nomRefPupDate);
95
                if(m2.matches()){
96
                    nomRefPupYear = m2.group(2);
97
                    if(nomRefPupYear.length() == 2 ){
98
                        // it is an abbreviated year from the 19** years
99
                        nomRefPupYear = "19" + nomRefPupYear;
100
                    }
101
                    nomRefTitle = nomRefTitle + ": " + nomRefDetail + ". " + nomRefPupYear + ".";
102
                } else {
103
                    logger.warn("Pub year not found in " + nomRefStr );
104
                }
105

    
106
            } else {
107
                nomRefTitle = nomRefStr;
108
            }
109
        }
110

    
111

    
112
        BotanicalName taxonName;
113
        Map<String, AnnotationType> nameAnnotations = new HashMap<>();
114

    
115
        if(titleCacheStr.endsWith(ANNOTATION_MARKER_STRING) && authorStr.endsWith(ANNOTATION_MARKER_STRING)){
116
            nameAnnotations.put("Author abbreviation not checked.", AnnotationType.EDITORIAL());
117
            titleCacheStr = titleCacheStr.replace(ANNOTATION_MARKER_STRING, "").trim();
118
            authorStr = authorStr.replace(ANNOTATION_MARKER_STRING, "").trim();
119
        }
120

    
121
        if(!StringUtils.isEmpty(nomRefTitle)){
122
            String referenceSeparator = nomRefTitle.startsWith("in ") ? " " : ", ";
123
            String taxonFullNameStr = titleCacheStr + referenceSeparator + nomRefTitle;
124
            logger.debug(":::::" + taxonFullNameStr);
125
            taxonName = (BotanicalName) nameParser.parseReferencedName(taxonFullNameStr, NomenclaturalCode.ICNAFP, null);
126
        } else {
127
            taxonName = (BotanicalName) nameParser.parseFullName(titleCacheStr, NomenclaturalCode.ICNAFP, null);
128
        }
129

    
130
        if (taxonName.isProtectedTitleCache()) {
131
            logger.warn(line + "Name could not be parsed: " + titleCacheStr);
132
        } else {
133

    
134
            boolean doRestoreTitleCacheStr = false;
135
            // Check titleCache
136
            String generatedTitleCache = taxonName.getTitleCache();
137
            if (!generatedTitleCache.trim().equals(titleCacheStr)) {
138
                logger.warn(line + "The generated titleCache differs from the imported string : " + generatedTitleCache + " <> " + titleCacheStr + " will restore original titleCacheStr");
139
                doRestoreTitleCacheStr = true;
140
            }
141
            // Check Name
142
            if (!taxonName.getNameCache().trim().equals(nameStr)) {
143
                logger.warn(line + "parsed nameCache differs from " + NAMESTRING + " : " + taxonName.getNameCache() + " <> " + nameStr);
144
            }
145

    
146
            //  Author
147
            //nameParser.handleAuthors(taxonName, titleCacheStr, authorStr);
148
            //if (!titleCacheStr.equals(taxonName.getTitleCache())) {
149
            //    logger.warn(line + "titleCache has changed after setting authors, will restore original titleCacheStr");
150
            //    doRestoreTitleCacheStr = true;
151
            //}
152

    
153
            if(doRestoreTitleCacheStr){
154
                taxonName.setTitleCache(titleCacheStr, true);
155
            }
156

    
157
            // deduplicate
158
            replaceAuthorNamesAndNomRef(state, taxonName);
159

    
160
            // Annotations
161
            if(!nameAnnotations.isEmpty()){
162
                for(String text : nameAnnotations.keySet()){
163
                    taxonName.addAnnotation(Annotation.NewInstance(text, nameAnnotations.get(text), Language.DEFAULT()));
164
                }
165
                getNameService().save(taxonName);
166
            }
167
        }
168

    
169
        Reference sec = state.getConfig().getSecReference();
170
        Taxon taxon = Taxon.NewInstance(taxonName, sec);
171
        getTaxonService().save(taxon);
172
        if(higherTaxonNode != null){
173
            higherTaxonNode.addChildTaxon(taxon, null, null);
174
            getTaxonNodeService().save(higherTaxonNode);
175
        }
176

    
177
        return taxon;
178

    
179
    }
180

    
181
    /**
182
     * @param state
183
     * @return
184
     */
185
    private TaxonNode getClassification(IAPTImportState state) {
186

    
187
        Classification classification = state.getClassification();
188
        if (classification == null){
189
            IAPTImportConfigurator config = state.getConfig();
190
            classification = Classification.NewInstance(state.getConfig().getClassificationName());
191
            classification.setUuid(config.getClassificationUuid());
192
            classification.setReference(config.getSecReference());
193
            classification = getClassificationService().find(state.getConfig().getClassificationUuid());
194
        }
195
        TaxonNode rootNode = state.getRootNode();
196
        if (rootNode == null){
197
            rootNode = getTaxonNodeService().find(ROOT_UUID);
198
        }
199
        if (rootNode == null){
200
            Reference sec = state.getSecReference();
201
            if (classification == null){
202
                String classificationName = state.getConfig().getClassificationName();
203
                //TODO
204
                Language language = Language.DEFAULT();
205
                classification = Classification.NewInstance(classificationName, sec, language);
206
                state.setClassification(classification);
207
                classification.setUuid(state.getConfig().getClassificationUuid());
208
                classification.getRootNode().setUuid(ROOT_UUID);
209
            }
210

    
211
            getClassificationService().save(classification);
212
            rootNode = classification.getRootNode();
213
        }
214
        return rootNode;
215
    }
216

    
217

    
218
    /**
219
     * @param record
220
     * @param originalKey
221
     * @param doUnescapeHtmlEntities
222
     * @return
223
     */
224
    private String getValue(HashMap<String, String> record, String originalKey, boolean doUnescapeHtmlEntities) {
225
        String value = record.get(originalKey);
226
        if (! StringUtils.isBlank(value)) {
227
        	if (logger.isDebugEnabled()) {
228
        	    logger.debug(originalKey + ": " + value);
229
        	}
230
        	value = CdmUtils.removeDuplicateWhitespace(value.trim()).toString();
231
            if(doUnescapeHtmlEntities){
232
                value = StringEscapeUtils.unescapeHtml(value);
233
            }
234
        	return value.trim();
235
        }else{
236
        	return null;
237
        }
238
    }
239

    
240

    
241

    
242
	/**
243
	 *  Stores taxa records in DB
244
	 */
245
	@Override
246
    protected void firstPass(SimpleExcelTaxonImportState<CONFIG> state) {
247

    
248
	    boolean isSynonymOnly = false;
249

    
250
        String line = state.getCurrentLine() + ": ";
251
        logger.setLevel(Level.DEBUG);
252
        HashMap<String, String> record = state.getOriginalRecord();
253
        logger.debug(record.toString());
254

    
255
        Set<String> keys = record.keySet();
256
        for (String key: keys) {
257
            if (! expectedKeys.contains(key)){
258
                logger.warn(line + "Unexpected Key: " + key);
259
            }
260
        }
261

    
262
        String reg_id = record.get(REGISTRATIONNO_PK);
263
        //higherTaxon
264
        TaxonNode higherTaxon = getHigherTaxon(record, (IAPTImportState)state);
265

    
266
       //Taxon
267
        Taxon taxon = makeTaxon(record, state, higherTaxon, isSynonymOnly);
268
        if (taxon == null && ! isSynonymOnly){
269
            logger.warn(line + "taxon could not be created and is null");
270
            return;
271
        }
272
        ((IAPTImportState)state).setCurrentTaxon(taxon);
273

    
274
        //(Notas)
275
        //makeNotes(record, state);
276

    
277
        //Syn.
278
        //makeSynonyms(record, state, !isSynonymOnly);
279

    
280

    
281
		return;
282
    }
283

    
284
    private TaxonNode getHigherTaxon(HashMap<String, String> record, IAPTImportState state) {
285
        String higherTaxaString = record.get(HIGHERTAXON);
286
        // higherTaxaString is like
287
        // - DICOTYLEDONES: LEGUMINOSAE: MIMOSOIDEAE
288
        // - FOSSIL DICOTYLEDONES: PROTEACEAE
289
        // - [fungi]
290
        // - [no group assigned]
291
        if(higherTaxaString.equals("[no group assigned]")){
292
            return null;
293
        }
294
        String[] higherTaxaNames = higherTaxaString.toLowerCase().replaceAll("[\\[\\]]", "").split(":");
295
        TaxonNode higherTaxonNode = null;
296

    
297
        ITaxonTreeNode rootNode = getClassification(state);
298
        for (String htn :  higherTaxaNames) {
299
            htn = StringUtils.capitalize(htn.trim());
300
            Taxon higherTaxon = state.getHigherTaxon(htn);
301
            if (higherTaxon != null){
302
                higherTaxonNode = higherTaxon.getTaxonNodes().iterator().next();
303
            }else{
304
                BotanicalName name = makeHigherTaxonName(state, htn);
305
                Reference sec = state.getSecReference();
306
                higherTaxon = Taxon.NewInstance(name, sec);
307
                getTaxonService().save(higherTaxon);
308
                higherTaxonNode = rootNode.addChildTaxon(higherTaxon, sec, null);
309
                state.putHigherTaxon(htn, higherTaxon);
310
                getClassificationService().saveTreeNode(higherTaxonNode);
311
            }
312
            rootNode = higherTaxonNode;
313
        }
314
        return higherTaxonNode;
315
    }
316

    
317
    private BotanicalName makeHigherTaxonName(IAPTImportState state, String name) {
318

    
319
        Rank rank = guessRank(name);
320

    
321
        BotanicalName taxonName = BotanicalName.NewInstance(rank);
322
        taxonName.addSource(makeOriginalSource(state));
323
        taxonName.setGenusOrUninomial(StringUtils.capitalize(name));
324
        return taxonName;
325
    }
326

    
327
    private Rank guessRank(String name) {
328

    
329
        // normalize
330
        name = name.replaceAll("\\(.*\\)", "").trim();
331

    
332
        if(name.matches("^Plantae$|^Fungi$|^Musci$")){
333
           return Rank.KINGDOM();
334
        } else if(name.matches(".*incertae sedis$|^Fossil no group assigned$")){
335
           return Rank.FAMILY();
336
        } else if(name.matches(".*phyta$|.*mycota$")){
337
           return Rank.SECTION_BOTANY();
338
        } else if(name.matches(".*phytina$|.*mycotina$")){
339
           return Rank.SUBSECTION_BOTANY();
340
        } else if(name.matches(".*opsida$|.*phyceae$|.*mycetes$|.*ones$")){
341
           return Rank.CLASS();
342
        } else if(name.matches(".*idae$|.*phycidae$|.*mycetidae$")){
343
           return Rank.SUBCLASS();
344
        } else if(name.matches(".*ales$")){
345
           return Rank.ORDER();
346
        } else if(name.matches(".*ineae$")){
347
           return Rank.SUBORDER();
348
        } else if(name.matches(".*oideae$")){
349
           return Rank.SUBFAMILY();
350
        } else if(name.matches(".*eae$")){
351
           return Rank.TRIBE();
352
        } else if(name.matches(".*inae$")){
353
           return Rank.SUBTRIBE();
354
        } else if(name.matches(".*ae$")){
355
           return Rank.FAMILY();
356
        }
357
        return Rank.UNKNOWN_RANK();
358
    }
359

    
360

    
361
    /**
362
     * @param state
363
     * @return
364
     */
365
    private IdentifiableSource makeOriginalSource(IAPTImportState state) {
366
        return IdentifiableSource.NewDataImportInstance("line: " + state.getCurrentLine(), null, state.getConfig().getSourceReference());
367
    }
368

    
369

    
370
    private Reference makeReference(IAPTImportState state, UUID uuidRef) {
371
        Reference ref = state.getReference(uuidRef);
372
        if (ref == null){
373
            ref = getReferenceService().find(uuidRef);
374
            state.putReference(uuidRef, ref);
375
        }
376
        return ref;
377
    }
378

    
379

    
380

    
381
}
(1-1/4)