Project

General

Profile

Download (40.9 KB) Statistics
| Branch: | Revision:
1
/**
 * Copyright (C) 2007 EDIT
 * European Distributed Institute of Taxonomy
 * http://www.e-taxonomy.eu
 *
 * The contents of this file are subject to the Mozilla Public License Version 1.1
 * See LICENSE.TXT at the top of this package for the full license terms.
 */
9

    
10
package eu.etaxonomy.cdm.io.iapt;
11

    
12
import eu.etaxonomy.cdm.api.facade.DerivedUnitFacade;
13
import eu.etaxonomy.cdm.common.CdmUtils;
14
import eu.etaxonomy.cdm.io.mexico.SimpleExcelTaxonImport;
15
import eu.etaxonomy.cdm.io.mexico.SimpleExcelTaxonImportState;
16
import eu.etaxonomy.cdm.model.agent.Institution;
17
import eu.etaxonomy.cdm.model.common.*;
18
import eu.etaxonomy.cdm.model.name.*;
19
import eu.etaxonomy.cdm.model.occurrence.*;
20
import eu.etaxonomy.cdm.model.occurrence.Collection;
21
import eu.etaxonomy.cdm.model.reference.Reference;
22
import eu.etaxonomy.cdm.model.taxon.*;
23
import eu.etaxonomy.cdm.strategy.parser.NonViralNameParserImpl;
24
import org.apache.commons.lang.ArrayUtils;
25
import org.apache.commons.lang.StringEscapeUtils;
26
import org.apache.commons.lang.StringUtils;
27
import org.apache.log4j.Level;
28
import org.apache.log4j.Logger;
29
import org.joda.time.DateTimeFieldType;
30
import org.joda.time.Partial;
31
import org.joda.time.format.DateTimeFormat;
32
import org.joda.time.format.DateTimeFormatter;
33
import org.springframework.stereotype.Component;
34

    
35
import java.util.*;
36
import java.util.regex.Matcher;
37
import java.util.regex.Pattern;
38

    
39
/**
40
 * @author a.mueller
41
 * @created 05.01.2016
42
 */
43

    
44
@Component("iAPTExcelImport")
45
public class IAPTExcelImport<CONFIG extends IAPTImportConfigurator> extends SimpleExcelTaxonImport<CONFIG> {
46
    private static final long serialVersionUID = -747486709409732371L;
47
    private static final Logger logger = Logger.getLogger(IAPTExcelImport.class);
48
    public static final String ANNOTATION_MARKER_STRING = "[*]";
49

    
50

    
51
    private static UUID ROOT_UUID = UUID.fromString("4137fd2a-20f6-4e70-80b9-f296daf51d82");
52

    
53
    private static NonViralNameParserImpl nameParser = NonViralNameParserImpl.NewInstance();
54

    
55
    private final static String REGISTRATIONNO_PK= "RegistrationNo_Pk";
56
    private final static String HIGHERTAXON= "HigherTaxon";
57
    private final static String FULLNAME= "FullName";
58
    private final static String AUTHORSSPELLING= "AuthorsSpelling";
59
    private final static String LITSTRING= "LitString";
60
    private final static String REGISTRATION= "Registration";
61
    private final static String TYPE= "Type";
62
    private final static String CAVEATS= "Caveats";
63
    private final static String FULLBASIONYM= "FullBasionym";
64
    private final static String FULLSYNSUBST= "FullSynSubst";
65
    private final static String NOTESTXT= "NotesTxt";
66
    private final static String REGDATE= "RegDate";
67
    private final static String NAMESTRING= "NameString";
68
    private final static String BASIONYMSTRING= "BasionymString";
69
    private final static String SYNSUBSTSTR= "SynSubstStr";
70
    private final static String AUTHORSTRING= "AuthorString";
71

    
72
    private  static List<String> expectedKeys= Arrays.asList(new String[]{
73
            REGISTRATIONNO_PK, HIGHERTAXON, FULLNAME, AUTHORSSPELLING, LITSTRING, REGISTRATION, TYPE, CAVEATS, FULLBASIONYM, FULLSYNSUBST, NOTESTXT, REGDATE, NAMESTRING, BASIONYMSTRING, SYNSUBSTSTR, AUTHORSTRING});
74

    
75
    private static final Pattern nomRefTokenizeP = Pattern.compile("^(.*):\\s([^\\.:]+)\\.(.*?)\\.?$");
76
    private static final Pattern[] nomRefPubDatePs = new Pattern[]{
77
            // NOTE:
78
            // The order of the patterns is extremely important!!!
79
            //
80
            // all patterns cover the years 1700 - 1999
81
            Pattern.compile("^(?<year>1[7,8,9][0-9]{2})$"), // only year, like '1969'
82
            Pattern.compile("^(?<monthName>\\p{L}+\\.?)\\s(?<day>[0-9]{1,2})(?:st|rd|th)?\\.?,?\\s(?<year>(?:1[7,8,9])?[0-9]{2})$"), // full date like April 12, 1969 or april 12th 1999
83
            Pattern.compile("^(?<monthName>\\p{L}+\\.?),?\\s(?<year>(?:1[7,8,9])?[0-9]{2})$"), // April 99 or April, 1999 or Apr. 12
84
            Pattern.compile("^(?<day>[0-9]{1,2})([\\.\\-/])(?<month>[0-1]?[0-9])\\2(?<year>(?:1[7,8,9])?[0-9]{2})$"), // full date like 12.04.1969 or 12/04/1969 or 12-04-1969
85
            Pattern.compile("^(?<day>[0-9]{1,2})([\\.\\-/])(?<month>[IVX]{1,2})\\2(?<year>(?:1[7,8,9])?[0-9]{2})$"), // full date like 12-VI-1969
86
            Pattern.compile("^(?:(?<day>[0-9]{1,2})\\sde\\s)(?<monthName>\\p{L}+)\\sde\\s(?<year>(?:1[7,8,9])?[0-9]{2})$"), // full and partial date like 12 de Enero de 1999 or Enero de 1999
87
            Pattern.compile("^(?<month>[0-1]?[0-9])([\\.\\-/])(?<year>(?:1[7,8,9])?[0-9]{2})$"), // partial date like 04.1969 or 04/1969 or 04-1969
88
            Pattern.compile("^(?<year>(?:1[7,8,9])?[0-9]{2})([\\.\\-/])(?<month>[0-1]?[0-9])$"),//  partial date like 1999-04
89
            Pattern.compile("^(?<day>[0-9]{1,2})(?:[\\./]|th|rd)?\\s(?<monthName>\\p{L}+\\.?),?\\s(?<year>(?:1[7,8,9])?[0-9]{2})$"), // full date like 12. April 1969 or april 1999 or 22 Dec.1999
90
        };
91
    private static final Pattern typeSplitPattern =  Pattern.compile("^(?:\"*[Tt]ype: (?<type>.*?))(?:[Hh]olotype:(?<holotype>.*?)\\.?)?(?:[Ii]sotype[^:]*:(?<isotype>.*)\\.?)?\\.?$");
92

    
93
    // AccessionNumbers: , #.*, n°:?, 96/3293, No..*, -?\w{1,3}-[0-9\-/]*
94
    private static final Pattern accessionNumberOnlyPattern = Pattern.compile("^(?<accNumber>(?:n°\\:?\\s?|#|No\\.?\\s?)?[\\d\\w\\-/]*)$");
95

    
96
    private static final Pattern[] specimenTypePatterns = new Pattern[]{
97
            Pattern.compile("^(?<colCode>[A-Z]+|CPC Micropaleontology Lab\\.?)\\s+(?:\\((?<institute>.*[^\\)])\\))(?<accNumber>.*)?$"), // like: GAUF (Gansu Agricultural University) No. 1207-1222
98
            Pattern.compile("^(?<colCode>[A-Z]+|CPC Micropaleontology Lab\\.?)\\s+(?:Coll\\.\\s(?<subCollection>[^\\.,;]*)(.))(?<accNumber>.*)?$"), // like KASSEL Coll. Krasske, Praep. DII 78
99
            Pattern.compile("^(?:Coll\\.\\s(?<subCollection>[^\\.,;]*)(.))(?<institute>.*)\\2(?<accNumber>.*)?$"), // like Coll. Lange-Bertalot, Bot. Inst., Univ. Frankfurt/Main, Germany Praep. Neukaledonien OTL 62
100
            Pattern.compile("^(?<colCode>[A-Z]+)(?:\\s+(?<accNumber>.*))?$"), // identifies the Collection code and takes the rest as accessionNumber if any
101
    };
102

    
103
    /**
     * Maps month names, month abbreviations and roman numerals to the month number (1 - 12).
     * All keys are stored trimmed and lower-cased, lookups must normalise the same way.
     */
    private static Map<String, Integer> monthFromNameMap = new HashMap<>();

    static {
        String[] ck = new String[]{"leden", "únor", "březen", "duben", "květen", "červen", "červenec", "srpen", "září", "říjen", "listopad", "prosinec"};
        String[] fr = new String[]{"janvier", "février", "mars", "avril", "mai", "juin", "juillet", "août", "septembre", "octobre", "novembre", "décembre"};
        String[] de = new String[]{"januar", "februar", "märz", "april", "mai", "juni", "juli", "august", "september", "oktober", "november", "dezember"};
        String[] en = new String[]{"january", "february", "march", "april", "may", "june", "july", "august", "september", "october", "november", "december"};
        String[] it = new String[]{"gennaio", "febbraio", "marzo", "aprile", "maggio", "giugno", "luglio", "agosto", "settembre", "ottobre", "novembre", "dicembre"};
        String[] sp = new String[]{"enero", "febrero", "marzo", "abril", "mayo", "junio", "julio", "agosto", "septiembre", "octubre", "noviembre", "diciembre"};
        String[] de_abbrev = new String[]{"jan.", "feb.", "märz", "apr.", "mai", "jun.", "jul.", "aug.", "sept.", "okt.", "nov.", "dez."};
        String[] en_abbrev = new String[]{"jan.", "feb.", "mar.", "apr.", "may", "jun.", "jul.", "aug.", "sep.", "oct.", "nov.", "dec."};
        String[] port = new String[]{"Janeiro", "Fevereiro", "Março", "Abril", "Maio", "Junho", "Julho", "Agosto", "Setembro", "Outubro", "Novembro", "Dezembro"};
        String[] rom_num = new String[]{"i", "ii", "iii", "iv", "v", "vi", "vii", "viii", "ix", "x", "xi", "xii"};

        String[][] perLang =  new String[][]{ck, de, fr, en, it, sp, port, de_abbrev, en_abbrev, rom_num};

        for (String[] months: perLang) {
            for(int i = 0; i < months.length; i++){
                // normalise the key: a stray blank or an upper case letter would make the
                // entry unreachable because lookups are done with the lower-cased name
                monthFromNameMap.put(months[i].trim().toLowerCase(java.util.Locale.ROOT), i + 1);
            }
        }

        // special cases
        monthFromNameMap.put("mar", 3);
        monthFromNameMap.put("dec", 12);
        // stored in lower case (was "Februari", which could never be found)
        monthFromNameMap.put("februari", 2);
    }
130

    
131

    
132
    DateTimeFormatter formatterYear = DateTimeFormat.forPattern("yyyy");
133

    
134
    private Map<String, Collection> collectionMap = new HashMap<>();
135

    
136

    
137
    enum TypesName {
138
        type, holotype, isotype;
139

    
140
        public SpecimenTypeDesignationStatus status(){
141
            switch (this) {
142
                case holotype:
143
                    return SpecimenTypeDesignationStatus.HOLOTYPE();
144
                case isotype:
145
                    return SpecimenTypeDesignationStatus.ISOTYPE();
146
                default:
147
                    return null;
148
            }
149
        }
150
    }
151

    
152
    private MarkerType markerTypeFossil = null;
153
    private Rank rankUnrankedSupraGeneric = null;
154
    private Rank familyIncertisSedis = null;
155
    private AnnotationType annotationTypeCaveats = null;
156

    
157
    private Taxon makeTaxon(HashMap<String, String> record, SimpleExcelTaxonImportState<CONFIG> state,
158
                            TaxonNode higherTaxonNode, boolean isFossil) {
159

    
160
        String line = state.getCurrentLine() + ": ";
161

    
162
        String regNumber = getValue(record, REGISTRATIONNO_PK, false);
163
        String regStr = getValue(record, REGISTRATION, true);
164
        String titleCacheStr = getValue(record, FULLNAME, true);
165
        String nameStr = getValue(record, NAMESTRING, true);
166
        String authorStr = getValue(record, AUTHORSTRING, true);
167
        String nomRefStr = getValue(record, LITSTRING, true);
168
        String authorsSpelling = getValue(record, AUTHORSSPELLING, true);
169
        String notesTxt = getValue(record, NOTESTXT, true);
170
        String caveats = getValue(record, CAVEATS, true);
171
        String fullSynSubstStr = getValue(record, FULLSYNSUBST, true);
172
        String synSubstStr = getValue(record, SYNSUBSTSTR, true);
173
        String typeStr = getValue(record, TYPE, true);
174

    
175

    
176
        String nomRefTitle = null;
177
        String nomRefDetail;
178
        String nomRefPupDate = null;
179
        Partial pupDate = null;
180

    
181
        // preprocess nomRef: separate citation, reference detail, publishing date
182
        if(!StringUtils.isEmpty(nomRefStr)){
183
            nomRefStr = nomRefStr.trim();
184
            Matcher m = nomRefTokenizeP.matcher(nomRefStr);
185
            if(m.matches()){
186
                nomRefTitle = m.group(1);
187
                nomRefDetail = m.group(2);
188
                nomRefPupDate = m.group(3).trim();
189

    
190
                pupDate = parsePubDate(regNumber, nomRefStr, nomRefPupDate);
191
                if (pupDate != null) {
192
                    nomRefTitle = nomRefTitle + ": " + nomRefDetail + ". " + pupDate.toString(formatterYear) + ".";
193
                }
194
            } else {
195
                nomRefTitle = nomRefStr;
196
            }
197
        }
198

    
199
        BotanicalName taxonName = makeBotanicalName(state, regNumber, titleCacheStr, nameStr, authorStr, nomRefTitle);
200

    
201
        // always add the original strings of parsed data as annotation
202
        taxonName.addAnnotation(Annotation.NewInstance("imported and parsed data strings:" +
203
                        "\n -  '" + LITSTRING + "': "+ nomRefStr +
204
                        "\n -  '" + TYPE + "': " + typeStr +
205
                        "\n -  '" + REGISTRATION  + "': " + regStr
206
                , AnnotationType.TECHNICAL(), Language.DEFAULT()));
207

    
208
        if(pupDate != null) {
209
            taxonName.getNomenclaturalReference().setDatePublished(TimePeriod.NewInstance(pupDate));
210
        }
211

    
212
        if(!StringUtils.isEmpty(notesTxt)){
213
            notesTxt = notesTxt.replace("Notes: ", "").trim();
214
            taxonName.addAnnotation(Annotation.NewInstance(notesTxt, AnnotationType.EDITORIAL(), Language.DEFAULT()));
215
        }
216
        if(!StringUtils.isEmpty(caveats)){
217
            caveats = caveats.replace("Caveats: ", "").trim();
218
            taxonName.addAnnotation(Annotation.NewInstance(caveats, annotationTypeCaveats(), Language.DEFAULT()));
219
        }
220
        //
221

    
222
        // Namerelations
223
        if(!StringUtils.isEmpty(authorsSpelling)){
224
            authorsSpelling = authorsSpelling.replaceFirst("Author's spelling:", "").replaceAll("\"", "").trim();
225

    
226
            String[] authorSpellingTokens = StringUtils.split(authorsSpelling, " ");
227
            String[] nameStrTokens = StringUtils.split(nameStr, " ");
228

    
229
            ArrayUtils.reverse(authorSpellingTokens);
230
            ArrayUtils.reverse(nameStrTokens);
231

    
232
            for (int i = 0; i < nameStrTokens.length; i++){
233
                if(i < authorSpellingTokens.length){
234
                    nameStrTokens[i] = authorSpellingTokens[i];
235
                }
236
            }
237
            ArrayUtils.reverse(nameStrTokens);
238

    
239
            String misspelledNameStr = StringUtils.join (nameStrTokens, ' ');
240
            // build the fullnameString of the misspelled name
241
            misspelledNameStr = taxonName.getTitleCache().replace(nameStr, misspelledNameStr);
242

    
243
            TaxonNameBase misspelledName = (BotanicalName) nameParser.parseReferencedName(misspelledNameStr, NomenclaturalCode.ICNAFP, null);
244
            misspelledName.addRelationshipToName(taxonName, NameRelationshipType.MISSPELLING(), null);
245
            getNameService().save(misspelledName);
246
        }
247

    
248
        // Replaced Synonyms
249
        if(!StringUtils.isEmpty(fullSynSubstStr)){
250
            fullSynSubstStr = fullSynSubstStr.replace("Syn. subst.: ", "");
251
            BotanicalName replacedSynonymName = makeBotanicalName(state, regNumber, fullSynSubstStr, synSubstStr, null, null);
252
            replacedSynonymName.addReplacedSynonym(taxonName, null, null, null);
253
            getNameService().save(replacedSynonymName);
254
        }
255

    
256
        Reference sec = state.getConfig().getSecReference();
257
        Taxon taxon = Taxon.NewInstance(taxonName, sec);
258

    
259
        // Markers
260
        if(isFossil){
261
            taxon.addMarker(Marker.NewInstance(markerTypeFossil(), true));
262
        }
263

    
264
        // Types
265
        if(!StringUtils.isEmpty(typeStr)){
266
            makeTypeData(typeStr, taxonName, regNumber);
267
        }
268

    
269
        getTaxonService().save(taxon);
270
        if(higherTaxonNode != null){
271
            higherTaxonNode.addChildTaxon(taxon, null, null);
272
            getTaxonNodeService().save(higherTaxonNode);
273
        }
274

    
275
        return taxon;
276

    
277
    }
278

    
279
    private void makeTypeData(String typeStr, BotanicalName taxonName, String regNumber) {
280

    
281
        Matcher m = typeSplitPattern.matcher(typeStr);
282

    
283
        if(m.matches()){
284
            String typeString = m.group(TypesName.type.name());
285
            boolean isFieldUnit = typeStr.matches(".*([°']|\\d+\\s?m\\s|\\d+\\s?km\\s).*"); // check for location or unit m, km
286

    
287
            if(isFieldUnit) {
288
                // type as fieldUnit
289
                FieldUnit fu = FieldUnit.NewInstance();
290
                fu.setTitleCache(typeString, true);
291
                getOccurrenceService().save(fu);
292

    
293
                // all others ..
294
                addSpecimenTypes(taxonName, fu, m.group(TypesName.holotype.name()), TypesName.holotype, false, regNumber);
295
                addSpecimenTypes(taxonName, fu, m.group(TypesName.isotype.name()), TypesName.isotype, true, regNumber);
296
            } else {
297
                TaxonNameBase typeName = nameParser.parseFullName(typeString);
298
                taxonName.addNameTypeDesignation(typeName, null, null, null, NameTypeDesignationStatus.AUTOMATIC(), true, true, true, true);
299
            }
300
        }
301
        getNameService().save(taxonName);
302
    }
303

    
304
    private Partial parsePubDate(String regNumber, String nomRefStr, String nomRefPupDate) {
305

    
306
        Partial pupDate = null;
307
        boolean parseError = false;
308
        String nomRefPupDay = null;
309
        String nomRefPupMonth = null;
310
        String nomRefPupMonthName = null;
311
        String nomRefPupYear = null;
312

    
313

    
314
        // nomRefDetail.replaceAll("[\\:\\.\\s]", ""); // TODO integrate into nomRefTokenizeP
315
        for(Pattern p : nomRefPubDatePs){
316
            Matcher m2 = p.matcher(nomRefPupDate);
317
            if(m2.matches()){
318
                try {
319
                    nomRefPupYear = m2.group("year");
320
                } catch (IllegalArgumentException e){
321
                    // named capture group not found
322
                }
323
                try {
324
                    nomRefPupMonth = m2.group("month");
325
                } catch (IllegalArgumentException e){
326
                    // named capture group not found
327
                }
328
                try {
329
                    nomRefPupMonthName = m2.group("monthName");
330
                    nomRefPupMonth = monthFromName(nomRefPupMonthName, regNumber);
331
                    if(nomRefPupMonth == null){
332
                        parseError = true;
333
                    }
334
                } catch (IllegalArgumentException e){
335
                    // named capture group not found
336
                }
337
                try {
338
                    nomRefPupDay = m2.group("day");
339
                } catch (IllegalArgumentException e){
340
                    // named capture group not found
341
                }
342

    
343
                if(nomRefPupYear == null){
344
                    logger.error("nomRefPupYear in " + nomRefStr + " is  NULL" );
345
                    parseError = true;
346
                }
347
                if(nomRefPupYear.length() == 2 ){
348
                    // it is an abbreviated year from the 19** years
349
                    nomRefPupYear = "19" + nomRefPupYear;
350
                }
351

    
352
                break;
353
            }
354
        }
355
        if(nomRefPupYear == null){
356
            logger.warn(csvReportLine(regNumber, "Pub date", nomRefPupDate, "in", nomRefStr, "not parsable"));
357
            parseError = true;
358
        }
359
        List<DateTimeFieldType> types = new ArrayList<>();
360
        List<Integer> values = new ArrayList<>();
361
        if(!parseError) {
362
            types.add(DateTimeFieldType.year());
363
            values.add(Integer.parseInt(nomRefPupYear));
364
            if (nomRefPupMonth != null) {
365
                types.add(DateTimeFieldType.monthOfYear());
366
                values.add(Integer.parseInt(nomRefPupMonth));
367
            }
368
            if (nomRefPupDay != null) {
369
                types.add(DateTimeFieldType.dayOfMonth());
370
                values.add(Integer.parseInt(nomRefPupDay));
371
            }
372
            pupDate = new Partial(types.toArray(new DateTimeFieldType[types.size()]), ArrayUtils.toPrimitive(values.toArray(new Integer[values.size()])));
373
        }
374
        return pupDate;
375
    }
376

    
377
    private String monthFromName(String monthName, String regNumber) {
378

    
379
        Integer month = monthFromNameMap.get(monthName.toLowerCase());
380
        if(month == null){
381
            logger.warn(csvReportLine(regNumber, "Unknown month name", monthName));
382
            return null;
383
        } else {
384
            return month.toString();
385
        }
386
    }
387

    
388

    
389
    private void addSpecimenTypes(BotanicalName taxonName, FieldUnit fieldUnit, String typeStr, TypesName typeName, boolean multiple, String regNumber){
390

    
391
        if(StringUtils.isEmpty(typeStr)){
392
            return;
393
        }
394
        typeStr = typeStr.trim().replaceAll("\\.$", "");
395

    
396
        Collection collection = null;
397
        DerivedUnit specimen = null;
398

    
399
        List<DerivedUnit> specimens = new ArrayList<>();
400
        if(multiple){
401
            String[] tokens = typeStr.split("\\s?,\\s?");
402
            for (String t : tokens) {
403
                // command to  list all complex parsabel types:
404
                // csvcut -t -c RegistrationNo_Pk,Type iapt.csv | csvgrep -c Type -m "Holotype" | egrep -o 'Holotype:\s([A-Z]*\s)[^.]*?'
405
                // csvcut -t -c RegistrationNo_Pk,Type iapt.csv | csvgrep -c Type -m "Holotype" | egrep -o 'Isotype[^:]*:\s([A-Z]*\s)[^.]*?'
406

    
407
                if(!t.isEmpty()){
408
                    // trying to parse the string
409
                    specimen = parseSpecimenType(fieldUnit, typeName, collection, t, regNumber);
410
                    if(specimen != null){
411
                        specimens.add(specimen);
412
                    } else {
413
                        // parsing was not successful make simple specimen
414
                        specimens.add(makeSpecimenType(fieldUnit, t));
415
                    }
416
                }
417
            }
418
        } else {
419
            specimen = parseSpecimenType(fieldUnit, typeName, collection, typeStr, regNumber);
420
            if(specimen != null) {
421
                specimens.add(specimen);
422
                // remember current collection
423
                collection = specimen.getCollection();
424
            } else {
425
                // parsing was not successful make simple specimen
426
                specimens.add(makeSpecimenType(fieldUnit, typeStr));
427
            }
428
        }
429

    
430
        for(DerivedUnit s : specimens){
431
            taxonName.addSpecimenTypeDesignation(s, typeName.status(), null, null, null, false, true);
432
       }
433
    }
434

    
435
    private DerivedUnit makeSpecimenType(FieldUnit fieldUnit, String titleCache) {
436
        DerivedUnit specimen;DerivedUnitFacade facade = DerivedUnitFacade.NewInstance(SpecimenOrObservationType.PreservedSpecimen, fieldUnit);
437
        facade.setTitleCache(titleCache.trim(), true);
438
        specimen = facade.innerDerivedUnit();
439
        return specimen;
440
    }
441

    
442
    /**
443
     *
444
     * @param fieldUnit
445
     * @param typeName
446
     * @param collection
447
     * @param text
448
     * @param regNumber
449
     * @return
450
     */
451
    private DerivedUnit parseSpecimenType(FieldUnit fieldUnit, TypesName typeName, Collection collection, String text, String regNumber) {
452

    
453
        DerivedUnit specimen = null;
454

    
455
        String collectionCode = null;
456
        String subCollectionStr = null;
457
        String instituteStr = null;
458
        String accessionNumber = null;
459

    
460
        boolean unusualAccessionNumber = false;
461

    
462
        text = text.trim();
463

    
464
        // 1.  For Isotypes often the accession number is noted alone if the
465
        //     preceeding entry has a collection code.
466
        if(typeName .equals(TypesName.isotype) && collection != null){
467
            Matcher m = accessionNumberOnlyPattern.matcher(text);
468
            if(m.matches()){
469
                try {
470
                    accessionNumber = m.group("accNumber");
471
                    specimen = makeSpecimenType(fieldUnit, collection, accessionNumber);
472
                } catch (IllegalArgumentException e){
473
                    // match group acc_number not found
474
                }
475
            }
476
        }
477

    
478
        //2. try it the 'normal' way
479
        if(specimen == null) {
480
            for (Pattern p : specimenTypePatterns) {
481
                Matcher m = p.matcher(text);
482
                if (m.matches()) {
483
                    // collection code is mandatory
484
                    try {
485
                        collectionCode = m.group("colCode");
486
                    } catch (IllegalArgumentException e){
487
                        logger.warn(csvReportLine(regNumber, "match group colCode not found"));
488
                        continue;
489
                    }
490
                    try {
491
                        subCollectionStr = m.group("subCollection");
492
                    } catch (IllegalArgumentException e){
493
                        // match group subCollection not found
494
                    }
495
                    try {
496
                        instituteStr = m.group("institute");
497
                    } catch (IllegalArgumentException e){
498
                        // match group col_name not found
499
                    }
500
                    try {
501
                        accessionNumber = m.group("accNumber");
502

    
503
                        // try to improve the accessionNumber
504
                        if(accessionNumber!= null) {
505
                            accessionNumber = accessionNumber.trim();
506
                            Matcher m2 = accessionNumberOnlyPattern.matcher(accessionNumber);
507
                            String betterAccessionNumber = null;
508
                            if (m2.matches()) {
509
                                try {
510
                                    betterAccessionNumber = m.group("accNumber");
511
                                } catch (IllegalArgumentException e) {
512
                                    // match group acc_number not found
513
                                }
514
                            }
515
                            if (betterAccessionNumber != null) {
516
                                accessionNumber = betterAccessionNumber;
517
                            } else {
518
                                unusualAccessionNumber = true;
519
                            }
520
                        }
521

    
522
                    } catch (IllegalArgumentException e){
523
                        // match group acc_number not found
524
                    }
525

    
526
                    collection = getCollection(collectionCode, instituteStr, subCollectionStr);
527
                    specimen = makeSpecimenType(fieldUnit, collection, accessionNumber);
528
                    break;
529
                }
530
            }
531
        }
532
        if(specimen == null) {
533
            logger.warn(csvReportLine(regNumber, "Could not parse specimen type", typeName.name().toString(), text));
534
        }
535
        if(unusualAccessionNumber){
536
            logger.warn(csvReportLine(regNumber, "Unusual accession number", typeName.name().toString(), text, accessionNumber));
537
        }
538
        return specimen;
539
    }
540

    
541
    private DerivedUnit makeSpecimenType(FieldUnit fieldUnit, Collection collection, String accessionNumber) {
542

    
543
        DerivedUnitFacade facade = DerivedUnitFacade.NewInstance(SpecimenOrObservationType.PreservedSpecimen, fieldUnit);
544
        facade.setCollection(collection);
545
        facade.setAccessionNumber(accessionNumber);
546
        return facade.innerDerivedUnit();
547
    }
548

    
549
    private BotanicalName makeBotanicalName(SimpleExcelTaxonImportState<CONFIG> state, String regNumber, String titleCacheStr, String nameStr,
550
                                            String authorStr, String nomRefTitle) {
551

    
552
        BotanicalName taxonName;// cache field for the taxonName.titleCache
553
        String taxonNameTitleCache = null;
554
        Map<String, AnnotationType> nameAnnotations = new HashMap<>();
555

    
556
        // TitleCache preprocessing
557
        if(titleCacheStr.endsWith(ANNOTATION_MARKER_STRING) || (authorStr != null && authorStr.endsWith(ANNOTATION_MARKER_STRING))){
558
            nameAnnotations.put("Author abbreviation not checked.", AnnotationType.EDITORIAL());
559
            titleCacheStr = titleCacheStr.replace(ANNOTATION_MARKER_STRING, "").trim();
560
            authorStr = authorStr.replace(ANNOTATION_MARKER_STRING, "").trim();
561
        }
562

    
563
        // parse the full taxon name
564
        if(!StringUtils.isEmpty(nomRefTitle)){
565
            String referenceSeparator = nomRefTitle.startsWith("in ") ? " " : ", ";
566
            String taxonFullNameStr = titleCacheStr + referenceSeparator + nomRefTitle;
567
            logger.debug(":::::" + taxonFullNameStr);
568
            taxonName = (BotanicalName) nameParser.parseReferencedName(taxonFullNameStr, NomenclaturalCode.ICNAFP, null);
569
        } else {
570
            taxonName = (BotanicalName) nameParser.parseFullName(titleCacheStr, NomenclaturalCode.ICNAFP, null);
571
        }
572

    
573
        taxonNameTitleCache = taxonName.getTitleCache().trim();
574
        if (taxonName.isProtectedTitleCache()) {
575
            logger.warn(csvReportLine(regNumber, "Name could not be parsed", titleCacheStr));
576
        } else {
577

    
578
            boolean doRestoreTitleCacheStr = false;
579

    
580
            // Check if titleCache and nameCache are plausible
581
            String titleCacheCompareStr = titleCacheStr;
582
            String nameCache = taxonName.getNameCache();
583
            String nameCompareStr = nameStr;
584
            if(taxonName.isBinomHybrid()){
585
                titleCacheCompareStr = titleCacheCompareStr.replace(" x ", " ×");
586
                nameCompareStr = nameCompareStr.replace(" x ", " ×");
587
            }
588
            if(taxonName.isMonomHybrid()){
589
                titleCacheCompareStr = titleCacheCompareStr.replaceAll("^X ", "× ");
590
                nameCompareStr = nameCompareStr.replace("^X ", "× ");
591
            }
592
            if(authorStr != null && authorStr.contains(" et ")){
593
                titleCacheCompareStr = titleCacheCompareStr.replaceAll(" et ", " & ");
594
            }
595
            if (!taxonNameTitleCache.equals(titleCacheCompareStr)) {
596
                logger.warn(csvReportLine(regNumber, "The generated titleCache differs from the imported string", taxonNameTitleCache, " != ", titleCacheStr, " ==> original titleCacheStr has been restored"));
597
                doRestoreTitleCacheStr = true;
598
            }
599
            if (!nameCache.trim().equals(nameCompareStr)) {
600
                logger.warn(csvReportLine(regNumber, "The parsed nameCache differs from field '" + NAMESTRING + "'", nameCache, " != ", nameCompareStr));
601
            }
602

    
603
            //  Author
604
            //nameParser.handleAuthors(taxonName, titleCacheStr, authorStr);
605
            //if (!titleCacheStr.equals(taxonName.getTitleCache())) {
606
            //    logger.warn(regNumber + ": titleCache has changed after setting authors, will restore original titleCacheStr");
607
            //    doRestoreTitleCacheStr = true;
608
            //}
609

    
610
            if(doRestoreTitleCacheStr){
611
                taxonName.setTitleCache(titleCacheStr, true);
612
            }
613

    
614
            // deduplicate
615
            replaceAuthorNamesAndNomRef(state, taxonName);
616
        }
617

    
618
        // Annotations
619
        if(!nameAnnotations.isEmpty()){
620
            for(String text : nameAnnotations.keySet()){
621
                taxonName.addAnnotation(Annotation.NewInstance(text, nameAnnotations.get(text), Language.DEFAULT()));
622
            }
623
            getNameService().save(taxonName);
624
        }
625
        return taxonName;
626
    }
627

    
628
    /**
629
     * @param state
630
     * @return
631
     */
632
    private TaxonNode getClassificationRootNode(IAPTImportState state) {
633

    
634
     //   Classification classification = state.getClassification();
635
     //   if (classification == null){
636
     //       IAPTImportConfigurator config = state.getConfig();
637
     //       classification = Classification.NewInstance(state.getConfig().getClassificationName());
638
     //       classification.setUuid(config.getClassificationUuid());
639
     //       classification.setReference(config.getSecReference());
640
     //       classification = getClassificationService().find(state.getConfig().getClassificationUuid());
641
     //   }
642
        TaxonNode rootNode = state.getRootNode();
643
        if (rootNode == null){
644
            rootNode = getTaxonNodeService().find(ROOT_UUID);
645
        }
646
        if (rootNode == null){
647
            Classification classification = state.getClassification();
648
            if (classification == null){
649
                Reference sec = state.getSecReference();
650
                String classificationName = state.getConfig().getClassificationName();
651
                Language language = Language.DEFAULT();
652
                classification = Classification.NewInstance(classificationName, sec, language);
653
                state.setClassification(classification);
654
                classification.setUuid(state.getConfig().getClassificationUuid());
655
                classification.getRootNode().setUuid(ROOT_UUID);
656
                getClassificationService().save(classification);
657
            }
658
            rootNode = classification.getRootNode();
659
            state.setRootNode(rootNode);
660
        }
661
        return rootNode;
662
    }
663

    
664
    private Collection getCollection(String collectionCode, String instituteStr, String subCollectionStr){
665

    
666
        Collection superCollection = null;
667
        if(subCollectionStr != null){
668
            superCollection = getCollection(collectionCode, instituteStr, null);
669
            collectionCode = subCollectionStr;
670
            instituteStr = null;
671
        }
672

    
673
        final String key = collectionCode + "-#i:" + StringUtils.defaultString(instituteStr);
674

    
675
        Collection collection = collectionMap.get(key);
676

    
677
        if(collection == null) {
678
            collection = Collection.NewInstance();
679
            collection.setCode(collectionCode);
680
            if(instituteStr != null){
681
                collection.setInstitute(Institution.NewNamedInstance(instituteStr));
682
            }
683
            if(superCollection != null){
684
                collection.setSuperCollection(superCollection);
685
            }
686
            collectionMap.put(key, collection);
687
            getCollectionService().save(collection);
688
        }
689

    
690
        return collection;
691
    }
692

    
693

    
694
    /**
695
     * @param record
696
     * @param originalKey
697
     * @param doUnescapeHtmlEntities
698
     * @return
699
     */
700
    private String getValue(HashMap<String, String> record, String originalKey, boolean doUnescapeHtmlEntities) {
701
        String value = record.get(originalKey);
702

    
703
        value = fixCharacters(value);
704

    
705
        if (! StringUtils.isBlank(value)) {
706
        	if (logger.isDebugEnabled()) {
707
        	    logger.debug(originalKey + ": " + value);
708
        	}
709
        	value = CdmUtils.removeDuplicateWhitespace(value.trim()).toString();
710
            if(doUnescapeHtmlEntities){
711
                value = StringEscapeUtils.unescapeHtml(value);
712
            }
713
        	return value.trim();
714
        }else{
715
        	return null;
716
        }
717
    }
718

    
719
    /**
720
     * Fixes broken characters.
721
     * For details see
722
     * http://dev.e-taxonomy.eu/redmine/issues/6035
723
     *
724
     * @param value
725
     * @return
726
     */
727
    private String fixCharacters(String value) {
728

    
729
        value = StringUtils.replace(value, "s$K", "š");
730
        value = StringUtils.replace(value, "n$K", "ň");
731
        value = StringUtils.replace(value, "e$K", "ě");
732
        value = StringUtils.replace(value, "r$K", "ř");
733
        value = StringUtils.replace(value, "c$K", "č");
734
        value = StringUtils.replace(value, "z$K", "ž");
735
        value = StringUtils.replace(value, "S>U$K", "Š");
736
        value = StringUtils.replace(value, "C>U$K", "Č");
737
        value = StringUtils.replace(value, "R>U$K", "Ř");
738
        value = StringUtils.replace(value, "Z>U$K", "Ž");
739
        value = StringUtils.replace(value, "g$K", "ǧ");
740
        value = StringUtils.replace(value, "s$A", "ś");
741
        value = StringUtils.replace(value, "n$A", "ń");
742
        value = StringUtils.replace(value, "c$A", "ć");
743
        value = StringUtils.replace(value, "e$E", "ę");
744
        value = StringUtils.replace(value, "o$H", "õ");
745
        value = StringUtils.replace(value, "s$C", "ş");
746
        value = StringUtils.replace(value, "t$C", "ț");
747
        value = StringUtils.replace(value, "S>U$C", "Ş");
748
        value = StringUtils.replace(value, "a$O", "å");
749
        value = StringUtils.replace(value, "A>U$O", "Å");
750
        value = StringUtils.replace(value, "u$O", "ů");
751
        value = StringUtils.replace(value, "g$B", "ğ");
752
        value = StringUtils.replace(value, "g$B", "ĕ");
753
        value = StringUtils.replace(value, "a$B", "ă");
754
        value = StringUtils.replace(value, "l$/", "ł");
755
        value = StringUtils.replace(value, ">i", "ı");
756
        value = StringUtils.replace(value, "i$U", "ï");
757
        // Special-cases
758
        value = StringUtils.replace(value, "&yacute", "ý");
759
        value = StringUtils.replace(value, ">L", "Ł"); // corrected rule
760
        value = StringUtils.replace(value, "E>U$D", "З");
761
        value = StringUtils.replace(value, "S>U$E", "Ş");
762
        value = StringUtils.replace(value, "s$E", "ş");
763

    
764
        value = StringUtils.replace(value, "c$k", "č");
765
        value = StringUtils.replace(value, " U$K", " Š");
766

    
767
        return value;
768
    }
769

    
770

    
771
    /**
772
	 *  Stores taxa records in DB
773
	 */
774
	@Override
775
    protected void firstPass(SimpleExcelTaxonImportState<CONFIG> state) {
776

    
777
        String lineNumber = "L#" + state.getCurrentLine() + ": ";
778
        logger.setLevel(Level.DEBUG);
779
        HashMap<String, String> record = state.getOriginalRecord();
780
        logger.debug(lineNumber + record.toString());
781

    
782
        Set<String> keys = record.keySet();
783
        for (String key: keys) {
784
            if (! expectedKeys.contains(key)){
785
                logger.warn(lineNumber + "Unexpected Key: " + key);
786
            }
787
        }
788

    
789
        String reg_id = record.get(REGISTRATIONNO_PK);
790

    
791
        //higherTaxon
792
        String higherTaxaString = record.get(HIGHERTAXON);
793
        boolean isFossil = false;
794
        if(higherTaxaString.startsWith("FOSSIL ")){
795
            higherTaxaString = higherTaxaString.replace("FOSSIL ", "");
796
            isFossil = true;
797
        }
798
        TaxonNode higherTaxon = getHigherTaxon(higherTaxaString, (IAPTImportState)state);
799

    
800
       //Taxon
801
        Taxon taxon = makeTaxon(record, state, higherTaxon, isFossil);
802
        if (taxon == null){
803
            logger.warn(lineNumber + "taxon could not be created and is null");
804
            return;
805
        }
806
        ((IAPTImportState)state).setCurrentTaxon(taxon);
807

    
808

    
809
		return;
810
    }
811

    
812
    private TaxonNode getHigherTaxon(String higherTaxaString, IAPTImportState state) {
813
        String[] higherTaxaNames = higherTaxaString.toLowerCase().replaceAll("[\\[\\]]", "").split(":");
814
        TaxonNode higherTaxonNode = null;
815

    
816
        ITaxonTreeNode rootNode = getClassificationRootNode(state);
817
        for (String htn :  higherTaxaNames) {
818
            htn = StringUtils.capitalize(htn.trim());
819
            Taxon higherTaxon = state.getHigherTaxon(htn);
820
            if (higherTaxon != null){
821
                higherTaxonNode = higherTaxon.getTaxonNodes().iterator().next();
822
            }else{
823
                BotanicalName name = makeHigherTaxonName(state, htn);
824
                Reference sec = state.getSecReference();
825
                higherTaxon = Taxon.NewInstance(name, sec);
826
                getTaxonService().save(higherTaxon);
827
                higherTaxonNode = rootNode.addChildTaxon(higherTaxon, sec, null);
828
                state.putHigherTaxon(htn, higherTaxon);
829
                getClassificationService().saveTreeNode(higherTaxonNode);
830
            }
831
            rootNode = higherTaxonNode;
832
        }
833
        return higherTaxonNode;
834
    }
835

    
836
    private BotanicalName makeHigherTaxonName(IAPTImportState state, String name) {
837

    
838
        Rank rank = guessRank(name);
839

    
840
        BotanicalName taxonName = BotanicalName.NewInstance(rank);
841
        taxonName.addSource(makeOriginalSource(state));
842
        taxonName.setGenusOrUninomial(StringUtils.capitalize(name));
843
        return taxonName;
844
    }
845

    
846
    private Rank guessRank(String name) {
847

    
848
        // normalize
849
        name = name.replaceAll("\\(.*\\)", "").trim();
850

    
851
        if(name.matches("^Plantae$|^Fungi$")){
852
           return Rank.KINGDOM();
853
        } else if(name.matches("^Incertae sedis$|^No group assigned$")){
854
           return rankFamilyIncertisSedis();
855
        } else if(name.matches(".*phyta$|.*mycota$")){
856
           return Rank.SECTION_BOTANY();
857
        } else if(name.matches(".*phytina$|.*mycotina$")){
858
           return Rank.SUBSECTION_BOTANY();
859
        } else if(name.matches("Gymnospermae$|.*ones$")){ // Monocotyledones, Dicotyledones
860
            return rankUnrankedSupraGeneric();
861
        } else if(name.matches(".*opsida$|.*phyceae$|.*mycetes$|.*ones$|^Musci$|^Hepaticae$")){
862
           return Rank.CLASS();
863
        } else if(name.matches(".*idae$|.*phycidae$|.*mycetidae$")){
864
           return Rank.SUBCLASS();
865
        } else if(name.matches(".*ales$")){
866
           return Rank.ORDER();
867
        } else if(name.matches(".*ineae$")){
868
           return Rank.SUBORDER();
869
        } else if(name.matches(".*aceae$")){
870
            return Rank.FAMILY();
871
        } else if(name.matches(".*oideae$")){
872
           return Rank.SUBFAMILY();
873
        } else
874
        //    if(name.matches(".*eae$")){
875
        //    return Rank.TRIBE();
876
        // } else
877
            if(name.matches(".*inae$")){
878
           return Rank.SUBTRIBE();
879
        } else if(name.matches(".*ae$")){
880
           return Rank.FAMILY();
881
        }
882
        return Rank.UNKNOWN_RANK();
883
    }
884

    
885
    private Rank rankUnrankedSupraGeneric() {
886

    
887
        if(rankUnrankedSupraGeneric == null){
888
            rankUnrankedSupraGeneric = Rank.NewInstance(RankClass.Suprageneric, "Unranked supra generic", " ", " ");
889
            getTermService().save(rankUnrankedSupraGeneric);
890
        }
891
        return rankUnrankedSupraGeneric;
892
    }
893

    
894
    private Rank rankFamilyIncertisSedis() {
895

    
896
        if(familyIncertisSedis == null){
897
            familyIncertisSedis = Rank.NewInstance(RankClass.Suprageneric, "Family incertis sedis", " ", " ");
898
            getTermService().save(familyIncertisSedis);
899
        }
900
        return familyIncertisSedis;
901
    }
902

    
903
    private AnnotationType annotationTypeCaveats(){
904
        if(annotationTypeCaveats == null){
905
            annotationTypeCaveats = AnnotationType.NewInstance("Caveats", "Caveats", "");
906
            getTermService().save(annotationTypeCaveats);
907
        }
908
        return annotationTypeCaveats;
909
    }
910

    
911

    
912
    /**
913
     * @param state
914
     * @return
915
     */
916
    private IdentifiableSource makeOriginalSource(IAPTImportState state) {
917
        return IdentifiableSource.NewDataImportInstance("line: " + state.getCurrentLine(), null, state.getConfig().getSourceReference());
918
    }
919

    
920

    
921
    private Reference makeReference(IAPTImportState state, UUID uuidRef) {
922
        Reference ref = state.getReference(uuidRef);
923
        if (ref == null){
924
            ref = getReferenceService().find(uuidRef);
925
            state.putReference(uuidRef, ref);
926
        }
927
        return ref;
928
    }
929

    
930
    private MarkerType markerTypeFossil(){
931
        if(this.markerTypeFossil == null){
932
            markerTypeFossil = MarkerType.NewInstance("isFossilTaxon", "isFossil", null);
933
            getTermService().save(this.markerTypeFossil);
934
        }
935
        return markerTypeFossil;
936
    }
937

    
938
    private String csvReportLine(String regId, String message, String ... fields){
939
        StringBuilder out = new StringBuilder("regID#");
940
        out.append(regId).append(",\"").append(message).append('"');
941

    
942
        for(String f : fields){
943
            out.append(",\"").append(f).append('"');
944
        }
945
        return out.toString();
946
    }
947

    
948

    
949
}
(1-1/4)