Project

General

Profile

Download (59.6 KB) Statistics
| Branch: | Revision:
1
/**
2
 * Copyright (C) 2007 EDIT
3
 * European Distributed Institute of Taxonomy
4
 * http://www.e-taxonomy.eu
5
 *
6
 * The contents of this file are subject to the Mozilla Public License Version 1.1
7
 * See LICENSE.TXT at the top of this package for the full license terms.
8
 */
9

    
10
package eu.etaxonomy.cdm.io.phycobank;
11

    
12
import java.util.ArrayList;
13
import java.util.Arrays;
14
import java.util.HashMap;
15
import java.util.HashSet;
16
import java.util.List;
17
import java.util.Map;
18
import java.util.Set;
19
import java.util.UUID;
20
import java.util.regex.Matcher;
21
import java.util.regex.Pattern;
22

    
23
import org.apache.commons.lang.ArrayUtils;
24
import org.apache.commons.lang.StringEscapeUtils;
25
import org.apache.commons.lang.StringUtils;
26
import org.apache.logging.log4j.Level;
27
import org.apache.logging.log4j.LogManager;
28
import org.apache.logging.log4j.Logger;
29
import org.joda.time.DateTimeFieldType;
30
import org.joda.time.Partial;
31
import org.joda.time.format.DateTimeFormat;
32
import org.joda.time.format.DateTimeFormatter;
33
import org.springframework.stereotype.Component;
34

    
35
import com.fasterxml.jackson.core.JsonProcessingException;
36
import com.fasterxml.jackson.databind.ObjectMapper;
37

    
38
import eu.etaxonomy.cdm.common.CdmUtils;
39
import eu.etaxonomy.cdm.common.LogUtils;
40
import eu.etaxonomy.cdm.facade.DerivedUnitFacade;
41
import eu.etaxonomy.cdm.io.mexico.SimpleExcelTaxonImport;
42
import eu.etaxonomy.cdm.io.mexico.SimpleExcelTaxonImportState;
43
import eu.etaxonomy.cdm.model.agent.Institution;
44
import eu.etaxonomy.cdm.model.agent.Person;
45
import eu.etaxonomy.cdm.model.agent.TeamOrPersonBase;
46
import eu.etaxonomy.cdm.model.common.Annotation;
47
import eu.etaxonomy.cdm.model.common.AnnotationType;
48
import eu.etaxonomy.cdm.model.common.Extension;
49
import eu.etaxonomy.cdm.model.common.ExtensionType;
50
import eu.etaxonomy.cdm.model.common.IdentifiableSource;
51
import eu.etaxonomy.cdm.model.common.Language;
52
import eu.etaxonomy.cdm.model.common.LanguageString;
53
import eu.etaxonomy.cdm.model.common.Marker;
54
import eu.etaxonomy.cdm.model.common.MarkerType;
55
import eu.etaxonomy.cdm.model.common.VerbatimTimePeriod;
56
import eu.etaxonomy.cdm.model.name.IBotanicalName;
57
import eu.etaxonomy.cdm.model.name.NameRelationshipType;
58
import eu.etaxonomy.cdm.model.name.NameTypeDesignation;
59
import eu.etaxonomy.cdm.model.name.NomenclaturalCode;
60
import eu.etaxonomy.cdm.model.name.NomenclaturalStatus;
61
import eu.etaxonomy.cdm.model.name.NomenclaturalStatusType;
62
import eu.etaxonomy.cdm.model.name.Rank;
63
import eu.etaxonomy.cdm.model.name.RankClass;
64
import eu.etaxonomy.cdm.model.name.SpecimenTypeDesignationStatus;
65
import eu.etaxonomy.cdm.model.name.TaxonName;
66
import eu.etaxonomy.cdm.model.name.TaxonNameFactory;
67
import eu.etaxonomy.cdm.model.occurrence.Collection;
68
import eu.etaxonomy.cdm.model.occurrence.DerivedUnit;
69
import eu.etaxonomy.cdm.model.occurrence.FieldUnit;
70
import eu.etaxonomy.cdm.model.occurrence.GatheringEvent;
71
import eu.etaxonomy.cdm.model.occurrence.SpecimenOrObservationType;
72
import eu.etaxonomy.cdm.model.reference.OriginalSourceType;
73
import eu.etaxonomy.cdm.model.reference.Reference;
74
import eu.etaxonomy.cdm.model.reference.ReferenceFactory;
75
import eu.etaxonomy.cdm.model.taxon.Classification;
76
import eu.etaxonomy.cdm.model.taxon.ITaxonTreeNode;
77
import eu.etaxonomy.cdm.model.taxon.Synonym;
78
import eu.etaxonomy.cdm.model.taxon.SynonymType;
79
import eu.etaxonomy.cdm.model.taxon.Taxon;
80
import eu.etaxonomy.cdm.model.taxon.TaxonNode;
81
import eu.etaxonomy.cdm.model.term.DefinedTermBase;
82
import eu.etaxonomy.cdm.strategy.parser.NonViralNameParserImpl;
83

    
84
/**
85
 * @author a.mueller
86
 * @since 05.01.2016
87
 */
88

    
89
@Component("iAPTExcelImport")
90
public class IAPTExcelImport<CONFIG extends IAPTImportConfigurator> extends SimpleExcelTaxonImport<CONFIG> {
91
    private static final long serialVersionUID = -747486709409732371L;
92
    private static final Logger logger = LogManager.getLogger();
93

    
94
    public static final String ANNOTATION_MARKER_STRING = "[*]";
95

    
96
    private static UUID ROOT_UUID = UUID.fromString("4137fd2a-20f6-4e70-80b9-f296daf51d82");
97

    
98
    private static NonViralNameParserImpl nameParser = NonViralNameParserImpl.NewInstance();
99

    
100
    private final static String REGISTRATIONNO_PK= "RegistrationNo_Pk";
101
    private final static String HIGHERTAXON= "HigherTaxon";
102
    private final static String FULLNAME= "FullName";
103
    private final static String AUTHORSSPELLING= "AuthorsSpelling";
104
    private final static String LITSTRING= "LitString";
105
    private final static String REGISTRATION= "Registration";
106
    private final static String TYPE= "Type";
107
    private final static String CAVEATS= "Caveats";
108
    private final static String FULLBASIONYM= "FullBasionym";
109
    private final static String FULLSYNSUBST= "FullSynSubst";
110
    private final static String NOTESTXT= "NotesTxt";
111
    private final static String REGDATE= "RegDate";
112
    private final static String NAMESTRING= "NameString";
113
    private final static String BASIONYMSTRING= "BasionymString";
114
    private final static String SYNSUBSTSTR= "SynSubstStr";
115
    private final static String AUTHORSTRING= "AuthorString";
116

    
117
    private  static List<String> expectedKeys= Arrays.asList(new String[]{
118
            REGISTRATIONNO_PK, HIGHERTAXON, FULLNAME, AUTHORSSPELLING, LITSTRING, REGISTRATION, TYPE, CAVEATS, FULLBASIONYM, FULLSYNSUBST, NOTESTXT, REGDATE, NAMESTRING, BASIONYMSTRING, SYNSUBSTSTR, AUTHORSTRING});
119

    
120
    private static final Pattern nomRefTokenizeP = Pattern.compile("^(?<title>.*):\\s(?<detail>[^\\.:]+)\\.(?<date>.*?)(?:\\s\\((?<issue>[^\\)]*)\\)\\s*)?\\.?$");
121
    private static final Pattern[] datePatterns = new Pattern[]{
122
            // NOTE:
123
            // The order of the patterns is extremely important!!!
124
            //
125
            // all patterns cover the years 1700 - 1999
126
            Pattern.compile("^(?<year>1[7,8,9][0-9]{2})$"), // only year, like '1969'
127
            Pattern.compile("^(?<monthName>\\p{L}+\\.?)\\s(?<day>[0-9]{1,2})(?:st|rd|th)?\\.?,?\\s(?<year>(?:1[7,8,9])?[0-9]{2})$"), // full date like April 12, 1969 or april 12th 1999
128
            Pattern.compile("^(?<monthName>\\p{L}+\\.?),?\\s?(?<year>(?:1[7,8,9])?[0-9]{2})$"), // April 99 or April, 1999 or Apr. 12
129
            Pattern.compile("^(?<day>[0-9]{1,2})([\\.\\-/])(\\s?)(?<month>[0-1]?[0-9])\\2\\3(?<year>(?:1[7,8,9])?[0-9]{2})$"), // full date like 12.04.1969 or 12. 04. 1969 or 12/04/1969 or 12-04-1969
130
            Pattern.compile("^(?<day>[0-9]{1,2})([\\.\\-/])(?<monthName>[IVX]{1,2})\\2(?<year>(?:1[7,8,9])?[0-9]{2})$"), // full date like 12-VI-1969
131
            Pattern.compile("^(?:(?<day>[0-9]{1,2})(?:\\sde)?\\s)?(?<monthName>\\p{L}+)(?:\\sde)?\\s(?<year>(?:1[7,8,9])?[0-9]{2})$"), // full and partial date like 12 de Enero de 1999 or Enero de 1999
132
            Pattern.compile("^(?<month>[0-1]?[0-9])([\\.\\-/])(?<year>(?:1[7,8,9])?[0-9]{2})$"), // partial date like 04.1969 or 04/1969 or 04-1969
133
            Pattern.compile("^(?<year>(?:1[7,8,9])?[0-9]{2})([\\.\\-/])(?<month>[0-1]?[0-9])$"),//  partial date like 1999-04
134
            Pattern.compile("^(?<monthName>[IVX]{1,2})([\\.\\-/])(?<year>(?:1[7,8,9])?[0-9]{2})$"), // partial date like VI-1969
135
            Pattern.compile("^(?<day>[0-9]{1,2})(?:[\\./]|th|rd|st)?\\s(?<monthName>\\p{L}+\\.?),?\\s?(?<year>(?:1[7,8,9])?[0-9]{2})$"), // full date like 12. April 1969 or april 1999 or 22 Dec.1999
136
        };
137
    protected static final Pattern typeSpecimenSplitPattern =  Pattern.compile("^(?:\"*[Tt]ype: (?<fieldUnit>.*?))(?:[Hh]olotype:(?<holotype>.*?)\\.?)?(?:[Ii]sotype[^:]*:(?<isotype>.*)\\.?)?\\.?$");
138

    
139
    private static final Pattern typeNameBasionymPattern =  Pattern.compile("\\([Bb]asionym\\s?\\:\\s?(?<basionymName>[^\\)]*).*$");
140
    private static final Pattern typeNameNotePattern =  Pattern.compile("\\[([^\\[]*)"); // matches the inner of '[...]'
141
    private static final Pattern typeNameSpecialSplitPattern =  Pattern.compile("(?<note>.*\\;.*?)\\:(?<agent>)\\;(<name>.*)");
142

    
143
    protected static final Pattern collectorPattern =  Pattern.compile(".*?(?<fullStr1>\\([Ll]eg\\.\\s+(?<data1>[^\\)]*)\\)).*$|.*?(?<fullStr2>\\s[Ll]eg\\.\\:?\\s+(?<data2>.*?)\\.?)$|^(?<fullStr3>[Ll]eg\\.\\:?\\s+(?<data3>.*?)\\.?)");
144
    private static final Pattern collectionDataPattern =  Pattern.compile("^(?<collector>[^,]*),\\s?(?<detail>.*?)\\.?$");
145
    private static final Pattern collectorsNumber =  Pattern.compile("^([nN]o\\.\\s.*)$");
146

    
147
    // AccessionNumbers: , #.*, n°:?, 96/3293, No..*, -?\w{1,3}-[0-9\-/]*
148
    private static final Pattern accessionNumberOnlyPattern = Pattern.compile("^(?<accNumber>(?:n°\\:?\\s?|#|No\\.?\\s?)?[\\d\\w\\-/]*)$");
149

    
150
    private static final Pattern[] specimenTypePatterns = new Pattern[]{
151
            Pattern.compile("^(?<colCode>[A-Z]+|CPC Micropaleontology Lab\\.?)\\s+(?:\\((?<institute>.*[^\\)])\\))(?<accNumber>.*)?$"), // like: GAUF (Gansu Agricultural University) No. 1207-1222
152
            Pattern.compile("^(?<colCode>[A-Z]+|CPC Micropaleontology Lab\\.?)\\s+(?:Coll\\.\\s(?<subCollection>[^\\.,;]*)(.))(?<accNumber>.*)?$"), // like KASSEL Coll. Krasske, Praep. DII 78
153
            Pattern.compile("^(?:in\\s)?(?<institute>[Cc]oll\\.\\s.*?)(?:\\s+(?<accNumber>(Praep\\.|slide|No\\.|Inv\\. Nr\\.|Nr\\.).*))?$"), // like Coll. Lange-Bertalot, Bot. Inst., Univ. Frankfurt/Main, Germany Praep. Neukaledonien OTL 62
154
            Pattern.compile("^(?<institute>Inst\\.\\s.*?)\\s+(?<accNumber>N\\s.*)?$"), // like Inst. Geological Sciences, Acad. Sci. Belarus, Minsk N 212 A
155
            Pattern.compile("^(?<colCode>[A-Z]+)(?:\\s+(?<accNumber>.*))?$"), // identifies the Collection code and takes the rest as accessionNumber if any
156
    };
157

    
158

    
159
    private static final Pattern registrationPattern = Pattern.compile("^Registration date\\:\\s(?<regdate>\\d\\d\\.\\d\\d\\.\\d\\d); no\\.\\:\\s(?<regid>\\d+);\\soffice\\:\\s(?<office>.*?)\\.(?:\\s\\[Form no\\.\\:\\s(?<formNo>d+)\\])?$"); // Registration date: 29.06.98; no.: 2922; office: Berlin.
160

    
161
    /**
     * Maps lower case month names, month abbreviations and roman numerals
     * to the month number (1 - 12).
     */
    private static Map<String, Integer> monthFromNameMap = new HashMap<>();

    static {
        String[] ck = new String[]{"leden", "únor", "březen", "duben", "květen", "červen", "červenec", "srpen", "září", "říjen", "listopad", "prosinec"};
        String[] fr = new String[]{"janvier", "février", "mars", "avril", "mai", "juin", "juillet", "août", "septembre", "octobre", "novembre", "décembre"};
        String[] de = new String[]{"januar", "februar", "märz", "april", "mai", "juni", "juli", "august", "september", "oktober", "november", "dezember"};
        String[] en = new String[]{"january", "february", "march", "april", "may", "june", "july", "august", "september", "october", "november", "december"};
        String[] it = new String[]{"gennaio", "febbraio", "marzo", "aprile", "maggio", "giugno", "luglio", "agosto", "settembre", "ottobre", "novembre", "dicembre"};
        String[] sp = new String[]{"enero", "febrero", "marzo", "abril", "mayo", "junio", "julio", "agosto", "septiembre", "octubre", "noviembre", "diciembre"};
        String[] de_abbrev = new String[]{"jan.", "feb.", "märz", "apr.", "mai", "jun.", "jul.", "aug.", "sept.", "okt.", "nov.", "dez."};
        String[] en_abbrev = new String[]{"jan.", "feb.", "mar.", "apr.", "may", "jun.", "jul.", "aug.", "sep.", "oct.", "nov.", "dec."};
        String[] port = new String[]{"Janeiro", "Fevereiro", "Março", "Abril", "Maio", "Junho", "Julho", "Agosto", "Setembro", "Outubro", "Novembro", "Dezembro"};
        String[] rom_num = new String[]{"i", "ii", "iii", "iv", "v", "vi", "vii", "viii", "ix", "x", "xi", "xii"};

        String[][] perLang =  new String[][]{ck, de, fr, en, it, sp, port, de_abbrev, en_abbrev, rom_num};

        for (String[] months: perLang) {
            // the array index is the month number minus one
            for(int m = 1; m <= months.length; m++){
                // keys are stored trimmed and lower cased, independent of the default locale
                monthFromNameMap.put(months[m - 1].trim().toLowerCase(java.util.Locale.ROOT), m);
            }
        }

        // special cases
        monthFromNameMap.put("mar", 3);
        monthFromNameMap.put("dec", 12);
        monthFromNameMap.put("februari", 2);
        monthFromNameMap.put("març", 3);
    }
189

    
190

    
191
    DateTimeFormatter formatterYear = DateTimeFormat.forPattern("yyyy");
192

    
193
    private Map<String, Collection> collectionMap = new HashMap<>();
194

    
195
    private ExtensionType extensionTypeIAPTRegData = null;
196

    
197
    private Set<String> nameSet = new HashSet<>();
198
    private DefinedTermBase duplicateRegistration = null;
199

    
200
    enum TypesName {
201
        fieldUnit, holotype, isotype;
202

    
203
        public SpecimenTypeDesignationStatus status(){
204
            switch (this) {
205
                case holotype:
206
                    return SpecimenTypeDesignationStatus.HOLOTYPE();
207
                case isotype:
208
                    return SpecimenTypeDesignationStatus.ISOTYPE();
209
                default:
210
                    return null;
211
            }
212
        }
213
    }
214

    
215
    private MarkerType markerTypeFossil = null;
216
    private Rank rankUnrankedSupraGeneric = null;
217
    private Rank familyIncertisSedis = null;
218
    private AnnotationType annotationTypeCaveats = null;
219

    
220
    private Reference bookVariedadesTradicionales = null;
221

    
222
    /**
223
     * HACK for unit simple testing
224
     */
225
    boolean _testMode = System.getProperty("TEST_MODE") != null;
226

    
227
    private Taxon makeTaxon(Map<String, String> record, SimpleExcelTaxonImportState<CONFIG> state,
228
                            TaxonNode higherTaxonNode, boolean isFossil) {
229

    
230
        String regNumber = getValue(record, REGISTRATIONNO_PK, false);
231
        String regStr = getValue(record, REGISTRATION, true);
232
        String titleCacheStr = getValue(record, FULLNAME, true);
233
        String nameStr = getValue(record, NAMESTRING, true);
234
        String authorStr = getValue(record, AUTHORSTRING, true);
235
        String nomRefStr = getValue(record, LITSTRING, true);
236
        String authorsSpelling = getValue(record, AUTHORSSPELLING, true);
237
        String notesTxt = getValue(record, NOTESTXT, true);
238
        String caveats = getValue(record, CAVEATS, true);
239
        String fullSynSubstStr = getValue(record, FULLSYNSUBST, true);
240
        String fullBasionymStr = getValue(record, FULLBASIONYM, true);
241
        String basionymNameStr = getValue(record, FULLBASIONYM, true);
242
        String synSubstStr = getValue(record, SYNSUBSTSTR, true);
243
        String typeStr = getValue(record, TYPE, true);
244

    
245

    
246
        String nomRefTitle = null;
247
        String nomRefDetail;
248
        String nomRefPupDate = null;
249
        String nomRefIssue = null;
250
        Partial pupDate = null;
251

    
252
        boolean restoreOriginalReference = false;
253
        boolean nameIsValid = true;
254

    
255
        // preprocess nomRef: separate citation, reference detail, publishing date
256
        if(!StringUtils.isEmpty(nomRefStr)){
257
            nomRefStr = nomRefStr.trim();
258

    
259
            // handle the special case which is hard to parse:
260
            //
261
            // Las variedades tradicionales de frutales de la Cuenca del Río Segura. Catálogo Etnobotánico (1): Frutos secos, oleaginosos, frutales de hueso, almendros y frutales de pepita: 154. 1997.
262
            if(nomRefStr.startsWith("Las variedades tradicionales de frutales ")){
263

    
264
                if(bookVariedadesTradicionales == null){
265
                    bookVariedadesTradicionales = ReferenceFactory.newBook();
266
                    bookVariedadesTradicionales.setTitle("Las variedades tradicionales de frutales de la Cuenca del Río Segura. Catálogo Etnobotánico (1): Frutos secos, oleaginosos, frutales de hueso, almendros y frutales de pepita");
267
                    bookVariedadesTradicionales.setDatePublished(VerbatimTimePeriod.NewVerbatimInstance(1997));
268
                    getReferenceService().save(bookVariedadesTradicionales);
269
                }
270
                nomRefStr = nomRefStr.replaceAll("^.*?\\:.*?\\:", "Las variedades tradicionales:");
271
                restoreOriginalReference = true;
272
            }
273

    
274
            Matcher m = nomRefTokenizeP.matcher(nomRefStr);
275
            if(m.matches()){
276
                nomRefTitle = m.group("title");
277
                nomRefDetail = m.group("detail");
278
                nomRefPupDate = m.group("date").trim();
279
                nomRefIssue = m.group("issue");
280

    
281
                pupDate = parseDate(regNumber, nomRefPupDate);
282
                if (pupDate != null) {
283
                    nomRefTitle = nomRefTitle + ": " + nomRefDetail + ". " + pupDate.toString(formatterYear) + ".";
284
                } else {
285
                    logger.warn(csvReportLine(regNumber, "Pub date", nomRefPupDate, "in", nomRefStr, "not parsable"));
286
                }
287
            } else {
288
                nomRefTitle = nomRefStr;
289
            }
290
        }
291

    
292
        TaxonName taxonName = makeBotanicalName(state, regNumber, titleCacheStr, nameStr, authorStr, nomRefTitle);
293

    
294
        // always add the original strings of parsed data as annotation
295
        taxonName.addAnnotation(Annotation.NewInstance("imported and parsed data strings:" +
296
                        "\n -  '" + LITSTRING + "': "+ nomRefStr +
297
                        "\n -  '" + TYPE + "': " + typeStr +
298
                        "\n -  '" + REGISTRATION  + "': " + regStr
299
                , AnnotationType.TECHNICAL(), Language.DEFAULT()));
300

    
301
        if(restoreOriginalReference){
302
            taxonName.setNomenclaturalReference(bookVariedadesTradicionales);
303
        }
304

    
305
        if(taxonName.getNomenclaturalReference() != null){
306
            if(pupDate != null) {
307
                taxonName.getNomenclaturalReference().setDatePublished(VerbatimTimePeriod.NewVerbatimInstance(pupDate));
308
            }
309
            if(nomRefIssue != null) {
310
                taxonName.getNomenclaturalReference().setVolume(nomRefIssue);
311
            }
312
        }
313

    
314

    
315
        if(!StringUtils.isEmpty(notesTxt)){
316
            notesTxt = notesTxt.replace("Notes: ", "").trim();
317
            taxonName.addAnnotation(Annotation.NewInstance(notesTxt, AnnotationType.EDITORIAL(), Language.DEFAULT()));
318
            nameIsValid = false;
319

    
320
        }
321
        if(!StringUtils.isEmpty(caveats)){
322
            caveats = caveats.replace("Caveats: ", "").trim();
323
            taxonName.addAnnotation(Annotation.NewInstance(caveats, annotationTypeCaveats(), Language.DEFAULT()));
324
            nameIsValid = false;
325
        }
326

    
327
        if(nameIsValid){
328
            // Status is always considered valid if no notes and cavets are set
329
            taxonName.addStatus(NomenclaturalStatus.NewInstance(NomenclaturalStatusType.VALID()));
330
        }
331

    
332
        getNameService().save(taxonName);
333

    
334
        // Namerelations
335
        if(!StringUtils.isEmpty(authorsSpelling)){
336
            authorsSpelling = authorsSpelling.replaceFirst("Author's spelling:", "").replaceAll("\"", "").trim();
337

    
338
            String[] authorSpellingTokens = StringUtils.split(authorsSpelling, " ");
339
            String[] nameStrTokens = StringUtils.split(nameStr, " ");
340

    
341
            ArrayUtils.reverse(authorSpellingTokens);
342
            ArrayUtils.reverse(nameStrTokens);
343

    
344
            for (int i = 0; i < nameStrTokens.length; i++){
345
                if(i < authorSpellingTokens.length){
346
                    nameStrTokens[i] = authorSpellingTokens[i];
347
                }
348
            }
349
            ArrayUtils.reverse(nameStrTokens);
350

    
351
            String misspelledNameStr = StringUtils.join (nameStrTokens, ' ');
352
            // build the fullnameString of the misspelled name
353
            misspelledNameStr = taxonName.getTitleCache().replace(nameStr, misspelledNameStr);
354

    
355
            TaxonName misspelledName = nameParser.parseReferencedName(misspelledNameStr, NomenclaturalCode.ICNAFP, null);
356
            misspelledName.addRelationshipToName(taxonName, NameRelationshipType.MISSPELLING(), null, null);
357
            getNameService().save(misspelledName);
358
        }
359

    
360
        // Replaced Synonyms
361
        if(!StringUtils.isEmpty(fullSynSubstStr)){
362
            fullSynSubstStr = fullSynSubstStr.replace("Syn. subst.: ", "");
363
            TaxonName replacedSynonymName = makeBotanicalName(state, regNumber, fullSynSubstStr, synSubstStr, null, null);
364
            replacedSynonymName.addReplacedSynonym(taxonName, null, null, null, null);
365
            getNameService().save(replacedSynonymName);
366
        }
367

    
368
        Reference sec = state.getConfig().getSecReference();
369
        Taxon taxon = Taxon.NewInstance(taxonName, sec);
370

    
371
        // Basionym
372
        if(fullBasionymStr != null){
373
            fullBasionymStr = fullBasionymStr.replaceAll("^\\w*:\\s", ""); // Strip off the leading 'Basionym: "
374
            basionymNameStr = basionymNameStr.replaceAll("^\\w*:\\s", ""); // Strip off the leading 'Basionym: "
375
            TaxonName basionym = makeBotanicalName(state, regNumber, fullBasionymStr, basionymNameStr, null, null);
376
            getNameService().save(basionym);
377
            taxonName.addBasionym(basionym);
378

    
379
            Synonym syn = Synonym.NewInstance(basionym, sec);
380
            taxon.addSynonym(syn, SynonymType.HOMOTYPIC_SYNONYM_OF);
381
            getTaxonService().save(syn);
382
        }
383

    
384
        // Markers
385
        if(isFossil){
386
            taxon.addMarker(Marker.NewInstance(markerTypeFossil(), true));
387
        }
388
        if(!nameSet.add(titleCacheStr)){
389
            taxonName.addMarker(Marker.NewInstance(markerDuplicateRegistration(), true));
390
            logger.warn(csvReportLine(regNumber, "Duplicate registration of", titleCacheStr));
391
        }
392

    
393

    
394
        // Types
395
        if(!StringUtils.isEmpty(typeStr)){
396

    
397
            if(taxonName.getRank().isSpecies() || taxonName.getRank().isLowerThan(RankClass.Species)) {
398
                makeSpecimenTypeData(typeStr, taxonName, regNumber, state, false);
399
            } else {
400
                makeNameTypeData(typeStr, taxonName, regNumber, state);
401
            }
402
        }
403

    
404
        getTaxonService().save(taxon);
405

    
406
        if(taxonName.getRank().equals(Rank.SPECIES()) || taxonName.getRank().isLowerThan(RankClass.Species)){
407
            // try to find the genus, it should have been imported already, Genera are coming first in the import file
408
            Taxon genus = ((IAPTImportState)state).getGenusTaxonMap().get(taxonName.getGenusOrUninomial());
409
            if(genus != null){
410
                higherTaxonNode = genus.getTaxonNodes().iterator().next();
411
            } else {
412
                logger.info(csvReportLine(regNumber, "Parent genus not found for", nameStr));
413
            }
414
        }
415

    
416
        if(higherTaxonNode != null){
417
            higherTaxonNode.addChildTaxon(taxon, null, null);
418
            getTaxonNodeService().save(higherTaxonNode);
419
        }
420

    
421
        if(taxonName.getRank().isGenus()){
422
            ((IAPTImportState)state).getGenusTaxonMap().put(taxonName.getGenusOrUninomial(), taxon);
423
        }
424

    
425
        return taxon;
426
    }
427

    
428
    private void makeSpecimenTypeData(String typeStr, TaxonName taxonName, String regNumber, SimpleExcelTaxonImportState<CONFIG> state, boolean isFossil) {
429

    
430
        Matcher m = typeSpecimenSplitPattern.matcher(typeStr);
431

    
432
        if(m.matches()){
433
            String fieldUnitStr = m.group(TypesName.fieldUnit.name());
434
            // boolean isFieldUnit = typeStr.matches(".*([°']|\\d+\\s?m\\s|\\d+\\s?km\\s).*"); // check for location or unit m, km // makes no sense!!!!
435
            FieldUnit fieldUnit = parseFieldUnit(fieldUnitStr, regNumber, state);
436
            if(fieldUnit == null) {
437
                // create a field unit with only a titleCache using the fieldUnitStr substring
438
                logger.warn(csvReportLine(regNumber, "Type: fieldUnitStr can not be parsed", fieldUnitStr));
439
                fieldUnit = FieldUnit.NewInstance();
440
                fieldUnit.setTitleCache(fieldUnitStr, true);
441
                getOccurrenceService().save(fieldUnit);
442
            }
443
            getOccurrenceService().save(fieldUnit);
444

    
445
            SpecimenOrObservationType specimenType;
446
            if(isFossil){
447
                specimenType = SpecimenOrObservationType.Fossil;
448
            } else {
449
                specimenType = SpecimenOrObservationType.PreservedSpecimen;
450
            }
451

    
452
            // all others ..
453
            addSpecimenTypes(taxonName, fieldUnit, m.group(TypesName.holotype.name()), TypesName.holotype, false, regNumber, specimenType);
454
            addSpecimenTypes(taxonName, fieldUnit, m.group(TypesName.isotype.name()), TypesName.isotype, true, regNumber, specimenType);
455

    
456
        } else {
457
            // create a field unit with only a titleCache using the full typeStr
458
            FieldUnit fieldUnit = FieldUnit.NewInstance();
459
            fieldUnit.setTitleCache(typeStr, true);
460
            getOccurrenceService().save(fieldUnit);
461
            logger.warn(csvReportLine(regNumber, "Type: field 'Type' can not be parsed", typeStr));
462
        }
463
        getNameService().save(taxonName);
464
    }
465

    
466
    private void makeNameTypeData(String typeStr, IBotanicalName taxonName, String regNumber, SimpleExcelTaxonImportState<CONFIG> state) {
467

    
468
        String nameStr = typeStr.replaceAll("^Type\\s?\\:\\s?", "");
469
        if(nameStr.isEmpty()) {
470
            return;
471
        }
472

    
473
        String basionymNameStr = null;
474
        String noteStr = null;
475
        String agentStr = null;
476

    
477
        Matcher m;
478

    
479
        if(typeStr.startsWith("not to be indicated")){
480
            // Special case:
481
            // Type: not to be indicated (Art. H.9.1. Tokyo Code); stated parent genera: Hechtia Klotzsch; Deuterocohnia Mez
482
            // FIXME
483
            m = typeNameSpecialSplitPattern.matcher(nameStr);
484
            if(m.matches()){
485
                nameStr = m.group("name");
486
                noteStr = m.group("note");
487
                agentStr = m.group("agent");
488
                // TODO better import of agent?
489
                if(agentStr != null){
490
                    noteStr = noteStr + ": " + agentStr;
491
                }
492
            }
493
        } else {
494
            // Generic case
495
            m = typeNameBasionymPattern.matcher(nameStr);
496
            if (m.find()) {
497
                basionymNameStr = m.group("basionymName");
498
                if (basionymNameStr != null) {
499
                    nameStr = nameStr.replace(m.group(0), "");
500
                }
501
            }
502

    
503
            m = typeNameNotePattern.matcher(nameStr);
504
            if (m.find()) {
505
                noteStr = m.group(1);
506
                if (noteStr != null) {
507
                    nameStr = nameStr.replace(m.group(0), "");
508
                }
509
            }
510
        }
511

    
512
        TaxonName typeName = (TaxonName) nameParser.parseFullName(nameStr, NomenclaturalCode.ICNAFP, null);
513

    
514
        if(typeName.isProtectedTitleCache() || typeName.getNomenclaturalReference() != null && typeName.getNomenclaturalReference().isProtectedTitleCache()) {
515
            logger.warn(csvReportLine(regNumber, "NameType not parsable", typeStr, nameStr));
516
        }
517

    
518
        if(basionymNameStr != null){
519
            TaxonName basionymName = (TaxonName) nameParser.parseFullName(nameStr, NomenclaturalCode.ICNAFP, null);
520
            getNameService().save(basionymName);
521
            typeName.addBasionym(basionymName);
522
        }
523

    
524

    
525
        NameTypeDesignation nameTypeDesignation = NameTypeDesignation.NewInstance();
526
        nameTypeDesignation.setTypeName(typeName);
527
        getNameService().save(typeName);
528

    
529
        if(noteStr != null){
530
            nameTypeDesignation.addAnnotation(Annotation.NewInstance(noteStr, AnnotationType.EDITORIAL(), Language.UNKNOWN_LANGUAGE()));
531
        }
532
        taxonName.addNameTypeDesignation(typeName, null, null, null, null, false);
533

    
534
    }
535

    
536
    /**
537
     * Currently only parses the collector, fieldNumber and the collection date.
538
     *
539
     * @param fieldUnitStr
540
     * @param regNumber
541
     * @param state
542
     * @return null if the fieldUnitStr could not be parsed
543
     */
544
    private FieldUnit parseFieldUnit(String fieldUnitStr, String regNumber, SimpleExcelTaxonImportState<CONFIG> state) {
545

    
546
        FieldUnit fieldUnit = null;
547

    
548
        Matcher m1 = collectorPattern.matcher(fieldUnitStr);
549
        if(m1.matches()){
550

    
551
            String collectorData = m1.group(2); // like ... (leg. Metzeltin, 30. 9. 1996)
552
            String removal = m1.group(1);
553
            if(collectorData == null){
554
                collectorData = m1.group(4); // like ... leg. Metzeltin, 30. 9. 1996
555
                removal = m1.group(3);
556
            }
557
            if(collectorData == null){
558
                collectorData = m1.group(6); // like ^leg. J. J. Halda 18.3.1997$
559
                removal = null;
560
            }
561
            if(collectorData == null){
562
                return null;
563
            }
564

    
565
            // the fieldUnitStr is parsable
566
            // remove all collectorData from the fieldUnitStr and use the rest as locality
567
            String locality = null;
568
            if(removal != null){
569
                locality = fieldUnitStr.replace(removal, "");
570
            }
571

    
572
            String collectorStr = null;
573
            String detailStr = null;
574
            Partial date = null;
575
            String fieldNumber = null;
576

    
577
            Matcher m2 = collectionDataPattern.matcher(collectorData);
578
            if(m2.matches()){
579
                collectorStr = m2.group("collector");
580
                detailStr = m2.group("detail");
581

    
582
                // Try to make sense of the detailStr
583
                if(detailStr != null){
584
                    detailStr = detailStr.trim();
585
                    // 1. try to parse as date
586
                    date = parseDate(regNumber, detailStr);
587
                    if(date == null){
588
                        // 2. try to parse as number
589
                        if(collectorsNumber.matcher(detailStr).matches()){
590
                            fieldNumber = detailStr;
591
                        }
592
                    }
593
                }
594
                if(date == null && fieldNumber == null){
595
                    // detailed parsing not possible, so need fo fallback
596
                    collectorStr = collectorData;
597
                }
598
            }
599

    
600
            if(collectorStr == null) {
601
                collectorStr = collectorData;
602
            }
603

    
604
            fieldUnit = FieldUnit.NewInstance();
605
            GatheringEvent ge = GatheringEvent.NewInstance();
606
            if(locality != null){
607
                ge.setLocality(LanguageString.NewInstance(locality, Language.UNKNOWN_LANGUAGE()));
608
            }
609

    
610
            TeamOrPersonBase agent =  state.getAgentBase(collectorStr);
611
            if(agent == null) {
612
                agent = Person.NewTitledInstance(collectorStr);
613
                getAgentService().save(agent);
614
                state.putAgentBase(collectorStr, agent);
615
            }
616
            ge.setCollector(agent);
617

    
618
            if(date != null){
619
                ge.setGatheringDate(date);
620
            }
621

    
622
            getEventBaseService().save(ge);
623
            fieldUnit.setGatheringEvent(ge);
624

    
625
            if(fieldNumber != null) {
626
                fieldUnit.setFieldNumber(fieldNumber);
627
            }
628
            getOccurrenceService().save(fieldUnit);
629

    
630
        }
631

    
632
        return fieldUnit;
633
    }
634

    
635
    protected Partial parseDate(String regNumber, String dateStr) {
636

    
637
        Partial pupDate = null;
638
        boolean parseError = false;
639

    
640
        String day = null;
641
        String month = null;
642
        String monthName = null;
643
        String year = null;
644

    
645
        for(Pattern p : datePatterns){
646
            Matcher m2 = p.matcher(dateStr);
647
            if(m2.matches()){
648
                try {
649
                    year = m2.group("year");
650
                } catch (IllegalArgumentException e){
651
                    // named capture group not found
652
                }
653
                try {
654
                    month = m2.group("month");
655
                } catch (IllegalArgumentException e){
656
                    // named capture group not found
657
                }
658

    
659
                try {
660
                    monthName = m2.group("monthName");
661
                    month = monthFromName(monthName, regNumber);
662
                    if(month == null){
663
                        parseError = true;
664
                    }
665
                } catch (IllegalArgumentException e){
666
                    // named capture group not found
667
                }
668
                try {
669
                    day = m2.group("day");
670
                } catch (IllegalArgumentException e){
671
                    // named capture group not found
672
                }
673

    
674
                if(year != null){
675
                    if (year.length() == 2) {
676
                        // it is an abbreviated year from the 19** years
677
                        year = "19" + year;
678
                    }
679
                    break;
680
                } else {
681
                    parseError = true;
682
                }
683
            }
684
        }
685
        if(year == null){
686
            parseError = true;
687
        }
688
        List<DateTimeFieldType> types = new ArrayList<>();
689
        List<Integer> values = new ArrayList<>();
690
        if(!parseError) {
691
            types.add(DateTimeFieldType.year());
692
            values.add(Integer.parseInt(year));
693
            if (month != null) {
694
                types.add(DateTimeFieldType.monthOfYear());
695
                values.add(Integer.parseInt(month));
696
            }
697
            if (day != null) {
698
                types.add(DateTimeFieldType.dayOfMonth());
699
                values.add(Integer.parseInt(day));
700
            }
701
            pupDate = new Partial(types.toArray(new DateTimeFieldType[types.size()]), ArrayUtils.toPrimitive(values.toArray(new Integer[values.size()])));
702
        }
703
        return pupDate;
704
    }
705

    
706
    private String monthFromName(String monthName, String regNumber) {
707

    
708
        Integer month = monthFromNameMap.get(monthName.toLowerCase());
709
        if(month == null){
710
            logger.warn(csvReportLine(regNumber, "Unknown month name", monthName));
711
            return null;
712
        } else {
713
            return month.toString();
714
        }
715
    }
716

    
717

    
718
    private void addSpecimenTypes(IBotanicalName taxonName, FieldUnit fieldUnit, String typeStr, TypesName typeName, boolean multiple, String regNumber, SpecimenOrObservationType specimenType){
719

    
720
        if(StringUtils.isEmpty(typeStr)){
721
            return;
722
        }
723
        typeStr = typeStr.trim().replaceAll("\\.$", "");
724

    
725
        Collection collection = null;
726
        DerivedUnit specimen = null;
727

    
728
        List<DerivedUnit> specimens = new ArrayList<>();
729
        if(multiple){
730
            String[] tokens = typeStr.split("\\s?,\\s?");
731
            for (String t : tokens) {
732
                // command to  list all complex parsabel types:
733
                // csvcut -t -c RegistrationNo_Pk,Type iapt.csv | csvgrep -c Type -m "Holotype" | egrep -o 'Holotype:\s([A-Z]*\s)[^.]*?'
734
                // csvcut -t -c RegistrationNo_Pk,Type iapt.csv | csvgrep -c Type -m "Holotype" | egrep -o 'Isotype[^:]*:\s([A-Z]*\s)[^.]*?'
735

    
736
                if(!t.isEmpty()){
737
                    // trying to parse the string
738
                    specimen = parseSpecimenType(fieldUnit, typeName, collection, t, regNumber);
739
                    if(specimen != null){
740
                        specimens.add(specimen);
741
                    } else {
742
                        // parsing was not successful make simple specimen
743
                        specimens.add(makeSpecimenType(fieldUnit, t, specimenType));
744
                    }
745
                }
746
            }
747
        } else {
748
            specimen = parseSpecimenType(fieldUnit, typeName, collection, typeStr, regNumber);
749
            if(specimen != null) {
750
                specimens.add(specimen);
751
                // remember current collection
752
                collection = specimen.getCollection();
753
            } else {
754
                // parsing was not successful make simple specimen
755
                specimens.add(makeSpecimenType(fieldUnit, typeStr, SpecimenOrObservationType.PreservedSpecimen));
756
            }
757
        }
758

    
759
        for(DerivedUnit s : specimens){
760
            taxonName.addSpecimenTypeDesignation(s, typeName.status(), null, null, null, false, true);
761
       }
762
    }
763

    
764
    private DerivedUnit makeSpecimenType(FieldUnit fieldUnit, String titleCache, SpecimenOrObservationType specimenType) {
765
        DerivedUnit specimen;DerivedUnitFacade facade = DerivedUnitFacade.NewInstance(specimenType, fieldUnit);
766
        facade.setTitleCache(titleCache.trim(), true);
767
        specimen = facade.innerDerivedUnit();
768
        return specimen;
769
    }
770

    
771
    /**
772
     *
773
     * @param fieldUnit
774
     * @param typeName
775
     * @param collection
776
     * @param text
777
     * @param regNumber
778
     * @return
779
     */
780
    protected DerivedUnit parseSpecimenType(FieldUnit fieldUnit, TypesName typeName, Collection collection, String text, String regNumber) {
781

    
782
        DerivedUnit specimen = null;
783

    
784
        String collectionCode = null;
785
        String subCollectionStr = null;
786
        String instituteStr = null;
787
        String accessionNumber = null;
788

    
789
        boolean unusualAccessionNumber = false;
790

    
791
        text = text.trim();
792

    
793
        // 1.  For Isotypes often the accession number is noted alone if the
794
        //     preceeding entry has a collection code.
795
        if(typeName .equals(TypesName.isotype) && collection != null){
796
            Matcher m = accessionNumberOnlyPattern.matcher(text);
797
            if(m.matches()){
798
                try {
799
                    accessionNumber = m.group("accNumber");
800
                    specimen = makeSpecimenType(fieldUnit, collection, accessionNumber);
801
                } catch (IllegalArgumentException e){
802
                    // match group acc_number not found
803
                }
804
            }
805
        }
806

    
807
        //2. try it the 'normal' way
808
        if(specimen == null) {
809
            for (Pattern p : specimenTypePatterns) {
810
                Matcher m = p.matcher(text);
811
                if (m.matches()) {
812
                    // collection code or collectionTitle is mandatory
813
                    try {
814
                        collectionCode = m.group("colCode");
815
                    } catch (IllegalArgumentException e){
816
                        // match group colCode not found
817
                    }
818

    
819
                    try {
820
                        instituteStr = m.group("institute");
821
                    } catch (IllegalArgumentException e){
822
                        // match group col_name not found
823
                    }
824

    
825
                    try {
826
                        subCollectionStr = m.group("subCollection");
827
                    } catch (IllegalArgumentException e){
828
                        // match group subCollection not found
829
                    }
830
                    try {
831
                        accessionNumber = m.group("accNumber");
832

    
833
                        // try to improve the accessionNumber
834
                        if(accessionNumber!= null) {
835
                            accessionNumber = accessionNumber.trim();
836
                            Matcher m2 = accessionNumberOnlyPattern.matcher(accessionNumber);
837
                            String betterAccessionNumber = null;
838
                            if (m2.matches()) {
839
                                try {
840
                                    betterAccessionNumber = m.group("accNumber");
841
                                } catch (IllegalArgumentException e) {
842
                                    // match group acc_number not found
843
                                }
844
                            }
845
                            if (betterAccessionNumber != null) {
846
                                accessionNumber = betterAccessionNumber;
847
                            } else {
848
                                unusualAccessionNumber = true;
849
                            }
850
                        }
851

    
852
                    } catch (IllegalArgumentException e){
853
                        // match group acc_number not found
854
                    }
855

    
856
                    if(collectionCode == null && instituteStr == null){
857
                        logger.warn(csvReportLine(regNumber, "Type: neither 'collectionCode' nor 'institute' found in ", text));
858
                        continue;
859
                    }
860
                    collection = getCollection(collectionCode, instituteStr, subCollectionStr);
861
                    specimen = makeSpecimenType(fieldUnit, collection, accessionNumber);
862
                    break;
863
                }
864
            }
865
        }
866
        if(specimen == null) {
867
            logger.warn(csvReportLine(regNumber, "Type: Could not parse specimen", typeName.name().toString(), text));
868
        }
869
        if(unusualAccessionNumber){
870
            logger.warn(csvReportLine(regNumber, "Type: Unusual accession number", typeName.name().toString(), text, accessionNumber));
871
        }
872
        return specimen;
873
    }
874

    
875
    private DerivedUnit makeSpecimenType(FieldUnit fieldUnit, Collection collection, String accessionNumber) {
876

    
877
        DerivedUnitFacade facade = DerivedUnitFacade.NewInstance(SpecimenOrObservationType.PreservedSpecimen, fieldUnit);
878
        facade.setCollection(collection);
879
        if(accessionNumber != null){
880
            facade.setAccessionNumber(accessionNumber);
881
        }
882
        return facade.innerDerivedUnit();
883
    }
884

    
885
    private TaxonName makeBotanicalName(SimpleExcelTaxonImportState<CONFIG> state, String regNumber, String titleCacheStr, String nameStr,
886
                                            String authorStr, String nomRefTitle) {
887

    
888
        TaxonName taxonName;// cache field for the taxonName.titleCache
889
        String taxonNameTitleCache = null;
890
        Map<String, AnnotationType> nameAnnotations = new HashMap<>();
891

    
892
        // TitleCache preprocessing
893
        if(titleCacheStr.endsWith(ANNOTATION_MARKER_STRING) || (authorStr != null && authorStr.endsWith(ANNOTATION_MARKER_STRING))){
894
            nameAnnotations.put("Author abbreviation not checked.", AnnotationType.EDITORIAL());
895
            titleCacheStr = titleCacheStr.replace(ANNOTATION_MARKER_STRING, "").trim();
896
            if(authorStr != null) {
897
                authorStr = authorStr.replace(ANNOTATION_MARKER_STRING, "").trim();
898
            }
899
        }
900

    
901
        // parse the full taxon name
902
        if(!StringUtils.isEmpty(nomRefTitle)){
903
            String referenceSeparator = nomRefTitle.startsWith("in ") ? " " : ", ";
904
            String taxonFullNameStr = titleCacheStr + referenceSeparator + nomRefTitle;
905
            logger.debug(":::::" + taxonFullNameStr);
906
            taxonName = nameParser.parseReferencedName(taxonFullNameStr, NomenclaturalCode.ICNAFP, null);
907
        } else {
908
            taxonName = (TaxonName) nameParser.parseFullName(titleCacheStr, NomenclaturalCode.ICNAFP, null);
909
        }
910

    
911
        taxonNameTitleCache = taxonName.getTitleCache().trim();
912
        if (taxonName.isProtectedTitleCache()) {
913
            logger.warn(csvReportLine(regNumber, "Name could not be parsed", titleCacheStr));
914
        } else {
915

    
916
            boolean doRestoreTitleCacheStr = false;
917

    
918
            // Check if titleCache and nameCache are plausible
919
            String titleCacheCompareStr = titleCacheStr;
920
            String nameCache = taxonName.getNameCache();
921
            String nameCompareStr = nameStr;
922
            if(taxonName.isBinomHybrid()){
923
                titleCacheCompareStr = titleCacheCompareStr.replace(" x ", " ×");
924
                nameCompareStr = nameCompareStr.replace(" x ", " ×");
925
            }
926
            if(taxonName.isMonomHybrid()){
927
                titleCacheCompareStr = titleCacheCompareStr.replaceAll("^X ", "× ");
928
                nameCompareStr = nameCompareStr.replace("^X ", "× ");
929
            }
930
            if(authorStr != null && authorStr.contains(" et ")){
931
                titleCacheCompareStr = titleCacheCompareStr.replaceAll(" et ", " & ");
932
            }
933
            if (!taxonNameTitleCache.equals(titleCacheCompareStr)) {
934
                logger.warn(csvReportLine(regNumber, "The generated titleCache differs from the imported string", taxonNameTitleCache, " != ", titleCacheStr, " ==> original titleCacheStr has been restored"));
935
                doRestoreTitleCacheStr = true;
936
            }
937
            if (!nameCache.trim().equals(nameCompareStr)) {
938
                logger.warn(csvReportLine(regNumber, "The parsed nameCache differs from field '" + NAMESTRING + "'", nameCache, " != ", nameCompareStr));
939
            }
940

    
941
            //  Author
942
            //nameParser.handleAuthors(taxonName, titleCacheStr, authorStr);
943
            //if (!titleCacheStr.equals(taxonName.getTitleCache())) {
944
            //    logger.warn(regNumber + ": titleCache has changed after setting authors, will restore original titleCacheStr");
945
            //    doRestoreTitleCacheStr = true;
946
            //}
947

    
948
            if(doRestoreTitleCacheStr){
949
                taxonName.setTitleCache(titleCacheStr, true);
950
            }
951

    
952
            // deduplicate
953
            replaceAuthorNamesAndNomRef(state, taxonName);
954
        }
955

    
956
        // Annotations
957
        if(!nameAnnotations.isEmpty()){
958
            for(String text : nameAnnotations.keySet()){
959
                taxonName.addAnnotation(Annotation.NewInstance(text, nameAnnotations.get(text), Language.DEFAULT()));
960
            }
961
        }
962

    
963
        taxonName.addSource(OriginalSourceType.Import, regNumber, null, state.getConfig().getSourceReference(), null);
964

    
965
        getNameService().save(taxonName);
966

    
967
        return taxonName;
968
    }
969

    
970
    /**
971
     * @param state
972
     * @return
973
     */
974
    private TaxonNode getClassificationRootNode(IAPTImportState state) {
975

    
976
     //   Classification classification = state.getClassification();
977
     //   if (classification == null){
978
     //       IAPTImportConfigurator config = state.getConfig();
979
     //       classification = Classification.NewInstance(state.getConfig().getClassificationName());
980
     //       classification.setUuid(config.getClassificationUuid());
981
     //       classification.setReference(config.getSecReference());
982
     //       classification = getClassificationService().find(state.getConfig().getClassificationUuid());
983
     //   }
984
        TaxonNode rootNode = state.getRootNode();
985
        if (rootNode == null){
986
            rootNode = getTaxonNodeService().find(ROOT_UUID);
987
        }
988
        if (rootNode == null){
989
            Classification classification = state.getClassification();
990
            if (classification == null){
991
                Reference sec = state.getSecReference();
992
                String classificationName = state.getConfig().getClassificationName();
993
                Language language = Language.DEFAULT();
994
                classification = Classification.NewInstance(classificationName, sec, language);
995
                state.setClassification(classification);
996
                classification.setUuid(state.getConfig().getClassificationUuid());
997
                classification.getRootNode().setUuid(ROOT_UUID);
998
                getClassificationService().save(classification);
999
            }
1000
            rootNode = classification.getRootNode();
1001
            state.setRootNode(rootNode);
1002
        }
1003
        return rootNode;
1004
    }
1005

    
1006
    private Collection getCollection(String collectionCode, String instituteStr, String subCollectionStr){
1007

    
1008
        Collection superCollection = null;
1009
        if(subCollectionStr != null){
1010
            superCollection = getCollection(collectionCode, instituteStr, null);
1011
            collectionCode = subCollectionStr;
1012
            instituteStr = null;
1013
        }
1014

    
1015
        final String key = collectionCode + "-#i:" + StringUtils.defaultString(instituteStr);
1016

    
1017
        Collection collection = collectionMap.get(key);
1018

    
1019
        if(collection == null) {
1020
            collection = Collection.NewInstance();
1021
            collection.setCode(collectionCode);
1022
            if(instituteStr != null){
1023
                collection.setInstitute(Institution.NewNamedInstance(instituteStr));
1024
            }
1025
            if(superCollection != null){
1026
                collection.setSuperCollection(superCollection);
1027
            }
1028
            collectionMap.put(key, collection);
1029
            if(!_testMode) {
1030
                getCollectionService().save(collection);
1031
            }
1032
        }
1033

    
1034
        return collection;
1035
    }
1036

    
1037

    
1038
    /**
1039
     * @param record
1040
     * @param originalKey
1041
     * @param doUnescapeHtmlEntities
1042
     * @return
1043
     */
1044
    private String getValue(Map<String, String> record, String originalKey, boolean doUnescapeHtmlEntities) {
1045
        String value = record.get(originalKey);
1046

    
1047
        value = fixCharacters(value);
1048

    
1049
        if (! StringUtils.isBlank(value)) {
1050
        	if (logger.isDebugEnabled()) {
1051
        	    logger.debug(originalKey + ": " + value);
1052
        	}
1053
        	value = CdmUtils.removeDuplicateWhitespace(value.trim()).toString();
1054
            if(doUnescapeHtmlEntities){
1055
                value = StringEscapeUtils.unescapeHtml(value);
1056
            }
1057
        	return value.trim();
1058
        }else{
1059
        	return null;
1060
        }
1061
    }
1062

    
1063
    /**
1064
     * Fixes broken characters.
1065
     * For details see
1066
     * https://dev.e-taxonomy.eu/redmine/issues/6035
1067
     *
1068
     * @param value
1069
     * @return
1070
     */
1071
    private String fixCharacters(String value) {
1072

    
1073
        value = StringUtils.replace(value, "s$K", "š");
1074
        value = StringUtils.replace(value, "n$K", "ň");
1075
        value = StringUtils.replace(value, "e$K", "ě");
1076
        value = StringUtils.replace(value, "r$K", "ř");
1077
        value = StringUtils.replace(value, "c$K", "č");
1078
        value = StringUtils.replace(value, "z$K", "ž");
1079
        value = StringUtils.replace(value, "S>U$K", "Š");
1080
        value = StringUtils.replace(value, "C>U$K", "Č");
1081
        value = StringUtils.replace(value, "R>U$K", "Ř");
1082
        value = StringUtils.replace(value, "Z>U$K", "Ž");
1083
        value = StringUtils.replace(value, "g$K", "ǧ");
1084
        value = StringUtils.replace(value, "s$A", "ś");
1085
        value = StringUtils.replace(value, "n$A", "ń");
1086
        value = StringUtils.replace(value, "c$A", "ć");
1087
        value = StringUtils.replace(value, "e$E", "ę");
1088
        value = StringUtils.replace(value, "o$H", "õ");
1089
        value = StringUtils.replace(value, "s$C", "ş");
1090
        value = StringUtils.replace(value, "t$C", "ț");
1091
        value = StringUtils.replace(value, "S>U$C", "Ş");
1092
        value = StringUtils.replace(value, "a$O", "å");
1093
        value = StringUtils.replace(value, "A>U$O", "Å");
1094
        value = StringUtils.replace(value, "u$O", "ů");
1095
        value = StringUtils.replace(value, "g$B", "ğ");
1096
        value = StringUtils.replace(value, "g$B", "ĕ");
1097
        value = StringUtils.replace(value, "a$B", "ă");
1098
        value = StringUtils.replace(value, "l$/", "ł");
1099
        value = StringUtils.replace(value, ">i", "ı");
1100
        value = StringUtils.replace(value, "i$U", "ï");
1101
        // Special-cases
1102
        value = StringUtils.replace(value, "&yacute", "ý");
1103
        value = StringUtils.replace(value, ">L", "Ł"); // corrected rule
1104
        value = StringUtils.replace(value, "E>U$D", "З");
1105
        value = StringUtils.replace(value, "S>U$E", "Ş");
1106
        value = StringUtils.replace(value, "s$E", "ş");
1107

    
1108
        value = StringUtils.replace(value, "c$k", "č");
1109
        value = StringUtils.replace(value, " U$K", " Š");
1110

    
1111
        value = StringUtils.replace(value, "O>U>!", "Ø");
1112
        value = StringUtils.replace(value, "o>!", "ø");
1113
        value = StringUtils.replace(value, "S$K", "Ŝ");
1114
        value = StringUtils.replace(value, ">l", "ğ");
1115

    
1116
        value = StringUtils.replace(value, "§B>i", "ł");
1117
        value = StringUtils.replace(value, "¤", "ń");
1118

    
1119
        return value;
1120
    }
1121

    
1122

    
1123
    /**
1124
	 *  Stores taxa records in DB
1125
	 */
1126
	@Override
1127
    protected void firstPass(SimpleExcelTaxonImportState<CONFIG> state) {
1128

    
1129
        if(excludeFromImport(state)){
1130
            return;
1131
        }
1132

    
1133
        String lineNumber = "L#" + state.getCurrentLine() + ": ";
1134
        LogUtils.setLevel(logger, Level.DEBUG);
1135
        Map<String, String> record = state.getOriginalRecord();
1136
        logger.debug(lineNumber + record.toString());
1137

    
1138
        Set<String> keys = record.keySet();
1139
        for (String key: keys) {
1140
            if (! expectedKeys.contains(key)){
1141
                logger.warn(lineNumber + "Unexpected Key: " + key);
1142
            }
1143
        }
1144

    
1145
        String reg_id = record.get(REGISTRATIONNO_PK);
1146

    
1147
        //higherTaxon
1148
        String higherTaxaString = record.get(HIGHERTAXON);
1149
        boolean isFossil = false;
1150
        if(higherTaxaString.startsWith("FOSSIL ")){
1151
            higherTaxaString = higherTaxaString.replace("FOSSIL ", "");
1152
            isFossil = true;
1153
        }
1154
        TaxonNode higherTaxon = getHigherTaxon(higherTaxaString, (IAPTImportState)state);
1155

    
1156
       //Taxon
1157
        Taxon taxon = makeTaxon(record, state, higherTaxon, isFossil);
1158
        if (taxon == null){
1159
            logger.warn(lineNumber + "taxon could not be created and is null");
1160
            return;
1161
        }
1162
        ((IAPTImportState)state).setCurrentTaxon(taxon);
1163

    
1164
        // Registration
1165
        IAPTRegData regData = makeIAPTRegData(state);
1166
        ObjectMapper mapper = new ObjectMapper();
1167
        try {
1168
            String regdataJson = mapper.writeValueAsString(regData);
1169
            Extension.NewInstance(taxon.getName(), regdataJson, getExtensionTypeIAPTRegData());
1170
            getNameService().save(taxon.getName());
1171
        } catch (JsonProcessingException e) {
1172
            logger.error("Error on converting IAPTRegData", e);
1173
        }
1174

    
1175
        logger.info("#of imported Genera: " + ((IAPTImportState) state).getGenusTaxonMap().size());
1176
		return;
1177
    }
1178

    
1179
    private boolean excludeFromImport(SimpleExcelTaxonImportState<CONFIG> state) {
1180
        if(state.getConfig().isDoAlgeaeOnly()){
1181
            boolean include = false;
1182
            String higherTaxon = getValue(state.getOriginalRecord(), HIGHERTAXON, true);
1183
            String fullNameStr = getValue(state.getOriginalRecord(), FULLNAME, true);
1184
            include |= higherTaxon.matches(".*?PHYCEAE(?:$|\\s+)");
1185
            for(String test : new String[]{
1186
                    "Bolidophyceae ",
1187
                    "Phaeothamniophyceae ",
1188
                    "Bolidomonadales ",
1189
                    "Bolidomonadaceae ",
1190
                    "Aureoumbra ",
1191
                    "Bolidomonas ",
1192
                    "Seagriefia ",
1193
                    "Navicula "
1194
                }) {
1195
                include |= fullNameStr.startsWith(test);
1196
            }
1197
            return !include;
1198
        }
1199

    
1200
        return false;
1201
    }
1202

    
1203
    private ExtensionType getExtensionTypeIAPTRegData() {
1204
        if(extensionTypeIAPTRegData == null){
1205
            extensionTypeIAPTRegData = ExtensionType.NewInstance("IAPTRegData.json", "IAPTRegData.json", "");
1206
            getTermService().save(extensionTypeIAPTRegData);
1207
        }
1208
        return extensionTypeIAPTRegData;
1209
    }
1210

    
1211
    private IAPTRegData makeIAPTRegData(SimpleExcelTaxonImportState<CONFIG> state) {
1212

    
1213
        Map<String, String> record = state.getOriginalRecord();
1214
        String registrationStr = getValue(record, REGISTRATION);
1215
        String regDateStr = getValue(record, REGDATE);
1216
        String regStr = getValue(record, REGISTRATION, true);
1217

    
1218
        String dateStr = null;
1219
        String office = null;
1220
        Integer regID = null;
1221
        Integer formNo = null;
1222

    
1223
        Matcher m = registrationPattern.matcher(registrationStr);
1224
        if(m.matches()){
1225
            dateStr = m.group("regdate");
1226
            if(parseDate( regStr, dateStr) == null){
1227
                // check for valid dates
1228
                logger.warn(csvReportLine(regStr, REGISTRATION + ": could not parse date", dateStr, " in ", registrationStr));
1229
            }
1230
            office = m.group("office");
1231
            regID = Integer.valueOf(m.group("regid"));
1232
            try {
1233
                formNo = Integer.valueOf(m.group("formNo"));
1234
            } catch(IllegalArgumentException e){
1235
                // ignore
1236
            }
1237
        } else {
1238
            logger.warn(csvReportLine(regStr, REGISTRATION + ": could not be parsed", registrationStr));
1239
        }
1240
        IAPTRegData regData = new IAPTRegData(dateStr, office, regID, formNo);
1241
        return regData;
1242
    }
1243

    
1244
    private TaxonNode getHigherTaxon(String higherTaxaString, IAPTImportState state) {
1245
        String[] higherTaxaNames = higherTaxaString.toLowerCase().replaceAll("[\\[\\]]", "").split(":");
1246
        TaxonNode higherTaxonNode = null;
1247

    
1248
        ITaxonTreeNode rootNode = getClassificationRootNode(state);
1249
        for (String htn :  higherTaxaNames) {
1250
            htn = StringUtils.capitalize(htn.trim());
1251
            Taxon higherTaxon = state.getHigherTaxon(htn);
1252
            if (higherTaxon != null){
1253
                higherTaxonNode = higherTaxon.getTaxonNodes().iterator().next();
1254
            }else{
1255
                IBotanicalName name = makeHigherTaxonName(state, htn);
1256
                Reference sec = state.getSecReference();
1257
                higherTaxon = Taxon.NewInstance(name, sec);
1258
                getTaxonService().save(higherTaxon);
1259
                higherTaxonNode = rootNode.addChildTaxon(higherTaxon, sec, null);
1260
                state.putHigherTaxon(htn, higherTaxon);
1261
                getClassificationService().saveTreeNode(higherTaxonNode);
1262
            }
1263
            rootNode = higherTaxonNode;
1264
        }
1265
        return higherTaxonNode;
1266
    }
1267

    
1268
    private IBotanicalName makeHigherTaxonName(IAPTImportState state, String name) {
1269

    
1270
        Rank rank = guessRank(name);
1271

    
1272
        IBotanicalName taxonName = TaxonNameFactory.NewBotanicalInstance(rank);
1273
        taxonName.addSource(makeOriginalSource(state));
1274
        taxonName.setGenusOrUninomial(StringUtils.capitalize(name));
1275
        return taxonName;
1276
    }
1277

    
1278
    private Rank guessRank(String name) {
1279

    
1280
        // normalize
1281
        name = name.replaceAll("\\(.*\\)", "").trim();
1282

    
1283
        if(name.matches("^Plantae$|^Fungi$")){
1284
           return Rank.KINGDOM();
1285
        } else if(name.matches("^Incertae sedis$|^No group assigned$")){
1286
           return rankFamilyIncertisSedis();
1287
        } else if(name.matches(".*phyta$|.*mycota$")){
1288
           return Rank.PHYLUM();
1289
        } else if(name.matches(".*phytina$|.*mycotina$")){
1290
           return Rank.SUBPHYLUM();
1291
        } else if(name.matches("Gymnospermae$|.*ones$")){ // Monocotyledones, Dicotyledones
1292
            return rankUnrankedSupraGeneric();
1293
        } else if(name.matches(".*opsida$|.*phyceae$|.*mycetes$|.*ones$|^Musci$|^Hepaticae$")){
1294
           return Rank.CLASS();
1295
        } else if(name.matches(".*idae$|.*phycidae$|.*mycetidae$")){
1296
           return Rank.SUBCLASS();
1297
        } else if(name.matches(".*ales$")){
1298
           return Rank.ORDER();
1299
        } else if(name.matches(".*ineae$")){
1300
           return Rank.SUBORDER();
1301
        } else if(name.matches(".*aceae$")){
1302
            return Rank.FAMILY();
1303
        } else if(name.matches(".*oideae$")){
1304
           return Rank.SUBFAMILY();
1305
        } else
1306
        //    if(name.matches(".*eae$")){
1307
        //    return Rank.TRIBE();
1308
        // } else
1309
            if(name.matches(".*inae$")){
1310
           return Rank.SUBTRIBE();
1311
        } else if(name.matches(".*ae$")){
1312
           return Rank.FAMILY();
1313
        }
1314
        return Rank.UNKNOWN_RANK();
1315
    }
1316

    
1317
    private Rank rankUnrankedSupraGeneric() {
1318

    
1319
        if(rankUnrankedSupraGeneric == null){
1320
            rankUnrankedSupraGeneric = Rank.NewInstance(RankClass.Suprageneric, "Unranked supra generic", " ", " ");
1321
            getTermService().save(rankUnrankedSupraGeneric);
1322
        }
1323
        return rankUnrankedSupraGeneric;
1324
    }
1325

    
1326
    private Rank rankFamilyIncertisSedis() {
1327

    
1328
        if(familyIncertisSedis == null){
1329
            familyIncertisSedis = Rank.NewInstance(RankClass.Suprageneric, "Family incertis sedis", " ", " ");
1330
            getTermService().save(familyIncertisSedis);
1331
        }
1332
        return familyIncertisSedis;
1333
    }
1334

    
1335
    private AnnotationType annotationTypeCaveats(){
1336
        if(annotationTypeCaveats == null){
1337
            annotationTypeCaveats = AnnotationType.NewInstance("Caveats", "Caveats", "");
1338
            getTermService().save(annotationTypeCaveats);
1339
        }
1340
        return annotationTypeCaveats;
1341
    }
1342

    
1343

    
1344
    /**
1345
     * @param state
1346
     * @return
1347
     */
1348
    private IdentifiableSource makeOriginalSource(IAPTImportState state) {
1349
        return IdentifiableSource.NewDataImportInstance("line: " + state.getCurrentLine(), null, state.getConfig().getSourceReference());
1350
    }
1351

    
1352

    
1353
    private Reference makeReference(IAPTImportState state, UUID uuidRef) {
1354
        Reference ref = state.getReference(uuidRef);
1355
        if (ref == null){
1356
            ref = getReferenceService().find(uuidRef);
1357
            state.putReference(uuidRef, ref);
1358
        }
1359
        return ref;
1360
    }
1361

    
1362
    private MarkerType markerTypeFossil(){
1363
        if(this.markerTypeFossil == null){
1364
            markerTypeFossil = MarkerType.NewInstance("isFossilTaxon", "isFossil", null);
1365
            getTermService().save(this.markerTypeFossil);
1366
        }
1367
        return markerTypeFossil;
1368
    }
1369

    
1370
    private MarkerType markerDuplicateRegistration(){
1371
        if(this.duplicateRegistration == null){
1372
            duplicateRegistration = MarkerType.NewInstance("duplicateRegistration", "duplicateRegistration", null);
1373
            getTermService().save(this.duplicateRegistration);
1374
        }
1375
        return markerTypeFossil;
1376
    }
1377

    
1378
    private String csvReportLine(String regId, String message, String ... fields){
1379
        StringBuilder out = new StringBuilder("regID#");
1380
        out.append(regId).append(",\"").append(message).append('"');
1381

    
1382
        for(String f : fields){
1383
            out.append(",\"").append(f).append('"');
1384
        }
1385
        return out.toString();
1386
    }
1387

    
1388

    
1389
}
(1-1/8)