adapt app-import to latest changes
[cdmlib-apps.git] / app-import / src / main / java / eu / etaxonomy / cdm / io / phycobank / IAPTExcelImport.java
1 /**
2 * Copyright (C) 2007 EDIT
3 * European Distributed Institute of Taxonomy
4 * http://www.e-taxonomy.eu
5 *
6 * The contents of this file are subject to the Mozilla Public License Version 1.1
7 * See LICENSE.TXT at the top of this package for the full license terms.
8 */
9
10 package eu.etaxonomy.cdm.io.phycobank;
11
12 import java.util.ArrayList;
13 import java.util.Arrays;
14 import java.util.HashMap;
15 import java.util.HashSet;
16 import java.util.List;
17 import java.util.Map;
18 import java.util.Set;
19 import java.util.UUID;
20 import java.util.regex.Matcher;
21 import java.util.regex.Pattern;
22
23 import org.apache.commons.lang.ArrayUtils;
24 import org.apache.commons.lang.StringEscapeUtils;
25 import org.apache.commons.lang.StringUtils;
26 import org.apache.logging.log4j.Level;
27 import org.apache.logging.log4j.LogManager;
28 import org.apache.logging.log4j.Logger;
29 import org.joda.time.DateTimeFieldType;
30 import org.joda.time.Partial;
31 import org.joda.time.format.DateTimeFormat;
32 import org.joda.time.format.DateTimeFormatter;
33 import org.springframework.stereotype.Component;
34
35 import com.fasterxml.jackson.core.JsonProcessingException;
36 import com.fasterxml.jackson.databind.ObjectMapper;
37
38 import eu.etaxonomy.cdm.api.facade.DerivedUnitFacade;
39 import eu.etaxonomy.cdm.common.CdmUtils;
40 import eu.etaxonomy.cdm.common.LogUtils;
41 import eu.etaxonomy.cdm.io.mexico.SimpleExcelTaxonImport;
42 import eu.etaxonomy.cdm.io.mexico.SimpleExcelTaxonImportState;
43 import eu.etaxonomy.cdm.model.agent.Institution;
44 import eu.etaxonomy.cdm.model.agent.Person;
45 import eu.etaxonomy.cdm.model.agent.TeamOrPersonBase;
46 import eu.etaxonomy.cdm.model.common.Annotation;
47 import eu.etaxonomy.cdm.model.common.AnnotationType;
48 import eu.etaxonomy.cdm.model.common.Extension;
49 import eu.etaxonomy.cdm.model.common.ExtensionType;
50 import eu.etaxonomy.cdm.model.common.IdentifiableSource;
51 import eu.etaxonomy.cdm.model.common.Language;
52 import eu.etaxonomy.cdm.model.common.LanguageString;
53 import eu.etaxonomy.cdm.model.common.Marker;
54 import eu.etaxonomy.cdm.model.common.MarkerType;
55 import eu.etaxonomy.cdm.model.common.VerbatimTimePeriod;
56 import eu.etaxonomy.cdm.model.name.IBotanicalName;
57 import eu.etaxonomy.cdm.model.name.NameRelationshipType;
58 import eu.etaxonomy.cdm.model.name.NameTypeDesignation;
59 import eu.etaxonomy.cdm.model.name.NomenclaturalCode;
60 import eu.etaxonomy.cdm.model.name.NomenclaturalStatus;
61 import eu.etaxonomy.cdm.model.name.NomenclaturalStatusType;
62 import eu.etaxonomy.cdm.model.name.Rank;
63 import eu.etaxonomy.cdm.model.name.RankClass;
64 import eu.etaxonomy.cdm.model.name.SpecimenTypeDesignationStatus;
65 import eu.etaxonomy.cdm.model.name.TaxonName;
66 import eu.etaxonomy.cdm.model.name.TaxonNameFactory;
67 import eu.etaxonomy.cdm.model.occurrence.Collection;
68 import eu.etaxonomy.cdm.model.occurrence.DerivedUnit;
69 import eu.etaxonomy.cdm.model.occurrence.FieldUnit;
70 import eu.etaxonomy.cdm.model.occurrence.GatheringEvent;
71 import eu.etaxonomy.cdm.model.occurrence.SpecimenOrObservationType;
72 import eu.etaxonomy.cdm.model.reference.OriginalSourceType;
73 import eu.etaxonomy.cdm.model.reference.Reference;
74 import eu.etaxonomy.cdm.model.reference.ReferenceFactory;
75 import eu.etaxonomy.cdm.model.taxon.Classification;
76 import eu.etaxonomy.cdm.model.taxon.ITaxonTreeNode;
77 import eu.etaxonomy.cdm.model.taxon.Synonym;
78 import eu.etaxonomy.cdm.model.taxon.SynonymType;
79 import eu.etaxonomy.cdm.model.taxon.Taxon;
80 import eu.etaxonomy.cdm.model.taxon.TaxonNode;
81 import eu.etaxonomy.cdm.model.term.DefinedTermBase;
82 import eu.etaxonomy.cdm.strategy.parser.NonViralNameParserImpl;
83
84 /**
85 * @author a.mueller
86 * @since 05.01.2016
87 */
88
89 @Component("iAPTExcelImport")
90 public class IAPTExcelImport<CONFIG extends IAPTImportConfigurator> extends SimpleExcelTaxonImport<CONFIG> {
private static final long serialVersionUID = -747486709409732371L;
private static final Logger logger = LogManager.getLogger();

// NOTE(review): presumably a marker appended to names/authors in the source data; its use is not visible in this part of the file
public static final String ANNOTATION_MARKER_STRING = "[*]";

private static UUID ROOT_UUID = UUID.fromString("4137fd2a-20f6-4e70-80b9-f296daf51d82");

// shared name parser, used for parsing misspelled names and name types
private static NonViralNameParserImpl nameParser = NonViralNameParserImpl.NewInstance();

// keys under which the values of one import record are looked up (see makeTaxon)
private final static String REGISTRATIONNO_PK= "RegistrationNo_Pk";
private final static String HIGHERTAXON= "HigherTaxon";
private final static String FULLNAME= "FullName";
private final static String AUTHORSSPELLING= "AuthorsSpelling";
private final static String LITSTRING= "LitString";
private final static String REGISTRATION= "Registration";
private final static String TYPE= "Type";
private final static String CAVEATS= "Caveats";
private final static String FULLBASIONYM= "FullBasionym";
private final static String FULLSYNSUBST= "FullSynSubst";
private final static String NOTESTXT= "NotesTxt";
private final static String REGDATE= "RegDate";
private final static String NAMESTRING= "NameString";
private final static String BASIONYMSTRING= "BasionymString";
private final static String SYNSUBSTSTR= "SynSubstStr";
private final static String AUTHORSTRING= "AuthorString";

// all record keys declared above, collected in one list
private static List<String> expectedKeys= Arrays.asList(new String[]{
REGISTRATIONNO_PK, HIGHERTAXON, FULLNAME, AUTHORSSPELLING, LITSTRING, REGISTRATION, TYPE, CAVEATS, FULLBASIONYM, FULLSYNSUBST, NOTESTXT, REGDATE, NAMESTRING, BASIONYMSTRING, SYNSUBSTSTR, AUTHORSTRING});
119
120 private static final Pattern nomRefTokenizeP = Pattern.compile("^(?<title>.*):\\s(?<detail>[^\\.:]+)\\.(?<date>.*?)(?:\\s\\((?<issue>[^\\)]*)\\)\\s*)?\\.?$");
121 private static final Pattern[] datePatterns = new Pattern[]{
122 // NOTE:
123 // The order of the patterns is extremely important!!!
124 //
125 // all patterns cover the years 1700 - 1999
126 Pattern.compile("^(?<year>1[7,8,9][0-9]{2})$"), // only year, like '1969'
127 Pattern.compile("^(?<monthName>\\p{L}+\\.?)\\s(?<day>[0-9]{1,2})(?:st|rd|th)?\\.?,?\\s(?<year>(?:1[7,8,9])?[0-9]{2})$"), // full date like April 12, 1969 or april 12th 1999
128 Pattern.compile("^(?<monthName>\\p{L}+\\.?),?\\s?(?<year>(?:1[7,8,9])?[0-9]{2})$"), // April 99 or April, 1999 or Apr. 12
129 Pattern.compile("^(?<day>[0-9]{1,2})([\\.\\-/])(\\s?)(?<month>[0-1]?[0-9])\\2\\3(?<year>(?:1[7,8,9])?[0-9]{2})$"), // full date like 12.04.1969 or 12. 04. 1969 or 12/04/1969 or 12-04-1969
130 Pattern.compile("^(?<day>[0-9]{1,2})([\\.\\-/])(?<monthName>[IVX]{1,2})\\2(?<year>(?:1[7,8,9])?[0-9]{2})$"), // full date like 12-VI-1969
131 Pattern.compile("^(?:(?<day>[0-9]{1,2})(?:\\sde)?\\s)?(?<monthName>\\p{L}+)(?:\\sde)?\\s(?<year>(?:1[7,8,9])?[0-9]{2})$"), // full and partial date like 12 de Enero de 1999 or Enero de 1999
132 Pattern.compile("^(?<month>[0-1]?[0-9])([\\.\\-/])(?<year>(?:1[7,8,9])?[0-9]{2})$"), // partial date like 04.1969 or 04/1969 or 04-1969
133 Pattern.compile("^(?<year>(?:1[7,8,9])?[0-9]{2})([\\.\\-/])(?<month>[0-1]?[0-9])$"),// partial date like 1999-04
134 Pattern.compile("^(?<monthName>[IVX]{1,2})([\\.\\-/])(?<year>(?:1[7,8,9])?[0-9]{2})$"), // partial date like VI-1969
135 Pattern.compile("^(?<day>[0-9]{1,2})(?:[\\./]|th|rd|st)?\\s(?<monthName>\\p{L}+\\.?),?\\s?(?<year>(?:1[7,8,9])?[0-9]{2})$"), // full date like 12. April 1969 or april 1999 or 22 Dec.1999
136 };
137 protected static final Pattern typeSpecimenSplitPattern = Pattern.compile("^(?:\"*[Tt]ype: (?<fieldUnit>.*?))(?:[Hh]olotype:(?<holotype>.*?)\\.?)?(?:[Ii]sotype[^:]*:(?<isotype>.*)\\.?)?\\.?$");
138
139 private static final Pattern typeNameBasionymPattern = Pattern.compile("\\([Bb]asionym\\s?\\:\\s?(?<basionymName>[^\\)]*).*$");
140 private static final Pattern typeNameNotePattern = Pattern.compile("\\[([^\\[]*)"); // matches the inner of '[...]'
141 private static final Pattern typeNameSpecialSplitPattern = Pattern.compile("(?<note>.*\\;.*?)\\:(?<agent>)\\;(<name>.*)");
142
143 protected static final Pattern collectorPattern = Pattern.compile(".*?(?<fullStr1>\\([Ll]eg\\.\\s+(?<data1>[^\\)]*)\\)).*$|.*?(?<fullStr2>\\s[Ll]eg\\.\\:?\\s+(?<data2>.*?)\\.?)$|^(?<fullStr3>[Ll]eg\\.\\:?\\s+(?<data3>.*?)\\.?)");
144 private static final Pattern collectionDataPattern = Pattern.compile("^(?<collector>[^,]*),\\s?(?<detail>.*?)\\.?$");
145 private static final Pattern collectorsNumber = Pattern.compile("^([nN]o\\.\\s.*)$");
146
147 // AccessionNumbers: , #.*, n°:?, 96/3293, No..*, -?\w{1,3}-[0-9\-/]*
148 private static final Pattern accessionNumberOnlyPattern = Pattern.compile("^(?<accNumber>(?:n°\\:?\\s?|#|No\\.?\\s?)?[\\d\\w\\-/]*)$");
149
// Patterns tried one after the other by parseSpecimenType() on a single specimen type string.
// The named groups colCode, institute, subCollection and accNumber carry the extracted parts;
// not every pattern declares every group.
private static final Pattern[] specimenTypePatterns = new Pattern[]{
Pattern.compile("^(?<colCode>[A-Z]+|CPC Micropaleontology Lab\\.?)\\s+(?:\\((?<institute>.*[^\\)])\\))(?<accNumber>.*)?$"), // like: GAUF (Gansu Agricultural University) No. 1207-1222
Pattern.compile("^(?<colCode>[A-Z]+|CPC Micropaleontology Lab\\.?)\\s+(?:Coll\\.\\s(?<subCollection>[^\\.,;]*)(.))(?<accNumber>.*)?$"), // like KASSEL Coll. Krasske, Praep. DII 78
Pattern.compile("^(?:in\\s)?(?<institute>[Cc]oll\\.\\s.*?)(?:\\s+(?<accNumber>(Praep\\.|slide|No\\.|Inv\\. Nr\\.|Nr\\.).*))?$"), // like Coll. Lange-Bertalot, Bot. Inst., Univ. Frankfurt/Main, Germany Praep. Neukaledonien OTL 62
Pattern.compile("^(?<institute>Inst\\.\\s.*?)\\s+(?<accNumber>N\\s.*)?$"), // like Inst. Geological Sciences, Acad. Sci. Belarus, Minsk N 212 A
Pattern.compile("^(?<colCode>[A-Z]+)(?:\\s+(?<accNumber>.*))?$"), // identifies the Collection code and takes the rest as accessionNumber if any
};
157
158
159 private static final Pattern registrationPattern = Pattern.compile("^Registration date\\:\\s(?<regdate>\\d\\d\\.\\d\\d\\.\\d\\d); no\\.\\:\\s(?<regid>\\d+);\\soffice\\:\\s(?<office>.*?)\\.(?:\\s\\[Form no\\.\\:\\s(?<formNo>d+)\\])?$"); // Registration date: 29.06.98; no.: 2922; office: Berlin.
160
161 private static Map<String, Integer> monthFromNameMap = new HashMap<>();
162
163 static {
164 String[] ck = new String[]{"leden", "únor", "březen", "duben", "květen", "červen", "červenec ", "srpen", "září", "říjen", "listopad", "prosinec"};
165 String[] fr = new String[]{"janvier", "février", "mars", "avril", "mai", "juin", "juillet", "août", "septembre", "octobre", "novembre", "décembre"};
166 String[] de = new String[]{"januar", "februar", "märz", "april", "mai", "juni", "juli", "august", "september", "oktober", "november", "dezember"};
167 String[] en = new String[]{"january", "february", "march", "april", "may", "june", "july", "august", "september", "october", "november", "december"};
168 String[] it = new String[]{"gennaio", "febbraio", "marzo", "aprile", "maggio", "giugno", "luglio", "agosto", "settembre", "ottobre", "novembre", "dicembre"};
169 String[] sp = new String[]{"enero", "febrero", "marzo", "abril", "mayo", "junio", "julio", "agosto", "septiembre", "octubre", "noviembre", "diciembre"};
170 String[] de_abbrev = new String[]{"jan.", "feb.", "märz", "apr.", "mai", "jun.", "jul.", "aug.", "sept.", "okt.", "nov.", "dez."};
171 String[] en_abbrev = new String[]{"jan.", "feb.", "mar.", "apr.", "may", "jun.", "jul.", "aug.", "sep.", "oct.", "nov.", "dec."};
172 String[] port = new String[]{"Janeiro", "Fevereiro", "Março", "Abril", "Maio", "Junho", "Julho", "Agosto", "Setembro", "Outubro", "Novembro", "Dezembro"};
173 String[] rom_num = new String[]{"i", "ii", "iii", "iv", "v", "vi", "vii", "viii", "ix", "x", "xi", "xii"};
174
175 String[][] perLang = new String[][]{ck, de, fr, en, it, sp, port, de_abbrev, en_abbrev, rom_num};
176
177 for (String[] months: perLang) {
178 for(int m = 1; m < 13; m++){
179 monthFromNameMap.put(months[m - 1].toLowerCase(), m);
180 }
181 }
182
183 // special cases
184 monthFromNameMap.put("mar", 3);
185 monthFromNameMap.put("dec", 12);
186 monthFromNameMap.put("februari", 2);
187 monthFromNameMap.put("març", 3);
188 }
189
190
// formats a date as four digit year; used in makeTaxon() when rebuilding the reference title
DateTimeFormatter formatterYear = DateTimeFormat.forPattern("yyyy");

// NOTE(review): presumably caches collections by code or title - the usage is not visible in this part of the file
private Map<String, Collection> collectionMap = new HashMap<>();

private ExtensionType extensionTypeIAPTRegData = null;

// full name strings already imported; makeTaxon() uses it to detect duplicate registrations
private Set<String> nameSet = new HashSet<>();
private DefinedTermBase duplicateRegistration = null;
199
200 enum TypesName {
201 fieldUnit, holotype, isotype;
202
203 public SpecimenTypeDesignationStatus status(){
204 switch (this) {
205 case holotype:
206 return SpecimenTypeDesignationStatus.HOLOTYPE();
207 case isotype:
208 return SpecimenTypeDesignationStatus.ISOTYPE();
209 default:
210 return null;
211 }
212 }
213 }
214
// NOTE(review): these term fields start as null; presumably they are filled lazily by accessor
// methods such as markerTypeFossil() and annotationTypeCaveats() - confirm in the rest of the file
private MarkerType markerTypeFossil = null;
private Rank rankUnrankedSupraGeneric = null;
private Rank familyIncertisSedis = null;
private AnnotationType annotationTypeCaveats = null;

// book reference for the hard to parse "Las variedades tradicionales ..." citation, created on first use in makeTaxon()
private Reference bookVariedadesTradicionales = null;

/**
 * HACK for unit simple testing
 * <p>
 * True if the system property TEST_MODE is set to any value.
 */
boolean _testMode = System.getProperty("TEST_MODE") != null;
226
227 private Taxon makeTaxon(Map<String, String> record, SimpleExcelTaxonImportState<CONFIG> state,
228 TaxonNode higherTaxonNode, boolean isFossil) {
229
230 String regNumber = getValue(record, REGISTRATIONNO_PK, false);
231 String regStr = getValue(record, REGISTRATION, true);
232 String titleCacheStr = getValue(record, FULLNAME, true);
233 String nameStr = getValue(record, NAMESTRING, true);
234 String authorStr = getValue(record, AUTHORSTRING, true);
235 String nomRefStr = getValue(record, LITSTRING, true);
236 String authorsSpelling = getValue(record, AUTHORSSPELLING, true);
237 String notesTxt = getValue(record, NOTESTXT, true);
238 String caveats = getValue(record, CAVEATS, true);
239 String fullSynSubstStr = getValue(record, FULLSYNSUBST, true);
240 String fullBasionymStr = getValue(record, FULLBASIONYM, true);
241 String basionymNameStr = getValue(record, FULLBASIONYM, true);
242 String synSubstStr = getValue(record, SYNSUBSTSTR, true);
243 String typeStr = getValue(record, TYPE, true);
244
245
246 String nomRefTitle = null;
247 String nomRefDetail;
248 String nomRefPupDate = null;
249 String nomRefIssue = null;
250 Partial pupDate = null;
251
252 boolean restoreOriginalReference = false;
253 boolean nameIsValid = true;
254
255 // preprocess nomRef: separate citation, reference detail, publishing date
256 if(!StringUtils.isEmpty(nomRefStr)){
257 nomRefStr = nomRefStr.trim();
258
259 // handle the special case which is hard to parse:
260 //
261 // Las variedades tradicionales de frutales de la Cuenca del Río Segura. Catálogo Etnobotánico (1): Frutos secos, oleaginosos, frutales de hueso, almendros y frutales de pepita: 154. 1997.
262 if(nomRefStr.startsWith("Las variedades tradicionales de frutales ")){
263
264 if(bookVariedadesTradicionales == null){
265 bookVariedadesTradicionales = ReferenceFactory.newBook();
266 bookVariedadesTradicionales.setTitle("Las variedades tradicionales de frutales de la Cuenca del Río Segura. Catálogo Etnobotánico (1): Frutos secos, oleaginosos, frutales de hueso, almendros y frutales de pepita");
267 bookVariedadesTradicionales.setDatePublished(VerbatimTimePeriod.NewVerbatimInstance(1997));
268 getReferenceService().save(bookVariedadesTradicionales);
269 }
270 nomRefStr = nomRefStr.replaceAll("^.*?\\:.*?\\:", "Las variedades tradicionales:");
271 restoreOriginalReference = true;
272 }
273
274 Matcher m = nomRefTokenizeP.matcher(nomRefStr);
275 if(m.matches()){
276 nomRefTitle = m.group("title");
277 nomRefDetail = m.group("detail");
278 nomRefPupDate = m.group("date").trim();
279 nomRefIssue = m.group("issue");
280
281 pupDate = parseDate(regNumber, nomRefPupDate);
282 if (pupDate != null) {
283 nomRefTitle = nomRefTitle + ": " + nomRefDetail + ". " + pupDate.toString(formatterYear) + ".";
284 } else {
285 logger.warn(csvReportLine(regNumber, "Pub date", nomRefPupDate, "in", nomRefStr, "not parsable"));
286 }
287 } else {
288 nomRefTitle = nomRefStr;
289 }
290 }
291
292 TaxonName taxonName = makeBotanicalName(state, regNumber, titleCacheStr, nameStr, authorStr, nomRefTitle);
293
294 // always add the original strings of parsed data as annotation
295 taxonName.addAnnotation(Annotation.NewInstance("imported and parsed data strings:" +
296 "\n - '" + LITSTRING + "': "+ nomRefStr +
297 "\n - '" + TYPE + "': " + typeStr +
298 "\n - '" + REGISTRATION + "': " + regStr
299 , AnnotationType.TECHNICAL(), Language.DEFAULT()));
300
301 if(restoreOriginalReference){
302 taxonName.setNomenclaturalReference(bookVariedadesTradicionales);
303 }
304
305 if(taxonName.getNomenclaturalReference() != null){
306 if(pupDate != null) {
307 taxonName.getNomenclaturalReference().setDatePublished(VerbatimTimePeriod.NewVerbatimInstance(pupDate));
308 }
309 if(nomRefIssue != null) {
310 taxonName.getNomenclaturalReference().setVolume(nomRefIssue);
311 }
312 }
313
314
315 if(!StringUtils.isEmpty(notesTxt)){
316 notesTxt = notesTxt.replace("Notes: ", "").trim();
317 taxonName.addAnnotation(Annotation.NewInstance(notesTxt, AnnotationType.EDITORIAL(), Language.DEFAULT()));
318 nameIsValid = false;
319
320 }
321 if(!StringUtils.isEmpty(caveats)){
322 caveats = caveats.replace("Caveats: ", "").trim();
323 taxonName.addAnnotation(Annotation.NewInstance(caveats, annotationTypeCaveats(), Language.DEFAULT()));
324 nameIsValid = false;
325 }
326
327 if(nameIsValid){
328 // Status is always considered valid if no notes and cavets are set
329 taxonName.addStatus(NomenclaturalStatus.NewInstance(NomenclaturalStatusType.VALID()));
330 }
331
332 getNameService().save(taxonName);
333
334 // Namerelations
335 if(!StringUtils.isEmpty(authorsSpelling)){
336 authorsSpelling = authorsSpelling.replaceFirst("Author's spelling:", "").replaceAll("\"", "").trim();
337
338 String[] authorSpellingTokens = StringUtils.split(authorsSpelling, " ");
339 String[] nameStrTokens = StringUtils.split(nameStr, " ");
340
341 ArrayUtils.reverse(authorSpellingTokens);
342 ArrayUtils.reverse(nameStrTokens);
343
344 for (int i = 0; i < nameStrTokens.length; i++){
345 if(i < authorSpellingTokens.length){
346 nameStrTokens[i] = authorSpellingTokens[i];
347 }
348 }
349 ArrayUtils.reverse(nameStrTokens);
350
351 String misspelledNameStr = StringUtils.join (nameStrTokens, ' ');
352 // build the fullnameString of the misspelled name
353 misspelledNameStr = taxonName.getTitleCache().replace(nameStr, misspelledNameStr);
354
355 TaxonName misspelledName = nameParser.parseReferencedName(misspelledNameStr, NomenclaturalCode.ICNAFP, null);
356 misspelledName.addRelationshipToName(taxonName, NameRelationshipType.MISSPELLING(), null, null);
357 getNameService().save(misspelledName);
358 }
359
360 // Replaced Synonyms
361 if(!StringUtils.isEmpty(fullSynSubstStr)){
362 fullSynSubstStr = fullSynSubstStr.replace("Syn. subst.: ", "");
363 TaxonName replacedSynonymName = makeBotanicalName(state, regNumber, fullSynSubstStr, synSubstStr, null, null);
364 replacedSynonymName.addReplacedSynonym(taxonName, null, null, null, null);
365 getNameService().save(replacedSynonymName);
366 }
367
368 Reference sec = state.getConfig().getSecReference();
369 Taxon taxon = Taxon.NewInstance(taxonName, sec);
370
371 // Basionym
372 if(fullBasionymStr != null){
373 fullBasionymStr = fullBasionymStr.replaceAll("^\\w*:\\s", ""); // Strip off the leading 'Basionym: "
374 basionymNameStr = basionymNameStr.replaceAll("^\\w*:\\s", ""); // Strip off the leading 'Basionym: "
375 TaxonName basionym = makeBotanicalName(state, regNumber, fullBasionymStr, basionymNameStr, null, null);
376 getNameService().save(basionym);
377 taxonName.addBasionym(basionym);
378
379 Synonym syn = Synonym.NewInstance(basionym, sec);
380 taxon.addSynonym(syn, SynonymType.HOMOTYPIC_SYNONYM_OF());
381 getTaxonService().save(syn);
382 }
383
384 // Markers
385 if(isFossil){
386 taxon.addMarker(Marker.NewInstance(markerTypeFossil(), true));
387 }
388 if(!nameSet.add(titleCacheStr)){
389 taxonName.addMarker(Marker.NewInstance(markerDuplicateRegistration(), true));
390 logger.warn(csvReportLine(regNumber, "Duplicate registration of", titleCacheStr));
391 }
392
393
394 // Types
395 if(!StringUtils.isEmpty(typeStr)){
396
397 if(taxonName.getRank().isSpecies() || taxonName.getRank().isLower(Rank.SPECIES())) {
398 makeSpecimenTypeData(typeStr, taxonName, regNumber, state, false);
399 } else {
400 makeNameTypeData(typeStr, taxonName, regNumber, state);
401 }
402 }
403
404 getTaxonService().save(taxon);
405
406 if(taxonName.getRank().equals(Rank.SPECIES()) || taxonName.getRank().isLower(Rank.SPECIES())){
407 // try to find the genus, it should have been imported already, Genera are coming first in the import file
408 Taxon genus = ((IAPTImportState)state).getGenusTaxonMap().get(taxonName.getGenusOrUninomial());
409 if(genus != null){
410 higherTaxonNode = genus.getTaxonNodes().iterator().next();
411 } else {
412 logger.info(csvReportLine(regNumber, "Parent genus not found for", nameStr));
413 }
414 }
415
416 if(higherTaxonNode != null){
417 higherTaxonNode.addChildTaxon(taxon, null, null);
418 getTaxonNodeService().save(higherTaxonNode);
419 }
420
421 if(taxonName.getRank().isGenus()){
422 ((IAPTImportState)state).getGenusTaxonMap().put(taxonName.getGenusOrUninomial(), taxon);
423 }
424
425 return taxon;
426 }
427
428 private void makeSpecimenTypeData(String typeStr, TaxonName taxonName, String regNumber, SimpleExcelTaxonImportState<CONFIG> state, boolean isFossil) {
429
430 Matcher m = typeSpecimenSplitPattern.matcher(typeStr);
431
432 if(m.matches()){
433 String fieldUnitStr = m.group(TypesName.fieldUnit.name());
434 // boolean isFieldUnit = typeStr.matches(".*([°']|\\d+\\s?m\\s|\\d+\\s?km\\s).*"); // check for location or unit m, km // makes no sense!!!!
435 FieldUnit fieldUnit = parseFieldUnit(fieldUnitStr, regNumber, state);
436 if(fieldUnit == null) {
437 // create a field unit with only a titleCache using the fieldUnitStr substring
438 logger.warn(csvReportLine(regNumber, "Type: fieldUnitStr can not be parsed", fieldUnitStr));
439 fieldUnit = FieldUnit.NewInstance();
440 fieldUnit.setTitleCache(fieldUnitStr, true);
441 getOccurrenceService().save(fieldUnit);
442 }
443 getOccurrenceService().save(fieldUnit);
444
445 SpecimenOrObservationType specimenType;
446 if(isFossil){
447 specimenType = SpecimenOrObservationType.Fossil;
448 } else {
449 specimenType = SpecimenOrObservationType.PreservedSpecimen;
450 }
451
452 // all others ..
453 addSpecimenTypes(taxonName, fieldUnit, m.group(TypesName.holotype.name()), TypesName.holotype, false, regNumber, specimenType);
454 addSpecimenTypes(taxonName, fieldUnit, m.group(TypesName.isotype.name()), TypesName.isotype, true, regNumber, specimenType);
455
456 } else {
457 // create a field unit with only a titleCache using the full typeStr
458 FieldUnit fieldUnit = FieldUnit.NewInstance();
459 fieldUnit.setTitleCache(typeStr, true);
460 getOccurrenceService().save(fieldUnit);
461 logger.warn(csvReportLine(regNumber, "Type: field 'Type' can not be parsed", typeStr));
462 }
463 getNameService().save(taxonName);
464 }
465
466 private void makeNameTypeData(String typeStr, IBotanicalName taxonName, String regNumber, SimpleExcelTaxonImportState<CONFIG> state) {
467
468 String nameStr = typeStr.replaceAll("^Type\\s?\\:\\s?", "");
469 if(nameStr.isEmpty()) {
470 return;
471 }
472
473 String basionymNameStr = null;
474 String noteStr = null;
475 String agentStr = null;
476
477 Matcher m;
478
479 if(typeStr.startsWith("not to be indicated")){
480 // Special case:
481 // Type: not to be indicated (Art. H.9.1. Tokyo Code); stated parent genera: Hechtia Klotzsch; Deuterocohnia Mez
482 // FIXME
483 m = typeNameSpecialSplitPattern.matcher(nameStr);
484 if(m.matches()){
485 nameStr = m.group("name");
486 noteStr = m.group("note");
487 agentStr = m.group("agent");
488 // TODO better import of agent?
489 if(agentStr != null){
490 noteStr = noteStr + ": " + agentStr;
491 }
492 }
493 } else {
494 // Generic case
495 m = typeNameBasionymPattern.matcher(nameStr);
496 if (m.find()) {
497 basionymNameStr = m.group("basionymName");
498 if (basionymNameStr != null) {
499 nameStr = nameStr.replace(m.group(0), "");
500 }
501 }
502
503 m = typeNameNotePattern.matcher(nameStr);
504 if (m.find()) {
505 noteStr = m.group(1);
506 if (noteStr != null) {
507 nameStr = nameStr.replace(m.group(0), "");
508 }
509 }
510 }
511
512 TaxonName typeName = (TaxonName) nameParser.parseFullName(nameStr, NomenclaturalCode.ICNAFP, null);
513
514 if(typeName.isProtectedTitleCache() || typeName.getNomenclaturalReference() != null && typeName.getNomenclaturalReference().isProtectedTitleCache()) {
515 logger.warn(csvReportLine(regNumber, "NameType not parsable", typeStr, nameStr));
516 }
517
518 if(basionymNameStr != null){
519 TaxonName basionymName = (TaxonName) nameParser.parseFullName(nameStr, NomenclaturalCode.ICNAFP, null);
520 getNameService().save(basionymName);
521 typeName.addBasionym(basionymName);
522 }
523
524
525 NameTypeDesignation nameTypeDesignation = NameTypeDesignation.NewInstance();
526 nameTypeDesignation.setTypeName(typeName);
527 getNameService().save(typeName);
528
529 if(noteStr != null){
530 nameTypeDesignation.addAnnotation(Annotation.NewInstance(noteStr, AnnotationType.EDITORIAL(), Language.UNKNOWN_LANGUAGE()));
531 }
532 taxonName.addNameTypeDesignation(typeName, null, null, null, null, false);
533
534 }
535
536 /**
537 * Currently only parses the collector, fieldNumber and the collection date.
538 *
539 * @param fieldUnitStr
540 * @param regNumber
541 * @param state
542 * @return null if the fieldUnitStr could not be parsed
543 */
544 private FieldUnit parseFieldUnit(String fieldUnitStr, String regNumber, SimpleExcelTaxonImportState<CONFIG> state) {
545
546 FieldUnit fieldUnit = null;
547
548 Matcher m1 = collectorPattern.matcher(fieldUnitStr);
549 if(m1.matches()){
550
551 String collectorData = m1.group(2); // like ... (leg. Metzeltin, 30. 9. 1996)
552 String removal = m1.group(1);
553 if(collectorData == null){
554 collectorData = m1.group(4); // like ... leg. Metzeltin, 30. 9. 1996
555 removal = m1.group(3);
556 }
557 if(collectorData == null){
558 collectorData = m1.group(6); // like ^leg. J. J. Halda 18.3.1997$
559 removal = null;
560 }
561 if(collectorData == null){
562 return null;
563 }
564
565 // the fieldUnitStr is parsable
566 // remove all collectorData from the fieldUnitStr and use the rest as locality
567 String locality = null;
568 if(removal != null){
569 locality = fieldUnitStr.replace(removal, "");
570 }
571
572 String collectorStr = null;
573 String detailStr = null;
574 Partial date = null;
575 String fieldNumber = null;
576
577 Matcher m2 = collectionDataPattern.matcher(collectorData);
578 if(m2.matches()){
579 collectorStr = m2.group("collector");
580 detailStr = m2.group("detail");
581
582 // Try to make sense of the detailStr
583 if(detailStr != null){
584 detailStr = detailStr.trim();
585 // 1. try to parse as date
586 date = parseDate(regNumber, detailStr);
587 if(date == null){
588 // 2. try to parse as number
589 if(collectorsNumber.matcher(detailStr).matches()){
590 fieldNumber = detailStr;
591 }
592 }
593 }
594 if(date == null && fieldNumber == null){
595 // detailed parsing not possible, so need fo fallback
596 collectorStr = collectorData;
597 }
598 }
599
600 if(collectorStr == null) {
601 collectorStr = collectorData;
602 }
603
604 fieldUnit = FieldUnit.NewInstance();
605 GatheringEvent ge = GatheringEvent.NewInstance();
606 if(locality != null){
607 ge.setLocality(LanguageString.NewInstance(locality, Language.UNKNOWN_LANGUAGE()));
608 }
609
610 TeamOrPersonBase agent = state.getAgentBase(collectorStr);
611 if(agent == null) {
612 agent = Person.NewTitledInstance(collectorStr);
613 getAgentService().save(agent);
614 state.putAgentBase(collectorStr, agent);
615 }
616 ge.setCollector(agent);
617
618 if(date != null){
619 ge.setGatheringDate(date);
620 }
621
622 getEventBaseService().save(ge);
623 fieldUnit.setGatheringEvent(ge);
624
625 if(fieldNumber != null) {
626 fieldUnit.setFieldNumber(fieldNumber);
627 }
628 getOccurrenceService().save(fieldUnit);
629
630 }
631
632 return fieldUnit;
633 }
634
635 protected Partial parseDate(String regNumber, String dateStr) {
636
637 Partial pupDate = null;
638 boolean parseError = false;
639
640 String day = null;
641 String month = null;
642 String monthName = null;
643 String year = null;
644
645 for(Pattern p : datePatterns){
646 Matcher m2 = p.matcher(dateStr);
647 if(m2.matches()){
648 try {
649 year = m2.group("year");
650 } catch (IllegalArgumentException e){
651 // named capture group not found
652 }
653 try {
654 month = m2.group("month");
655 } catch (IllegalArgumentException e){
656 // named capture group not found
657 }
658
659 try {
660 monthName = m2.group("monthName");
661 month = monthFromName(monthName, regNumber);
662 if(month == null){
663 parseError = true;
664 }
665 } catch (IllegalArgumentException e){
666 // named capture group not found
667 }
668 try {
669 day = m2.group("day");
670 } catch (IllegalArgumentException e){
671 // named capture group not found
672 }
673
674 if(year != null){
675 if (year.length() == 2) {
676 // it is an abbreviated year from the 19** years
677 year = "19" + year;
678 }
679 break;
680 } else {
681 parseError = true;
682 }
683 }
684 }
685 if(year == null){
686 parseError = true;
687 }
688 List<DateTimeFieldType> types = new ArrayList<>();
689 List<Integer> values = new ArrayList<>();
690 if(!parseError) {
691 types.add(DateTimeFieldType.year());
692 values.add(Integer.parseInt(year));
693 if (month != null) {
694 types.add(DateTimeFieldType.monthOfYear());
695 values.add(Integer.parseInt(month));
696 }
697 if (day != null) {
698 types.add(DateTimeFieldType.dayOfMonth());
699 values.add(Integer.parseInt(day));
700 }
701 pupDate = new Partial(types.toArray(new DateTimeFieldType[types.size()]), ArrayUtils.toPrimitive(values.toArray(new Integer[values.size()])));
702 }
703 return pupDate;
704 }
705
706 private String monthFromName(String monthName, String regNumber) {
707
708 Integer month = monthFromNameMap.get(monthName.toLowerCase());
709 if(month == null){
710 logger.warn(csvReportLine(regNumber, "Unknown month name", monthName));
711 return null;
712 } else {
713 return month.toString();
714 }
715 }
716
717
718 private void addSpecimenTypes(IBotanicalName taxonName, FieldUnit fieldUnit, String typeStr, TypesName typeName, boolean multiple, String regNumber, SpecimenOrObservationType specimenType){
719
720 if(StringUtils.isEmpty(typeStr)){
721 return;
722 }
723 typeStr = typeStr.trim().replaceAll("\\.$", "");
724
725 Collection collection = null;
726 DerivedUnit specimen = null;
727
728 List<DerivedUnit> specimens = new ArrayList<>();
729 if(multiple){
730 String[] tokens = typeStr.split("\\s?,\\s?");
731 for (String t : tokens) {
732 // command to list all complex parsabel types:
733 // csvcut -t -c RegistrationNo_Pk,Type iapt.csv | csvgrep -c Type -m "Holotype" | egrep -o 'Holotype:\s([A-Z]*\s)[^.]*?'
734 // csvcut -t -c RegistrationNo_Pk,Type iapt.csv | csvgrep -c Type -m "Holotype" | egrep -o 'Isotype[^:]*:\s([A-Z]*\s)[^.]*?'
735
736 if(!t.isEmpty()){
737 // trying to parse the string
738 specimen = parseSpecimenType(fieldUnit, typeName, collection, t, regNumber);
739 if(specimen != null){
740 specimens.add(specimen);
741 } else {
742 // parsing was not successful make simple specimen
743 specimens.add(makeSpecimenType(fieldUnit, t, specimenType));
744 }
745 }
746 }
747 } else {
748 specimen = parseSpecimenType(fieldUnit, typeName, collection, typeStr, regNumber);
749 if(specimen != null) {
750 specimens.add(specimen);
751 // remember current collection
752 collection = specimen.getCollection();
753 } else {
754 // parsing was not successful make simple specimen
755 specimens.add(makeSpecimenType(fieldUnit, typeStr, SpecimenOrObservationType.PreservedSpecimen));
756 }
757 }
758
759 for(DerivedUnit s : specimens){
760 taxonName.addSpecimenTypeDesignation(s, typeName.status(), null, null, null, false, true);
761 }
762 }
763
764 private DerivedUnit makeSpecimenType(FieldUnit fieldUnit, String titleCache, SpecimenOrObservationType specimenType) {
765 DerivedUnit specimen;DerivedUnitFacade facade = DerivedUnitFacade.NewInstance(specimenType, fieldUnit);
766 facade.setTitleCache(titleCache.trim(), true);
767 specimen = facade.innerDerivedUnit();
768 return specimen;
769 }
770
771 /**
772 *
773 * @param fieldUnit
774 * @param typeName
775 * @param collection
776 * @param text
777 * @param regNumber
778 * @return
779 */
780 protected DerivedUnit parseSpecimenType(FieldUnit fieldUnit, TypesName typeName, Collection collection, String text, String regNumber) {
781
782 DerivedUnit specimen = null;
783
784 String collectionCode = null;
785 String subCollectionStr = null;
786 String instituteStr = null;
787 String accessionNumber = null;
788
789 boolean unusualAccessionNumber = false;
790
791 text = text.trim();
792
793 // 1. For Isotypes often the accession number is noted alone if the
794 // preceeding entry has a collection code.
795 if(typeName .equals(TypesName.isotype) && collection != null){
796 Matcher m = accessionNumberOnlyPattern.matcher(text);
797 if(m.matches()){
798 try {
799 accessionNumber = m.group("accNumber");
800 specimen = makeSpecimenType(fieldUnit, collection, accessionNumber);
801 } catch (IllegalArgumentException e){
802 // match group acc_number not found
803 }
804 }
805 }
806
807 //2. try it the 'normal' way
808 if(specimen == null) {
809 for (Pattern p : specimenTypePatterns) {
810 Matcher m = p.matcher(text);
811 if (m.matches()) {
812 // collection code or collectionTitle is mandatory
813 try {
814 collectionCode = m.group("colCode");
815 } catch (IllegalArgumentException e){
816 // match group colCode not found
817 }
818
819 try {
820 instituteStr = m.group("institute");
821 } catch (IllegalArgumentException e){
822 // match group col_name not found
823 }
824
825 try {
826 subCollectionStr = m.group("subCollection");
827 } catch (IllegalArgumentException e){
828 // match group subCollection not found
829 }
830 try {
831 accessionNumber = m.group("accNumber");
832
833 // try to improve the accessionNumber
834 if(accessionNumber!= null) {
835 accessionNumber = accessionNumber.trim();
836 Matcher m2 = accessionNumberOnlyPattern.matcher(accessionNumber);
837 String betterAccessionNumber = null;
838 if (m2.matches()) {
839 try {
840 betterAccessionNumber = m.group("accNumber");
841 } catch (IllegalArgumentException e) {
842 // match group acc_number not found
843 }
844 }
845 if (betterAccessionNumber != null) {
846 accessionNumber = betterAccessionNumber;
847 } else {
848 unusualAccessionNumber = true;
849 }
850 }
851
852 } catch (IllegalArgumentException e){
853 // match group acc_number not found
854 }
855
856 if(collectionCode == null && instituteStr == null){
857 logger.warn(csvReportLine(regNumber, "Type: neither 'collectionCode' nor 'institute' found in ", text));
858 continue;
859 }
860 collection = getCollection(collectionCode, instituteStr, subCollectionStr);
861 specimen = makeSpecimenType(fieldUnit, collection, accessionNumber);
862 break;
863 }
864 }
865 }
866 if(specimen == null) {
867 logger.warn(csvReportLine(regNumber, "Type: Could not parse specimen", typeName.name().toString(), text));
868 }
869 if(unusualAccessionNumber){
870 logger.warn(csvReportLine(regNumber, "Type: Unusual accession number", typeName.name().toString(), text, accessionNumber));
871 }
872 return specimen;
873 }
874
875 private DerivedUnit makeSpecimenType(FieldUnit fieldUnit, Collection collection, String accessionNumber) {
876
877 DerivedUnitFacade facade = DerivedUnitFacade.NewInstance(SpecimenOrObservationType.PreservedSpecimen, fieldUnit);
878 facade.setCollection(collection);
879 if(accessionNumber != null){
880 facade.setAccessionNumber(accessionNumber);
881 }
882 return facade.innerDerivedUnit();
883 }
884
885 private TaxonName makeBotanicalName(SimpleExcelTaxonImportState<CONFIG> state, String regNumber, String titleCacheStr, String nameStr,
886 String authorStr, String nomRefTitle) {
887
888 TaxonName taxonName;// cache field for the taxonName.titleCache
889 String taxonNameTitleCache = null;
890 Map<String, AnnotationType> nameAnnotations = new HashMap<>();
891
892 // TitleCache preprocessing
893 if(titleCacheStr.endsWith(ANNOTATION_MARKER_STRING) || (authorStr != null && authorStr.endsWith(ANNOTATION_MARKER_STRING))){
894 nameAnnotations.put("Author abbreviation not checked.", AnnotationType.EDITORIAL());
895 titleCacheStr = titleCacheStr.replace(ANNOTATION_MARKER_STRING, "").trim();
896 if(authorStr != null) {
897 authorStr = authorStr.replace(ANNOTATION_MARKER_STRING, "").trim();
898 }
899 }
900
901 // parse the full taxon name
902 if(!StringUtils.isEmpty(nomRefTitle)){
903 String referenceSeparator = nomRefTitle.startsWith("in ") ? " " : ", ";
904 String taxonFullNameStr = titleCacheStr + referenceSeparator + nomRefTitle;
905 logger.debug(":::::" + taxonFullNameStr);
906 taxonName = nameParser.parseReferencedName(taxonFullNameStr, NomenclaturalCode.ICNAFP, null);
907 } else {
908 taxonName = (TaxonName) nameParser.parseFullName(titleCacheStr, NomenclaturalCode.ICNAFP, null);
909 }
910
911 taxonNameTitleCache = taxonName.getTitleCache().trim();
912 if (taxonName.isProtectedTitleCache()) {
913 logger.warn(csvReportLine(regNumber, "Name could not be parsed", titleCacheStr));
914 } else {
915
916 boolean doRestoreTitleCacheStr = false;
917
918 // Check if titleCache and nameCache are plausible
919 String titleCacheCompareStr = titleCacheStr;
920 String nameCache = taxonName.getNameCache();
921 String nameCompareStr = nameStr;
922 if(taxonName.isBinomHybrid()){
923 titleCacheCompareStr = titleCacheCompareStr.replace(" x ", " ×");
924 nameCompareStr = nameCompareStr.replace(" x ", " ×");
925 }
926 if(taxonName.isMonomHybrid()){
927 titleCacheCompareStr = titleCacheCompareStr.replaceAll("^X ", "× ");
928 nameCompareStr = nameCompareStr.replace("^X ", "× ");
929 }
930 if(authorStr != null && authorStr.contains(" et ")){
931 titleCacheCompareStr = titleCacheCompareStr.replaceAll(" et ", " & ");
932 }
933 if (!taxonNameTitleCache.equals(titleCacheCompareStr)) {
934 logger.warn(csvReportLine(regNumber, "The generated titleCache differs from the imported string", taxonNameTitleCache, " != ", titleCacheStr, " ==> original titleCacheStr has been restored"));
935 doRestoreTitleCacheStr = true;
936 }
937 if (!nameCache.trim().equals(nameCompareStr)) {
938 logger.warn(csvReportLine(regNumber, "The parsed nameCache differs from field '" + NAMESTRING + "'", nameCache, " != ", nameCompareStr));
939 }
940
941 // Author
942 //nameParser.handleAuthors(taxonName, titleCacheStr, authorStr);
943 //if (!titleCacheStr.equals(taxonName.getTitleCache())) {
944 // logger.warn(regNumber + ": titleCache has changed after setting authors, will restore original titleCacheStr");
945 // doRestoreTitleCacheStr = true;
946 //}
947
948 if(doRestoreTitleCacheStr){
949 taxonName.setTitleCache(titleCacheStr, true);
950 }
951
952 // deduplicate
953 replaceAuthorNamesAndNomRef(state, taxonName);
954 }
955
956 // Annotations
957 if(!nameAnnotations.isEmpty()){
958 for(String text : nameAnnotations.keySet()){
959 taxonName.addAnnotation(Annotation.NewInstance(text, nameAnnotations.get(text), Language.DEFAULT()));
960 }
961 }
962
963 taxonName.addSource(OriginalSourceType.Import, regNumber, null, state.getConfig().getSourceReference(), null);
964
965 getNameService().save(taxonName);
966
967 return taxonName;
968 }
969
970 /**
971 * @param state
972 * @return
973 */
974 private TaxonNode getClassificationRootNode(IAPTImportState state) {
975
976 // Classification classification = state.getClassification();
977 // if (classification == null){
978 // IAPTImportConfigurator config = state.getConfig();
979 // classification = Classification.NewInstance(state.getConfig().getClassificationName());
980 // classification.setUuid(config.getClassificationUuid());
981 // classification.setReference(config.getSecReference());
982 // classification = getClassificationService().find(state.getConfig().getClassificationUuid());
983 // }
984 TaxonNode rootNode = state.getRootNode();
985 if (rootNode == null){
986 rootNode = getTaxonNodeService().find(ROOT_UUID);
987 }
988 if (rootNode == null){
989 Classification classification = state.getClassification();
990 if (classification == null){
991 Reference sec = state.getSecReference();
992 String classificationName = state.getConfig().getClassificationName();
993 Language language = Language.DEFAULT();
994 classification = Classification.NewInstance(classificationName, sec, language);
995 state.setClassification(classification);
996 classification.setUuid(state.getConfig().getClassificationUuid());
997 classification.getRootNode().setUuid(ROOT_UUID);
998 getClassificationService().save(classification);
999 }
1000 rootNode = classification.getRootNode();
1001 state.setRootNode(rootNode);
1002 }
1003 return rootNode;
1004 }
1005
1006 private Collection getCollection(String collectionCode, String instituteStr, String subCollectionStr){
1007
1008 Collection superCollection = null;
1009 if(subCollectionStr != null){
1010 superCollection = getCollection(collectionCode, instituteStr, null);
1011 collectionCode = subCollectionStr;
1012 instituteStr = null;
1013 }
1014
1015 final String key = collectionCode + "-#i:" + StringUtils.defaultString(instituteStr);
1016
1017 Collection collection = collectionMap.get(key);
1018
1019 if(collection == null) {
1020 collection = Collection.NewInstance();
1021 collection.setCode(collectionCode);
1022 if(instituteStr != null){
1023 collection.setInstitute(Institution.NewNamedInstance(instituteStr));
1024 }
1025 if(superCollection != null){
1026 collection.setSuperCollection(superCollection);
1027 }
1028 collectionMap.put(key, collection);
1029 if(!_testMode) {
1030 getCollectionService().save(collection);
1031 }
1032 }
1033
1034 return collection;
1035 }
1036
1037
1038 /**
1039 * @param record
1040 * @param originalKey
1041 * @param doUnescapeHtmlEntities
1042 * @return
1043 */
1044 private String getValue(Map<String, String> record, String originalKey, boolean doUnescapeHtmlEntities) {
1045 String value = record.get(originalKey);
1046
1047 value = fixCharacters(value);
1048
1049 if (! StringUtils.isBlank(value)) {
1050 if (logger.isDebugEnabled()) {
1051 logger.debug(originalKey + ": " + value);
1052 }
1053 value = CdmUtils.removeDuplicateWhitespace(value.trim()).toString();
1054 if(doUnescapeHtmlEntities){
1055 value = StringEscapeUtils.unescapeHtml(value);
1056 }
1057 return value.trim();
1058 }else{
1059 return null;
1060 }
1061 }
1062
1063 /**
1064 * Fixes broken characters.
1065 * For details see
1066 * https://dev.e-taxonomy.eu/redmine/issues/6035
1067 *
1068 * @param value
1069 * @return
1070 */
1071 private String fixCharacters(String value) {
1072
1073 value = StringUtils.replace(value, "s$K", "š");
1074 value = StringUtils.replace(value, "n$K", "ň");
1075 value = StringUtils.replace(value, "e$K", "ě");
1076 value = StringUtils.replace(value, "r$K", "ř");
1077 value = StringUtils.replace(value, "c$K", "č");
1078 value = StringUtils.replace(value, "z$K", "ž");
1079 value = StringUtils.replace(value, "S>U$K", "Š");
1080 value = StringUtils.replace(value, "C>U$K", "Č");
1081 value = StringUtils.replace(value, "R>U$K", "Ř");
1082 value = StringUtils.replace(value, "Z>U$K", "Ž");
1083 value = StringUtils.replace(value, "g$K", "ǧ");
1084 value = StringUtils.replace(value, "s$A", "ś");
1085 value = StringUtils.replace(value, "n$A", "ń");
1086 value = StringUtils.replace(value, "c$A", "ć");
1087 value = StringUtils.replace(value, "e$E", "ę");
1088 value = StringUtils.replace(value, "o$H", "õ");
1089 value = StringUtils.replace(value, "s$C", "ş");
1090 value = StringUtils.replace(value, "t$C", "ț");
1091 value = StringUtils.replace(value, "S>U$C", "Ş");
1092 value = StringUtils.replace(value, "a$O", "å");
1093 value = StringUtils.replace(value, "A>U$O", "Å");
1094 value = StringUtils.replace(value, "u$O", "ů");
1095 value = StringUtils.replace(value, "g$B", "ğ");
1096 value = StringUtils.replace(value, "g$B", "ĕ");
1097 value = StringUtils.replace(value, "a$B", "ă");
1098 value = StringUtils.replace(value, "l$/", "ł");
1099 value = StringUtils.replace(value, ">i", "ı");
1100 value = StringUtils.replace(value, "i$U", "ï");
1101 // Special-cases
1102 value = StringUtils.replace(value, "&yacute", "ý");
1103 value = StringUtils.replace(value, ">L", "Ł"); // corrected rule
1104 value = StringUtils.replace(value, "E>U$D", "З");
1105 value = StringUtils.replace(value, "S>U$E", "Ş");
1106 value = StringUtils.replace(value, "s$E", "ş");
1107
1108 value = StringUtils.replace(value, "c$k", "č");
1109 value = StringUtils.replace(value, " U$K", " Š");
1110
1111 value = StringUtils.replace(value, "O>U>!", "Ø");
1112 value = StringUtils.replace(value, "o>!", "ø");
1113 value = StringUtils.replace(value, "S$K", "Ŝ");
1114 value = StringUtils.replace(value, ">l", "ğ");
1115
1116 value = StringUtils.replace(value, "§B>i", "ł");
1117 value = StringUtils.replace(value, "¤", "ń");
1118
1119 return value;
1120 }
1121
1122
1123 /**
1124 * Stores taxa records in DB
1125 */
1126 @Override
1127 protected void firstPass(SimpleExcelTaxonImportState<CONFIG> state) {
1128
1129 if(excludeFromImport(state)){
1130 return;
1131 }
1132
1133 String lineNumber = "L#" + state.getCurrentLine() + ": ";
1134 LogUtils.setLevel(logger, Level.DEBUG);
1135 Map<String, String> record = state.getOriginalRecord();
1136 logger.debug(lineNumber + record.toString());
1137
1138 Set<String> keys = record.keySet();
1139 for (String key: keys) {
1140 if (! expectedKeys.contains(key)){
1141 logger.warn(lineNumber + "Unexpected Key: " + key);
1142 }
1143 }
1144
1145 String reg_id = record.get(REGISTRATIONNO_PK);
1146
1147 //higherTaxon
1148 String higherTaxaString = record.get(HIGHERTAXON);
1149 boolean isFossil = false;
1150 if(higherTaxaString.startsWith("FOSSIL ")){
1151 higherTaxaString = higherTaxaString.replace("FOSSIL ", "");
1152 isFossil = true;
1153 }
1154 TaxonNode higherTaxon = getHigherTaxon(higherTaxaString, (IAPTImportState)state);
1155
1156 //Taxon
1157 Taxon taxon = makeTaxon(record, state, higherTaxon, isFossil);
1158 if (taxon == null){
1159 logger.warn(lineNumber + "taxon could not be created and is null");
1160 return;
1161 }
1162 ((IAPTImportState)state).setCurrentTaxon(taxon);
1163
1164 // Registration
1165 IAPTRegData regData = makeIAPTRegData(state);
1166 ObjectMapper mapper = new ObjectMapper();
1167 try {
1168 String regdataJson = mapper.writeValueAsString(regData);
1169 Extension.NewInstance(taxon.getName(), regdataJson, getExtensionTypeIAPTRegData());
1170 getNameService().save(taxon.getName());
1171 } catch (JsonProcessingException e) {
1172 logger.error("Error on converting IAPTRegData", e);
1173 }
1174
1175 logger.info("#of imported Genera: " + ((IAPTImportState) state).getGenusTaxonMap().size());
1176 return;
1177 }
1178
1179 private boolean excludeFromImport(SimpleExcelTaxonImportState<CONFIG> state) {
1180 if(state.getConfig().isDoAlgeaeOnly()){
1181 boolean include = false;
1182 String higherTaxon = getValue(state.getOriginalRecord(), HIGHERTAXON, true);
1183 String fullNameStr = getValue(state.getOriginalRecord(), FULLNAME, true);
1184 include |= higherTaxon.matches(".*?PHYCEAE(?:$|\\s+)");
1185 for(String test : new String[]{
1186 "Bolidophyceae ",
1187 "Phaeothamniophyceae ",
1188 "Bolidomonadales ",
1189 "Bolidomonadaceae ",
1190 "Aureoumbra ",
1191 "Bolidomonas ",
1192 "Seagriefia ",
1193 "Navicula "
1194 }) {
1195 include |= fullNameStr.startsWith(test);
1196 }
1197 return !include;
1198 }
1199
1200 return false;
1201 }
1202
1203 private ExtensionType getExtensionTypeIAPTRegData() {
1204 if(extensionTypeIAPTRegData == null){
1205 extensionTypeIAPTRegData = ExtensionType.NewInstance("IAPTRegData.json", "IAPTRegData.json", "");
1206 getTermService().save(extensionTypeIAPTRegData);
1207 }
1208 return extensionTypeIAPTRegData;
1209 }
1210
1211 private IAPTRegData makeIAPTRegData(SimpleExcelTaxonImportState<CONFIG> state) {
1212
1213 Map<String, String> record = state.getOriginalRecord();
1214 String registrationStr = getValue(record, REGISTRATION);
1215 String regDateStr = getValue(record, REGDATE);
1216 String regStr = getValue(record, REGISTRATION, true);
1217
1218 String dateStr = null;
1219 String office = null;
1220 Integer regID = null;
1221 Integer formNo = null;
1222
1223 Matcher m = registrationPattern.matcher(registrationStr);
1224 if(m.matches()){
1225 dateStr = m.group("regdate");
1226 if(parseDate( regStr, dateStr) == null){
1227 // check for valid dates
1228 logger.warn(csvReportLine(regStr, REGISTRATION + ": could not parse date", dateStr, " in ", registrationStr));
1229 }
1230 office = m.group("office");
1231 regID = Integer.valueOf(m.group("regid"));
1232 try {
1233 formNo = Integer.valueOf(m.group("formNo"));
1234 } catch(IllegalArgumentException e){
1235 // ignore
1236 }
1237 } else {
1238 logger.warn(csvReportLine(regStr, REGISTRATION + ": could not be parsed", registrationStr));
1239 }
1240 IAPTRegData regData = new IAPTRegData(dateStr, office, regID, formNo);
1241 return regData;
1242 }
1243
1244 private TaxonNode getHigherTaxon(String higherTaxaString, IAPTImportState state) {
1245 String[] higherTaxaNames = higherTaxaString.toLowerCase().replaceAll("[\\[\\]]", "").split(":");
1246 TaxonNode higherTaxonNode = null;
1247
1248 ITaxonTreeNode rootNode = getClassificationRootNode(state);
1249 for (String htn : higherTaxaNames) {
1250 htn = StringUtils.capitalize(htn.trim());
1251 Taxon higherTaxon = state.getHigherTaxon(htn);
1252 if (higherTaxon != null){
1253 higherTaxonNode = higherTaxon.getTaxonNodes().iterator().next();
1254 }else{
1255 IBotanicalName name = makeHigherTaxonName(state, htn);
1256 Reference sec = state.getSecReference();
1257 higherTaxon = Taxon.NewInstance(name, sec);
1258 getTaxonService().save(higherTaxon);
1259 higherTaxonNode = rootNode.addChildTaxon(higherTaxon, sec, null);
1260 state.putHigherTaxon(htn, higherTaxon);
1261 getClassificationService().saveTreeNode(higherTaxonNode);
1262 }
1263 rootNode = higherTaxonNode;
1264 }
1265 return higherTaxonNode;
1266 }
1267
1268 private IBotanicalName makeHigherTaxonName(IAPTImportState state, String name) {
1269
1270 Rank rank = guessRank(name);
1271
1272 IBotanicalName taxonName = TaxonNameFactory.NewBotanicalInstance(rank);
1273 taxonName.addSource(makeOriginalSource(state));
1274 taxonName.setGenusOrUninomial(StringUtils.capitalize(name));
1275 return taxonName;
1276 }
1277
1278 private Rank guessRank(String name) {
1279
1280 // normalize
1281 name = name.replaceAll("\\(.*\\)", "").trim();
1282
1283 if(name.matches("^Plantae$|^Fungi$")){
1284 return Rank.KINGDOM();
1285 } else if(name.matches("^Incertae sedis$|^No group assigned$")){
1286 return rankFamilyIncertisSedis();
1287 } else if(name.matches(".*phyta$|.*mycota$")){
1288 return Rank.PHYLUM();
1289 } else if(name.matches(".*phytina$|.*mycotina$")){
1290 return Rank.SUBPHYLUM();
1291 } else if(name.matches("Gymnospermae$|.*ones$")){ // Monocotyledones, Dicotyledones
1292 return rankUnrankedSupraGeneric();
1293 } else if(name.matches(".*opsida$|.*phyceae$|.*mycetes$|.*ones$|^Musci$|^Hepaticae$")){
1294 return Rank.CLASS();
1295 } else if(name.matches(".*idae$|.*phycidae$|.*mycetidae$")){
1296 return Rank.SUBCLASS();
1297 } else if(name.matches(".*ales$")){
1298 return Rank.ORDER();
1299 } else if(name.matches(".*ineae$")){
1300 return Rank.SUBORDER();
1301 } else if(name.matches(".*aceae$")){
1302 return Rank.FAMILY();
1303 } else if(name.matches(".*oideae$")){
1304 return Rank.SUBFAMILY();
1305 } else
1306 // if(name.matches(".*eae$")){
1307 // return Rank.TRIBE();
1308 // } else
1309 if(name.matches(".*inae$")){
1310 return Rank.SUBTRIBE();
1311 } else if(name.matches(".*ae$")){
1312 return Rank.FAMILY();
1313 }
1314 return Rank.UNKNOWN_RANK();
1315 }
1316
1317 private Rank rankUnrankedSupraGeneric() {
1318
1319 if(rankUnrankedSupraGeneric == null){
1320 rankUnrankedSupraGeneric = Rank.NewInstance(RankClass.Suprageneric, "Unranked supra generic", " ", " ");
1321 getTermService().save(rankUnrankedSupraGeneric);
1322 }
1323 return rankUnrankedSupraGeneric;
1324 }
1325
1326 private Rank rankFamilyIncertisSedis() {
1327
1328 if(familyIncertisSedis == null){
1329 familyIncertisSedis = Rank.NewInstance(RankClass.Suprageneric, "Family incertis sedis", " ", " ");
1330 getTermService().save(familyIncertisSedis);
1331 }
1332 return familyIncertisSedis;
1333 }
1334
1335 private AnnotationType annotationTypeCaveats(){
1336 if(annotationTypeCaveats == null){
1337 annotationTypeCaveats = AnnotationType.NewInstance("Caveats", "Caveats", "");
1338 getTermService().save(annotationTypeCaveats);
1339 }
1340 return annotationTypeCaveats;
1341 }
1342
1343
1344 /**
1345 * @param state
1346 * @return
1347 */
1348 private IdentifiableSource makeOriginalSource(IAPTImportState state) {
1349 return IdentifiableSource.NewDataImportInstance("line: " + state.getCurrentLine(), null, state.getConfig().getSourceReference());
1350 }
1351
1352
1353 private Reference makeReference(IAPTImportState state, UUID uuidRef) {
1354 Reference ref = state.getReference(uuidRef);
1355 if (ref == null){
1356 ref = getReferenceService().find(uuidRef);
1357 state.putReference(uuidRef, ref);
1358 }
1359 return ref;
1360 }
1361
1362 private MarkerType markerTypeFossil(){
1363 if(this.markerTypeFossil == null){
1364 markerTypeFossil = MarkerType.NewInstance("isFossilTaxon", "isFossil", null);
1365 getTermService().save(this.markerTypeFossil);
1366 }
1367 return markerTypeFossil;
1368 }
1369
1370 private MarkerType markerDuplicateRegistration(){
1371 if(this.duplicateRegistration == null){
1372 duplicateRegistration = MarkerType.NewInstance("duplicateRegistration", "duplicateRegistration", null);
1373 getTermService().save(this.duplicateRegistration);
1374 }
1375 return markerTypeFossil;
1376 }
1377
1378 private String csvReportLine(String regId, String message, String ... fields){
1379 StringBuilder out = new StringBuilder("regID#");
1380 out.append(regId).append(",\"").append(message).append('"');
1381
1382 for(String f : fields){
1383 out.append(",\"").append(f).append('"');
1384 }
1385 return out.toString();
1386 }
1387
1388
1389 }