fix #6094 importing 'registration' data into an Extension object
[cdmlib-apps.git] / app-import / src / main / java / eu / etaxonomy / cdm / io / iapt / IAPTExcelImport.java
1 /**
2 * Copyright (C) 2007 EDIT
3 * European Distributed Institute of Taxonomy
4 * http://www.e-taxonomy.eu
5 *
6 * The contents of this file are subject to the Mozilla Public License Version 1.1
7 * See LICENSE.TXT at the top of this package for the full license terms.
8 */
9
10 package eu.etaxonomy.cdm.io.iapt;
11
12 import com.fasterxml.jackson.core.JsonProcessingException;
13 import com.fasterxml.jackson.databind.ObjectMapper;
14 import eu.etaxonomy.cdm.api.facade.DerivedUnitFacade;
15 import eu.etaxonomy.cdm.common.CdmUtils;
16 import eu.etaxonomy.cdm.io.mexico.SimpleExcelTaxonImport;
17 import eu.etaxonomy.cdm.io.mexico.SimpleExcelTaxonImportState;
18 import eu.etaxonomy.cdm.model.agent.Institution;
19 import eu.etaxonomy.cdm.model.agent.Person;
20 import eu.etaxonomy.cdm.model.agent.TeamOrPersonBase;
21 import eu.etaxonomy.cdm.model.common.*;
22 import eu.etaxonomy.cdm.model.name.*;
23 import eu.etaxonomy.cdm.model.occurrence.*;
24 import eu.etaxonomy.cdm.model.occurrence.Collection;
25 import eu.etaxonomy.cdm.model.reference.Reference;
26 import eu.etaxonomy.cdm.model.reference.ReferenceFactory;
27 import eu.etaxonomy.cdm.model.taxon.*;
28 import eu.etaxonomy.cdm.strategy.parser.NonViralNameParserImpl;
29 import org.apache.commons.lang.ArrayUtils;
30 import org.apache.commons.lang.StringEscapeUtils;
31 import org.apache.commons.lang.StringUtils;
32 import org.apache.log4j.Level;
33 import org.apache.log4j.Logger;
34 import org.joda.time.DateTimeFieldType;
35 import org.joda.time.Partial;
36 import org.joda.time.format.DateTimeFormat;
37 import org.joda.time.format.DateTimeFormatter;
38 import org.springframework.stereotype.Component;
39
40 import java.util.*;
41 import java.util.regex.Matcher;
42 import java.util.regex.Pattern;
43
44 /**
45 * @author a.mueller
46 * @created 05.01.2016
47 */
48
49 @Component("iAPTExcelImport")
50 public class IAPTExcelImport<CONFIG extends IAPTImportConfigurator> extends SimpleExcelTaxonImport<CONFIG> {
51 private static final long serialVersionUID = -747486709409732371L;
52 private static final Logger logger = Logger.getLogger(IAPTExcelImport.class);
53 public static final String ANNOTATION_MARKER_STRING = "[*]";
54
55
56 private static UUID ROOT_UUID = UUID.fromString("4137fd2a-20f6-4e70-80b9-f296daf51d82");
57
58 private static NonViralNameParserImpl nameParser = NonViralNameParserImpl.NewInstance();
59
60 private final static String REGISTRATIONNO_PK= "RegistrationNo_Pk";
61 private final static String HIGHERTAXON= "HigherTaxon";
62 private final static String FULLNAME= "FullName";
63 private final static String AUTHORSSPELLING= "AuthorsSpelling";
64 private final static String LITSTRING= "LitString";
65 private final static String REGISTRATION= "Registration";
66 private final static String TYPE= "Type";
67 private final static String CAVEATS= "Caveats";
68 private final static String FULLBASIONYM= "FullBasionym";
69 private final static String FULLSYNSUBST= "FullSynSubst";
70 private final static String NOTESTXT= "NotesTxt";
71 private final static String REGDATE= "RegDate";
72 private final static String NAMESTRING= "NameString";
73 private final static String BASIONYMSTRING= "BasionymString";
74 private final static String SYNSUBSTSTR= "SynSubstStr";
75 private final static String AUTHORSTRING= "AuthorString";
76
77 private static List<String> expectedKeys= Arrays.asList(new String[]{
78 REGISTRATIONNO_PK, HIGHERTAXON, FULLNAME, AUTHORSSPELLING, LITSTRING, REGISTRATION, TYPE, CAVEATS, FULLBASIONYM, FULLSYNSUBST, NOTESTXT, REGDATE, NAMESTRING, BASIONYMSTRING, SYNSUBSTSTR, AUTHORSTRING});
79
80 private static final Pattern nomRefTokenizeP = Pattern.compile("^(?<title>.*):\\s(?<detail>[^\\.:]+)\\.(?<date>.*?)(?:\\s\\((?<issue>[^\\)]*)\\)\\s*)?\\.?$");
81 private static final Pattern[] datePatterns = new Pattern[]{
82 // NOTE:
83 // The order of the patterns is extremely important!!!
84 //
85 // all patterns cover the years 1700 - 1999
86 Pattern.compile("^(?<year>1[7,8,9][0-9]{2})$"), // only year, like '1969'
87 Pattern.compile("^(?<monthName>\\p{L}+\\.?)\\s(?<day>[0-9]{1,2})(?:st|rd|th)?\\.?,?\\s(?<year>(?:1[7,8,9])?[0-9]{2})$"), // full date like April 12, 1969 or april 12th 1999
88 Pattern.compile("^(?<monthName>\\p{L}+\\.?),?\\s?(?<year>(?:1[7,8,9])?[0-9]{2})$"), // April 99 or April, 1999 or Apr. 12
89 Pattern.compile("^(?<day>[0-9]{1,2})([\\.\\-/])(\\s?)(?<month>[0-1]?[0-9])\\2\\3(?<year>(?:1[7,8,9])?[0-9]{2})$"), // full date like 12.04.1969 or 12. 04. 1969 or 12/04/1969 or 12-04-1969
90 Pattern.compile("^(?<day>[0-9]{1,2})([\\.\\-/])(?<monthName>[IVX]{1,2})\\2(?<year>(?:1[7,8,9])?[0-9]{2})$"), // full date like 12-VI-1969
91 Pattern.compile("^(?:(?<day>[0-9]{1,2})(?:\\sde)?\\s)?(?<monthName>\\p{L}+)(?:\\sde)?\\s(?<year>(?:1[7,8,9])?[0-9]{2})$"), // full and partial date like 12 de Enero de 1999 or Enero de 1999
92 Pattern.compile("^(?<month>[0-1]?[0-9])([\\.\\-/])(?<year>(?:1[7,8,9])?[0-9]{2})$"), // partial date like 04.1969 or 04/1969 or 04-1969
93 Pattern.compile("^(?<year>(?:1[7,8,9])?[0-9]{2})([\\.\\-/])(?<month>[0-1]?[0-9])$"),// partial date like 1999-04
94 Pattern.compile("^(?<monthName>[IVX]{1,2})([\\.\\-/])(?<year>(?:1[7,8,9])?[0-9]{2})$"), // partial date like VI-1969
95 Pattern.compile("^(?<day>[0-9]{1,2})(?:[\\./]|th|rd|st)?\\s(?<monthName>\\p{L}+\\.?),?\\s?(?<year>(?:1[7,8,9])?[0-9]{2})$"), // full date like 12. April 1969 or april 1999 or 22 Dec.1999
96 };
97 protected static final Pattern typeSpecimenSplitPattern = Pattern.compile("^(?:\"*[Tt]ype: (?<fieldUnit>.*?))(?:[Hh]olotype:(?<holotype>.*?)\\.?)?(?:[Ii]sotype[^:]*:(?<isotype>.*)\\.?)?\\.?$");
98
99 private static final Pattern typeNameBasionymPattern = Pattern.compile("\\([Bb]asionym\\s?\\:\\s?(?<basionymName>[^\\)]*).*$");
100 private static final Pattern typeNameNotePattern = Pattern.compile("\\[([^\\[]*)"); // matches the inner of '[...]'
101 private static final Pattern typeNameSpecialSplitPattern = Pattern.compile("(?<note>.*\\;.*?)\\:(?<agent>)\\;(<name>.*)");
102
103 protected static final Pattern collectorPattern = Pattern.compile(".*?(?<fullStr1>\\([Ll]eg\\.\\s+(?<data1>[^\\)]*)\\)).*$|.*?(?<fullStr2>\\s[Ll]eg\\.\\:?\\s+(?<data2>.*?)\\.?)$|^(?<fullStr3>[Ll]eg\\.\\:?\\s+(?<data3>.*?)\\.?)");
104 private static final Pattern collectionDataPattern = Pattern.compile("^(?<collector>[^,]*),\\s?(?<detail>.*?)\\.?$");
105 private static final Pattern collectorsNumber = Pattern.compile("^([nN]o\\.\\s.*)$");
106
107 // AccessionNumbers: , #.*, n°:?, 96/3293, No..*, -?\w{1,3}-[0-9\-/]*
108 private static final Pattern accessionNumberOnlyPattern = Pattern.compile("^(?<accNumber>(?:n°\\:?\\s?|#|No\\.?\\s?)?[\\d\\w\\-/]*)$");
109
110 private static final Pattern[] specimenTypePatterns = new Pattern[]{
111 Pattern.compile("^(?<colCode>[A-Z]+|CPC Micropaleontology Lab\\.?)\\s+(?:\\((?<institute>.*[^\\)])\\))(?<accNumber>.*)?$"), // like: GAUF (Gansu Agricultural University) No. 1207-1222
112 Pattern.compile("^(?<colCode>[A-Z]+|CPC Micropaleontology Lab\\.?)\\s+(?:Coll\\.\\s(?<subCollection>[^\\.,;]*)(.))(?<accNumber>.*)?$"), // like KASSEL Coll. Krasske, Praep. DII 78
113 Pattern.compile("^(?:in\\s)?(?<institute>[Cc]oll\\.\\s.*?)(?:\\s+(?<accNumber>(Praep\\.|slide|No\\.|Inv\\. Nr\\.|Nr\\.).*))?$"), // like Coll. Lange-Bertalot, Bot. Inst., Univ. Frankfurt/Main, Germany Praep. Neukaledonien OTL 62
114 Pattern.compile("^(?<institute>Inst\\.\\s.*?)\\s+(?<accNumber>N\\s.*)?$"), // like Inst. Geological Sciences, Acad. Sci. Belarus, Minsk N 212 A
115 Pattern.compile("^(?<colCode>[A-Z]+)(?:\\s+(?<accNumber>.*))?$"), // identifies the Collection code and takes the rest as accessionNumber if any
116 };
117
118
119 private static final Pattern registrationPattern = Pattern.compile("^Registration date\\:\\s(?<regdate>\\d\\d\\.\\d\\d\\.\\d\\d); no\\.\\:\\s(?<regid>\\d+);\\soffice\\:\\s(?<office>.*?)\\.(?:\\s\\[Form no\\.\\:\\s(?<formNo>d+)\\])?$"); // Registration date: 29.06.98; no.: 2922; office: Berlin.
120
121 private static Map<String, Integer> monthFromNameMap = new HashMap<>();
122
123 static {
124 String[] ck = new String[]{"leden", "únor", "březen", "duben", "květen", "červen", "červenec ", "srpen", "září", "říjen", "listopad", "prosinec"};
125 String[] fr = new String[]{"janvier", "février", "mars", "avril", "mai", "juin", "juillet", "août", "septembre", "octobre", "novembre", "décembre"};
126 String[] de = new String[]{"januar", "februar", "märz", "april", "mai", "juni", "juli", "august", "september", "oktober", "november", "dezember"};
127 String[] en = new String[]{"january", "february", "march", "april", "may", "june", "july", "august", "september", "october", "november", "december"};
128 String[] it = new String[]{"gennaio", "febbraio", "marzo", "aprile", "maggio", "giugno", "luglio", "agosto", "settembre", "ottobre", "novembre", "dicembre"};
129 String[] sp = new String[]{"enero", "febrero", "marzo", "abril", "mayo", "junio", "julio", "agosto", "septiembre", "octubre", "noviembre", "diciembre"};
130 String[] de_abbrev = new String[]{"jan.", "feb.", "märz", "apr.", "mai", "jun.", "jul.", "aug.", "sept.", "okt.", "nov.", "dez."};
131 String[] en_abbrev = new String[]{"jan.", "feb.", "mar.", "apr.", "may", "jun.", "jul.", "aug.", "sep.", "oct.", "nov.", "dec."};
132 String[] port = new String[]{"Janeiro", "Fevereiro", "Março", "Abril", "Maio", "Junho", "Julho", "Agosto", "Setembro", "Outubro", "Novembro", "Dezembro"};
133 String[] rom_num = new String[]{"i", "ii", "iii", "iv", "v", "vi", "vii", "viii", "ix", "x", "xi", "xii"};
134
135 String[][] perLang = new String[][]{ck, de, fr, en, it, sp, port, de_abbrev, en_abbrev, rom_num};
136
137 for (String[] months: perLang) {
138 for(int m = 1; m < 13; m++){
139 monthFromNameMap.put(months[m - 1].toLowerCase(), m);
140 }
141 }
142
143 // special cases
144 monthFromNameMap.put("mar", 3);
145 monthFromNameMap.put("dec", 12);
146 monthFromNameMap.put("februari", 2);
147 monthFromNameMap.put("març", 3);
148 }
149
150
151 DateTimeFormatter formatterYear = DateTimeFormat.forPattern("yyyy");
152
153 private Map<String, Collection> collectionMap = new HashMap<>();
154 private ExtensionType extensionTypeIAPTRegData = null;
155
156
157 enum TypesName {
158 fieldUnit, holotype, isotype;
159
160 public SpecimenTypeDesignationStatus status(){
161 switch (this) {
162 case holotype:
163 return SpecimenTypeDesignationStatus.HOLOTYPE();
164 case isotype:
165 return SpecimenTypeDesignationStatus.ISOTYPE();
166 default:
167 return null;
168 }
169 }
170 }
171
172 private MarkerType markerTypeFossil = null;
173 private Rank rankUnrankedSupraGeneric = null;
174 private Rank familyIncertisSedis = null;
175 private AnnotationType annotationTypeCaveats = null;
176
177 private Reference bookVariedadesTradicionales = null;
178
179 /**
180 * HACK for unit simple testing
181 */
182 boolean _testMode = System.getProperty("TEST_MODE") != null;
183
184 private Taxon makeTaxon(HashMap<String, String> record, SimpleExcelTaxonImportState<CONFIG> state,
185 TaxonNode higherTaxonNode, boolean isFossil) {
186
187 String regNumber = getValue(record, REGISTRATIONNO_PK, false);
188 String regStr = getValue(record, REGISTRATION, true);
189 String titleCacheStr = getValue(record, FULLNAME, true);
190 String nameStr = getValue(record, NAMESTRING, true);
191 String authorStr = getValue(record, AUTHORSTRING, true);
192 String nomRefStr = getValue(record, LITSTRING, true);
193 String authorsSpelling = getValue(record, AUTHORSSPELLING, true);
194 String notesTxt = getValue(record, NOTESTXT, true);
195 String caveats = getValue(record, CAVEATS, true);
196 String fullSynSubstStr = getValue(record, FULLSYNSUBST, true);
197 String fullBasionymStr = getValue(record, FULLBASIONYM, true);
198 String basionymNameStr = getValue(record, FULLBASIONYM, true);
199 String synSubstStr = getValue(record, SYNSUBSTSTR, true);
200 String typeStr = getValue(record, TYPE, true);
201
202
203 String nomRefTitle = null;
204 String nomRefDetail;
205 String nomRefPupDate = null;
206 String nomRefIssue = null;
207 Partial pupDate = null;
208
209 boolean restoreOriginalReference = false;
210 boolean nameIsValid = true;
211
212 // preprocess nomRef: separate citation, reference detail, publishing date
213 if(!StringUtils.isEmpty(nomRefStr)){
214 nomRefStr = nomRefStr.trim();
215
216 // handle the special case which is hard to parse:
217 //
218 // Las variedades tradicionales de frutales de la Cuenca del Río Segura. Catálogo Etnobotánico (1): Frutos secos, oleaginosos, frutales de hueso, almendros y frutales de pepita: 154. 1997.
219 if(nomRefStr.startsWith("Las variedades tradicionales de frutales ")){
220
221 if(bookVariedadesTradicionales == null){
222 bookVariedadesTradicionales = ReferenceFactory.newBook();
223 bookVariedadesTradicionales.setTitle("Las variedades tradicionales de frutales de la Cuenca del Río Segura. Catálogo Etnobotánico (1): Frutos secos, oleaginosos, frutales de hueso, almendros y frutales de pepita");
224 bookVariedadesTradicionales.setDatePublished(TimePeriod.NewInstance(1997));
225 getReferenceService().save(bookVariedadesTradicionales);
226 }
227 nomRefStr = nomRefStr.replaceAll("^.*?\\:.*?\\:", "Las variedades tradicionales:");
228 restoreOriginalReference = true;
229 }
230
231 Matcher m = nomRefTokenizeP.matcher(nomRefStr);
232 if(m.matches()){
233 nomRefTitle = m.group("title");
234 nomRefDetail = m.group("detail");
235 nomRefPupDate = m.group("date").trim();
236 nomRefIssue = m.group("issue");
237
238 pupDate = parseDate(regNumber, nomRefPupDate);
239 if (pupDate != null) {
240 nomRefTitle = nomRefTitle + ": " + nomRefDetail + ". " + pupDate.toString(formatterYear) + ".";
241 } else {
242 logger.warn(csvReportLine(regNumber, "Pub date", nomRefPupDate, "in", nomRefStr, "not parsable"));
243 }
244 } else {
245 nomRefTitle = nomRefStr;
246 }
247 }
248
249 BotanicalName taxonName = makeBotanicalName(state, regNumber, titleCacheStr, nameStr, authorStr, nomRefTitle);
250
251 // always add the original strings of parsed data as annotation
252 taxonName.addAnnotation(Annotation.NewInstance("imported and parsed data strings:" +
253 "\n - '" + LITSTRING + "': "+ nomRefStr +
254 "\n - '" + TYPE + "': " + typeStr +
255 "\n - '" + REGISTRATION + "': " + regStr
256 , AnnotationType.TECHNICAL(), Language.DEFAULT()));
257
258 if(restoreOriginalReference){
259 taxonName.setNomenclaturalReference(bookVariedadesTradicionales);
260 }
261 if(pupDate != null) {
262 taxonName.getNomenclaturalReference().setDatePublished(TimePeriod.NewInstance(pupDate));
263 }
264 if(nomRefIssue != null) {
265 ((Reference)taxonName.getNomenclaturalReference()).setVolume(nomRefIssue);
266 }
267
268
269 if(!StringUtils.isEmpty(notesTxt)){
270 notesTxt = notesTxt.replace("Notes: ", "").trim();
271 taxonName.addAnnotation(Annotation.NewInstance(notesTxt, AnnotationType.EDITORIAL(), Language.DEFAULT()));
272 nameIsValid = false;
273
274 }
275 if(!StringUtils.isEmpty(caveats)){
276 caveats = caveats.replace("Caveats: ", "").trim();
277 taxonName.addAnnotation(Annotation.NewInstance(caveats, annotationTypeCaveats(), Language.DEFAULT()));
278 nameIsValid = false;
279 }
280
281 if(nameIsValid){
282 // Status is always considered valid if no notes and cavets are set
283 taxonName.addStatus(NomenclaturalStatus.NewInstance(NomenclaturalStatusType.VALID()));
284 }
285
286 getNameService().save(taxonName);
287
288 // Namerelations
289 if(!StringUtils.isEmpty(authorsSpelling)){
290 authorsSpelling = authorsSpelling.replaceFirst("Author's spelling:", "").replaceAll("\"", "").trim();
291
292 String[] authorSpellingTokens = StringUtils.split(authorsSpelling, " ");
293 String[] nameStrTokens = StringUtils.split(nameStr, " ");
294
295 ArrayUtils.reverse(authorSpellingTokens);
296 ArrayUtils.reverse(nameStrTokens);
297
298 for (int i = 0; i < nameStrTokens.length; i++){
299 if(i < authorSpellingTokens.length){
300 nameStrTokens[i] = authorSpellingTokens[i];
301 }
302 }
303 ArrayUtils.reverse(nameStrTokens);
304
305 String misspelledNameStr = StringUtils.join (nameStrTokens, ' ');
306 // build the fullnameString of the misspelled name
307 misspelledNameStr = taxonName.getTitleCache().replace(nameStr, misspelledNameStr);
308
309 TaxonNameBase misspelledName = (BotanicalName) nameParser.parseReferencedName(misspelledNameStr, NomenclaturalCode.ICNAFP, null);
310 misspelledName.addRelationshipToName(taxonName, NameRelationshipType.MISSPELLING(), null);
311 getNameService().save(misspelledName);
312 }
313
314 // Replaced Synonyms
315 if(!StringUtils.isEmpty(fullSynSubstStr)){
316 fullSynSubstStr = fullSynSubstStr.replace("Syn. subst.: ", "");
317 BotanicalName replacedSynonymName = makeBotanicalName(state, regNumber, fullSynSubstStr, synSubstStr, null, null);
318 replacedSynonymName.addReplacedSynonym(taxonName, null, null, null);
319 getNameService().save(replacedSynonymName);
320 }
321
322 Reference sec = state.getConfig().getSecReference();
323 Taxon taxon = Taxon.NewInstance(taxonName, sec);
324
325 // Basionym
326 if(fullBasionymStr != null){
327 fullBasionymStr = fullBasionymStr.replaceAll("^\\w*:\\s", ""); // Strip off the leading 'Basionym: "
328 basionymNameStr = basionymNameStr.replaceAll("^\\w*:\\s", ""); // Strip off the leading 'Basionym: "
329 BotanicalName basionym = makeBotanicalName(state, regNumber, fullBasionymStr, basionymNameStr, null, null);
330 getNameService().save(basionym);
331 taxonName.addBasionym(basionym);
332
333 Synonym syn = Synonym.NewInstance(basionym, sec);
334 taxon.addSynonym(syn, SynonymRelationshipType.HOMOTYPIC_SYNONYM_OF());
335 getTaxonService().save(syn);
336 }
337
338 // Markers
339 if(isFossil){
340 taxon.addMarker(Marker.NewInstance(markerTypeFossil(), true));
341 }
342
343 // Types
344 if(!StringUtils.isEmpty(typeStr)){
345
346 if(taxonName.getRank().isSpecies() || taxonName.getRank().isLower(Rank.SPECIES())) {
347 makeSpecimenTypeData(typeStr, taxonName, regNumber, state, false);
348 } else {
349 makeNameTypeData(typeStr, taxonName, regNumber, state);
350 }
351 }
352
353 getTaxonService().save(taxon);
354
355 if(taxonName.getRank().equals(Rank.SPECIES()) || taxonName.getRank().isLower(Rank.SPECIES())){
356 // try to find the genus, it should have been imported already, Genera are coming first in the import file
357 Taxon genus = ((IAPTImportState)state).getGenusTaxonMap().get(taxonName.getGenusOrUninomial());
358 if(genus != null){
359 higherTaxonNode = genus.getTaxonNodes().iterator().next();
360 } else {
361 logger.info(csvReportLine(regNumber, "Parent genus not found for", nameStr));
362 }
363 }
364
365 if(higherTaxonNode != null){
366 higherTaxonNode.addChildTaxon(taxon, null, null);
367 getTaxonNodeService().save(higherTaxonNode);
368 }
369
370 if(taxonName.getRank().isGenus()){
371 ((IAPTImportState)state).getGenusTaxonMap().put(taxonName.getGenusOrUninomial(), taxon);
372 }
373
374 return taxon;
375 }
376
377 private void makeSpecimenTypeData(String typeStr, BotanicalName taxonName, String regNumber, SimpleExcelTaxonImportState<CONFIG> state, boolean isFossil) {
378
379 Matcher m = typeSpecimenSplitPattern.matcher(typeStr);
380
381 if(m.matches()){
382 String fieldUnitStr = m.group(TypesName.fieldUnit.name());
383 // boolean isFieldUnit = typeStr.matches(".*([°']|\\d+\\s?m\\s|\\d+\\s?km\\s).*"); // check for location or unit m, km // makes no sense!!!!
384 FieldUnit fieldUnit = parseFieldUnit(fieldUnitStr, regNumber, state);
385 if(fieldUnit == null) {
386 // create a field unit with only a titleCache using the fieldUnitStr substring
387 logger.warn(csvReportLine(regNumber, "Type: fieldUnitStr can not be parsed", fieldUnitStr));
388 fieldUnit = FieldUnit.NewInstance();
389 fieldUnit.setTitleCache(fieldUnitStr, true);
390 getOccurrenceService().save(fieldUnit);
391 }
392 getOccurrenceService().save(fieldUnit);
393
394 SpecimenOrObservationType specimenType;
395 if(isFossil){
396 specimenType = SpecimenOrObservationType.Fossil;
397 } else {
398 specimenType = SpecimenOrObservationType.PreservedSpecimen;
399 }
400
401 // all others ..
402 addSpecimenTypes(taxonName, fieldUnit, m.group(TypesName.holotype.name()), TypesName.holotype, false, regNumber, specimenType);
403 addSpecimenTypes(taxonName, fieldUnit, m.group(TypesName.isotype.name()), TypesName.isotype, true, regNumber, specimenType);
404
405 } else {
406 // create a field unit with only a titleCache using the full typeStr
407 FieldUnit fieldUnit = FieldUnit.NewInstance();
408 fieldUnit.setTitleCache(typeStr, true);
409 getOccurrenceService().save(fieldUnit);
410 logger.warn(csvReportLine(regNumber, "Type: field 'Type' can not be parsed", typeStr));
411 }
412 getNameService().save(taxonName);
413 }
414
415 private void makeNameTypeData(String typeStr, BotanicalName taxonName, String regNumber, SimpleExcelTaxonImportState<CONFIG> state) {
416
417 String nameStr = typeStr.replaceAll("^Type\\s?\\:\\s?", "");
418 if(nameStr.isEmpty()) {
419 return;
420 }
421
422 String basionymNameStr = null;
423 String noteStr = null;
424 String agentStr = null;
425
426 Matcher m;
427
428 if(typeStr.startsWith("not to be indicated")){
429 // Special case:
430 // Type: not to be indicated (Art. H.9.1. Tokyo Code); stated parent genera: Hechtia Klotzsch; Deuterocohnia Mez
431 // FIXME
432 m = typeNameSpecialSplitPattern.matcher(nameStr);
433 if(m.matches()){
434 nameStr = m.group("name");
435 noteStr = m.group("note");
436 agentStr = m.group("agent");
437 // TODO better import of agent?
438 if(agentStr != null){
439 noteStr = noteStr + ": " + agentStr;
440 }
441 }
442 } else {
443 // Generic case
444 m = typeNameBasionymPattern.matcher(nameStr);
445 if (m.find()) {
446 basionymNameStr = m.group("basionymName");
447 if (basionymNameStr != null) {
448 nameStr = nameStr.replace(m.group(0), "");
449 }
450 }
451
452 m = typeNameNotePattern.matcher(nameStr);
453 if (m.find()) {
454 noteStr = m.group(1);
455 if (noteStr != null) {
456 nameStr = nameStr.replace(m.group(0), "");
457 }
458 }
459 }
460
461 BotanicalName typeName = (BotanicalName) nameParser.parseFullName(nameStr, NomenclaturalCode.ICNAFP, null);
462
463 if(typeName.isProtectedTitleCache() || typeName.getNomenclaturalReference() != null && typeName.getNomenclaturalReference().isProtectedTitleCache()) {
464 logger.warn(csvReportLine(regNumber, "NameType not parsable", typeStr, nameStr));
465 }
466
467 if(basionymNameStr != null){
468 BotanicalName basionymName = (BotanicalName) nameParser.parseFullName(nameStr, NomenclaturalCode.ICNAFP, null);
469 getNameService().save(basionymName);
470 typeName.addBasionym(basionymName);
471 }
472
473
474 NameTypeDesignation nameTypeDesignation = NameTypeDesignation.NewInstance();
475 nameTypeDesignation.setTypeName(typeName);
476 getNameService().save(typeName);
477
478 if(noteStr != null){
479 nameTypeDesignation.addAnnotation(Annotation.NewInstance(noteStr, AnnotationType.EDITORIAL(), Language.UNKNOWN_LANGUAGE()));
480 }
481 taxonName.addNameTypeDesignation(typeName, null, null, null, null, false);
482
483 }
484
485 /**
486 * Currently only parses the collector, fieldNumber and the collection date.
487 *
488 * @param fieldUnitStr
489 * @param regNumber
490 * @param state
491 * @return null if the fieldUnitStr could not be parsed
492 */
493 protected FieldUnit parseFieldUnit(String fieldUnitStr, String regNumber, SimpleExcelTaxonImportState<CONFIG> state) {
494
495 FieldUnit fieldUnit = null;
496
497 Matcher m1 = collectorPattern.matcher(fieldUnitStr);
498 if(m1.matches()){
499
500 String collectorData = m1.group(2); // like ... (leg. Metzeltin, 30. 9. 1996)
501 String removal = m1.group(1);
502 if(collectorData == null){
503 collectorData = m1.group(4); // like ... leg. Metzeltin, 30. 9. 1996
504 removal = m1.group(3);
505 }
506 if(collectorData == null){
507 collectorData = m1.group(6); // like ^leg. J. J. Halda 18.3.1997$
508 removal = null;
509 }
510 if(collectorData == null){
511 return null;
512 }
513
514 // the fieldUnitStr is parsable
515 // remove all collectorData from the fieldUnitStr and use the rest as locality
516 String locality = null;
517 if(removal != null){
518 locality = fieldUnitStr.replace(removal, "");
519 }
520
521 String collectorStr = null;
522 String detailStr = null;
523 Partial date = null;
524 String fieldNumber = null;
525
526 Matcher m2 = collectionDataPattern.matcher(collectorData);
527 if(m2.matches()){
528 collectorStr = m2.group("collector");
529 detailStr = m2.group("detail");
530
531 // Try to make sense of the detailStr
532 if(detailStr != null){
533 detailStr = detailStr.trim();
534 // 1. try to parse as date
535 date = parseDate(regNumber, detailStr);
536 if(date == null){
537 // 2. try to parse as number
538 if(collectorsNumber.matcher(detailStr).matches()){
539 fieldNumber = detailStr;
540 }
541 }
542 }
543 if(date == null && fieldNumber == null){
544 // detailed parsing not possible, so need fo fallback
545 collectorStr = collectorData;
546 }
547 }
548
549 if(collectorStr == null) {
550 collectorStr = collectorData;
551 }
552
553 fieldUnit = FieldUnit.NewInstance();
554 GatheringEvent ge = GatheringEvent.NewInstance();
555 if(locality != null){
556 ge.setLocality(LanguageString.NewInstance(locality, Language.UNKNOWN_LANGUAGE()));
557 }
558
559 TeamOrPersonBase agent = state.getAgentBase(collectorStr);
560 if(agent == null) {
561 agent = Person.NewTitledInstance(collectorStr);
562 getAgentService().save(agent);
563 state.putAgentBase(collectorStr, agent);
564 }
565 ge.setCollector(agent);
566
567 if(date != null){
568 ge.setGatheringDate(date);
569 }
570
571 getEventBaseService().save(ge);
572 fieldUnit.setGatheringEvent(ge);
573
574 if(fieldNumber != null) {
575 fieldUnit.setFieldNumber(fieldNumber);
576 }
577 getOccurrenceService().save(fieldUnit);
578
579 }
580
581 return fieldUnit;
582 }
583
584 protected Partial parseDate(String regNumber, String dateStr) {
585
586 Partial pupDate = null;
587 boolean parseError = false;
588
589 String day = null;
590 String month = null;
591 String monthName = null;
592 String year = null;
593
594 for(Pattern p : datePatterns){
595 Matcher m2 = p.matcher(dateStr);
596 if(m2.matches()){
597 try {
598 year = m2.group("year");
599 } catch (IllegalArgumentException e){
600 // named capture group not found
601 }
602 try {
603 month = m2.group("month");
604 } catch (IllegalArgumentException e){
605 // named capture group not found
606 }
607
608 try {
609 monthName = m2.group("monthName");
610 month = monthFromName(monthName, regNumber);
611 if(month == null){
612 parseError = true;
613 }
614 } catch (IllegalArgumentException e){
615 // named capture group not found
616 }
617 try {
618 day = m2.group("day");
619 } catch (IllegalArgumentException e){
620 // named capture group not found
621 }
622
623 if(year != null){
624 if (year.length() == 2) {
625 // it is an abbreviated year from the 19** years
626 year = "19" + year;
627 }
628 break;
629 } else {
630 parseError = true;
631 }
632 }
633 }
634 if(year == null){
635 parseError = true;
636 }
637 List<DateTimeFieldType> types = new ArrayList<>();
638 List<Integer> values = new ArrayList<>();
639 if(!parseError) {
640 types.add(DateTimeFieldType.year());
641 values.add(Integer.parseInt(year));
642 if (month != null) {
643 types.add(DateTimeFieldType.monthOfYear());
644 values.add(Integer.parseInt(month));
645 }
646 if (day != null) {
647 types.add(DateTimeFieldType.dayOfMonth());
648 values.add(Integer.parseInt(day));
649 }
650 pupDate = new Partial(types.toArray(new DateTimeFieldType[types.size()]), ArrayUtils.toPrimitive(values.toArray(new Integer[values.size()])));
651 }
652 return pupDate;
653 }
654
655 private String monthFromName(String monthName, String regNumber) {
656
657 Integer month = monthFromNameMap.get(monthName.toLowerCase());
658 if(month == null){
659 logger.warn(csvReportLine(regNumber, "Unknown month name", monthName));
660 return null;
661 } else {
662 return month.toString();
663 }
664 }
665
666
667 private void addSpecimenTypes(BotanicalName taxonName, FieldUnit fieldUnit, String typeStr, TypesName typeName, boolean multiple, String regNumber, SpecimenOrObservationType specimenType){
668
669 if(StringUtils.isEmpty(typeStr)){
670 return;
671 }
672 typeStr = typeStr.trim().replaceAll("\\.$", "");
673
674 Collection collection = null;
675 DerivedUnit specimen = null;
676
677 List<DerivedUnit> specimens = new ArrayList<>();
678 if(multiple){
679 String[] tokens = typeStr.split("\\s?,\\s?");
680 for (String t : tokens) {
681 // command to list all complex parsabel types:
682 // csvcut -t -c RegistrationNo_Pk,Type iapt.csv | csvgrep -c Type -m "Holotype" | egrep -o 'Holotype:\s([A-Z]*\s)[^.]*?'
683 // csvcut -t -c RegistrationNo_Pk,Type iapt.csv | csvgrep -c Type -m "Holotype" | egrep -o 'Isotype[^:]*:\s([A-Z]*\s)[^.]*?'
684
685 if(!t.isEmpty()){
686 // trying to parse the string
687 specimen = parseSpecimenType(fieldUnit, typeName, collection, t, regNumber);
688 if(specimen != null){
689 specimens.add(specimen);
690 } else {
691 // parsing was not successful make simple specimen
692 specimens.add(makeSpecimenType(fieldUnit, t, specimenType));
693 }
694 }
695 }
696 } else {
697 specimen = parseSpecimenType(fieldUnit, typeName, collection, typeStr, regNumber);
698 if(specimen != null) {
699 specimens.add(specimen);
700 // remember current collection
701 collection = specimen.getCollection();
702 } else {
703 // parsing was not successful make simple specimen
704 specimens.add(makeSpecimenType(fieldUnit, typeStr, SpecimenOrObservationType.PreservedSpecimen));
705 }
706 }
707
708 for(DerivedUnit s : specimens){
709 taxonName.addSpecimenTypeDesignation(s, typeName.status(), null, null, null, false, true);
710 }
711 }
712
713 private DerivedUnit makeSpecimenType(FieldUnit fieldUnit, String titleCache, SpecimenOrObservationType specimenType) {
714 DerivedUnit specimen;DerivedUnitFacade facade = DerivedUnitFacade.NewInstance(specimenType, fieldUnit);
715 facade.setTitleCache(titleCache.trim(), true);
716 specimen = facade.innerDerivedUnit();
717 return specimen;
718 }
719
720 /**
721 *
722 * @param fieldUnit
723 * @param typeName
724 * @param collection
725 * @param text
726 * @param regNumber
727 * @return
728 */
729 protected DerivedUnit parseSpecimenType(FieldUnit fieldUnit, TypesName typeName, Collection collection, String text, String regNumber) {
730
731 DerivedUnit specimen = null;
732
733 String collectionCode = null;
734 String collectionTitle = null;
735 String subCollectionStr = null;
736 String instituteStr = null;
737 String accessionNumber = null;
738
739 boolean unusualAccessionNumber = false;
740
741 text = text.trim();
742
743 // 1. For Isotypes often the accession number is noted alone if the
744 // preceeding entry has a collection code.
745 if(typeName .equals(TypesName.isotype) && collection != null){
746 Matcher m = accessionNumberOnlyPattern.matcher(text);
747 if(m.matches()){
748 try {
749 accessionNumber = m.group("accNumber");
750 specimen = makeSpecimenType(fieldUnit, collection, accessionNumber);
751 } catch (IllegalArgumentException e){
752 // match group acc_number not found
753 }
754 }
755 }
756
757 //2. try it the 'normal' way
758 if(specimen == null) {
759 for (Pattern p : specimenTypePatterns) {
760 Matcher m = p.matcher(text);
761 if (m.matches()) {
762 // collection code or collectionTitle is mandatory
763 try {
764 collectionCode = m.group("colCode");
765 } catch (IllegalArgumentException e){
766 // match group colCode not found
767 }
768
769 try {
770 instituteStr = m.group("institute");
771 } catch (IllegalArgumentException e){
772 // match group col_name not found
773 }
774
775 try {
776 subCollectionStr = m.group("subCollection");
777 } catch (IllegalArgumentException e){
778 // match group subCollection not found
779 }
780 try {
781 accessionNumber = m.group("accNumber");
782
783 // try to improve the accessionNumber
784 if(accessionNumber!= null) {
785 accessionNumber = accessionNumber.trim();
786 Matcher m2 = accessionNumberOnlyPattern.matcher(accessionNumber);
787 String betterAccessionNumber = null;
788 if (m2.matches()) {
789 try {
790 betterAccessionNumber = m.group("accNumber");
791 } catch (IllegalArgumentException e) {
792 // match group acc_number not found
793 }
794 }
795 if (betterAccessionNumber != null) {
796 accessionNumber = betterAccessionNumber;
797 } else {
798 unusualAccessionNumber = true;
799 }
800 }
801
802 } catch (IllegalArgumentException e){
803 // match group acc_number not found
804 }
805
806 if(collectionCode == null && instituteStr == null){
807 logger.warn(csvReportLine(regNumber, "Type: neither 'collectionCode' nor 'institute' found in ", text));
808 continue;
809 }
810 collection = getCollection(collectionCode, instituteStr, subCollectionStr);
811 specimen = makeSpecimenType(fieldUnit, collection, accessionNumber);
812 break;
813 }
814 }
815 }
816 if(specimen == null) {
817 logger.warn(csvReportLine(regNumber, "Type: Could not parse specimen", typeName.name().toString(), text));
818 }
819 if(unusualAccessionNumber){
820 logger.warn(csvReportLine(regNumber, "Type: Unusual accession number", typeName.name().toString(), text, accessionNumber));
821 }
822 return specimen;
823 }
824
825 private DerivedUnit makeSpecimenType(FieldUnit fieldUnit, Collection collection, String accessionNumber) {
826
827 DerivedUnitFacade facade = DerivedUnitFacade.NewInstance(SpecimenOrObservationType.PreservedSpecimen, fieldUnit);
828 facade.setCollection(collection);
829 if(accessionNumber != null){
830 facade.setAccessionNumber(accessionNumber);
831 }
832 return facade.innerDerivedUnit();
833 }
834
835 private BotanicalName makeBotanicalName(SimpleExcelTaxonImportState<CONFIG> state, String regNumber, String titleCacheStr, String nameStr,
836 String authorStr, String nomRefTitle) {
837
838 BotanicalName taxonName;// cache field for the taxonName.titleCache
839 String taxonNameTitleCache = null;
840 Map<String, AnnotationType> nameAnnotations = new HashMap<>();
841
842 // TitleCache preprocessing
843 if(titleCacheStr.endsWith(ANNOTATION_MARKER_STRING) || (authorStr != null && authorStr.endsWith(ANNOTATION_MARKER_STRING))){
844 nameAnnotations.put("Author abbreviation not checked.", AnnotationType.EDITORIAL());
845 titleCacheStr = titleCacheStr.replace(ANNOTATION_MARKER_STRING, "").trim();
846 if(authorStr != null) {
847 authorStr = authorStr.replace(ANNOTATION_MARKER_STRING, "").trim();
848 }
849 }
850
851 // parse the full taxon name
852 if(!StringUtils.isEmpty(nomRefTitle)){
853 String referenceSeparator = nomRefTitle.startsWith("in ") ? " " : ", ";
854 String taxonFullNameStr = titleCacheStr + referenceSeparator + nomRefTitle;
855 logger.debug(":::::" + taxonFullNameStr);
856 taxonName = (BotanicalName) nameParser.parseReferencedName(taxonFullNameStr, NomenclaturalCode.ICNAFP, null);
857 } else {
858 taxonName = (BotanicalName) nameParser.parseFullName(titleCacheStr, NomenclaturalCode.ICNAFP, null);
859 }
860
861 taxonNameTitleCache = taxonName.getTitleCache().trim();
862 if (taxonName.isProtectedTitleCache()) {
863 logger.warn(csvReportLine(regNumber, "Name could not be parsed", titleCacheStr));
864 } else {
865
866 boolean doRestoreTitleCacheStr = false;
867
868 // Check if titleCache and nameCache are plausible
869 String titleCacheCompareStr = titleCacheStr;
870 String nameCache = taxonName.getNameCache();
871 String nameCompareStr = nameStr;
872 if(taxonName.isBinomHybrid()){
873 titleCacheCompareStr = titleCacheCompareStr.replace(" x ", " ×");
874 nameCompareStr = nameCompareStr.replace(" x ", " ×");
875 }
876 if(taxonName.isMonomHybrid()){
877 titleCacheCompareStr = titleCacheCompareStr.replaceAll("^X ", "× ");
878 nameCompareStr = nameCompareStr.replace("^X ", "× ");
879 }
880 if(authorStr != null && authorStr.contains(" et ")){
881 titleCacheCompareStr = titleCacheCompareStr.replaceAll(" et ", " & ");
882 }
883 if (!taxonNameTitleCache.equals(titleCacheCompareStr)) {
884 logger.warn(csvReportLine(regNumber, "The generated titleCache differs from the imported string", taxonNameTitleCache, " != ", titleCacheStr, " ==> original titleCacheStr has been restored"));
885 doRestoreTitleCacheStr = true;
886 }
887 if (!nameCache.trim().equals(nameCompareStr)) {
888 logger.warn(csvReportLine(regNumber, "The parsed nameCache differs from field '" + NAMESTRING + "'", nameCache, " != ", nameCompareStr));
889 }
890
891 // Author
892 //nameParser.handleAuthors(taxonName, titleCacheStr, authorStr);
893 //if (!titleCacheStr.equals(taxonName.getTitleCache())) {
894 // logger.warn(regNumber + ": titleCache has changed after setting authors, will restore original titleCacheStr");
895 // doRestoreTitleCacheStr = true;
896 //}
897
898 if(doRestoreTitleCacheStr){
899 taxonName.setTitleCache(titleCacheStr, true);
900 }
901
902 // deduplicate
903 replaceAuthorNamesAndNomRef(state, taxonName);
904 }
905
906 // Annotations
907 if(!nameAnnotations.isEmpty()){
908 for(String text : nameAnnotations.keySet()){
909 taxonName.addAnnotation(Annotation.NewInstance(text, nameAnnotations.get(text), Language.DEFAULT()));
910 }
911 }
912
913 taxonName.addSource(OriginalSourceType.Import, regNumber, null, state.getConfig().getSourceReference(), null);
914
915 getNameService().save(taxonName);
916
917 return taxonName;
918 }
919
920 /**
921 * @param state
922 * @return
923 */
924 private TaxonNode getClassificationRootNode(IAPTImportState state) {
925
926 // Classification classification = state.getClassification();
927 // if (classification == null){
928 // IAPTImportConfigurator config = state.getConfig();
929 // classification = Classification.NewInstance(state.getConfig().getClassificationName());
930 // classification.setUuid(config.getClassificationUuid());
931 // classification.setReference(config.getSecReference());
932 // classification = getClassificationService().find(state.getConfig().getClassificationUuid());
933 // }
934 TaxonNode rootNode = state.getRootNode();
935 if (rootNode == null){
936 rootNode = getTaxonNodeService().find(ROOT_UUID);
937 }
938 if (rootNode == null){
939 Classification classification = state.getClassification();
940 if (classification == null){
941 Reference sec = state.getSecReference();
942 String classificationName = state.getConfig().getClassificationName();
943 Language language = Language.DEFAULT();
944 classification = Classification.NewInstance(classificationName, sec, language);
945 state.setClassification(classification);
946 classification.setUuid(state.getConfig().getClassificationUuid());
947 classification.getRootNode().setUuid(ROOT_UUID);
948 getClassificationService().save(classification);
949 }
950 rootNode = classification.getRootNode();
951 state.setRootNode(rootNode);
952 }
953 return rootNode;
954 }
955
956 private Collection getCollection(String collectionCode, String instituteStr, String subCollectionStr){
957
958 Collection superCollection = null;
959 if(subCollectionStr != null){
960 superCollection = getCollection(collectionCode, instituteStr, null);
961 collectionCode = subCollectionStr;
962 instituteStr = null;
963 }
964
965 final String key = collectionCode + "-#i:" + StringUtils.defaultString(instituteStr);
966
967 Collection collection = collectionMap.get(key);
968
969 if(collection == null) {
970 collection = Collection.NewInstance();
971 collection.setCode(collectionCode);
972 if(instituteStr != null){
973 collection.setInstitute(Institution.NewNamedInstance(instituteStr));
974 }
975 if(superCollection != null){
976 collection.setSuperCollection(superCollection);
977 }
978 collectionMap.put(key, collection);
979 if(!_testMode) {
980 getCollectionService().save(collection);
981 }
982 }
983
984 return collection;
985 }
986
987
988 /**
989 * @param record
990 * @param originalKey
991 * @param doUnescapeHtmlEntities
992 * @return
993 */
994 private String getValue(HashMap<String, String> record, String originalKey, boolean doUnescapeHtmlEntities) {
995 String value = record.get(originalKey);
996
997 value = fixCharacters(value);
998
999 if (! StringUtils.isBlank(value)) {
1000 if (logger.isDebugEnabled()) {
1001 logger.debug(originalKey + ": " + value);
1002 }
1003 value = CdmUtils.removeDuplicateWhitespace(value.trim()).toString();
1004 if(doUnescapeHtmlEntities){
1005 value = StringEscapeUtils.unescapeHtml(value);
1006 }
1007 return value.trim();
1008 }else{
1009 return null;
1010 }
1011 }
1012
1013 /**
1014 * Fixes broken characters.
1015 * For details see
1016 * http://dev.e-taxonomy.eu/redmine/issues/6035
1017 *
1018 * @param value
1019 * @return
1020 */
1021 private String fixCharacters(String value) {
1022
1023 value = StringUtils.replace(value, "s$K", "š");
1024 value = StringUtils.replace(value, "n$K", "ň");
1025 value = StringUtils.replace(value, "e$K", "ě");
1026 value = StringUtils.replace(value, "r$K", "ř");
1027 value = StringUtils.replace(value, "c$K", "č");
1028 value = StringUtils.replace(value, "z$K", "ž");
1029 value = StringUtils.replace(value, "S>U$K", "Š");
1030 value = StringUtils.replace(value, "C>U$K", "Č");
1031 value = StringUtils.replace(value, "R>U$K", "Ř");
1032 value = StringUtils.replace(value, "Z>U$K", "Ž");
1033 value = StringUtils.replace(value, "g$K", "ǧ");
1034 value = StringUtils.replace(value, "s$A", "ś");
1035 value = StringUtils.replace(value, "n$A", "ń");
1036 value = StringUtils.replace(value, "c$A", "ć");
1037 value = StringUtils.replace(value, "e$E", "ę");
1038 value = StringUtils.replace(value, "o$H", "õ");
1039 value = StringUtils.replace(value, "s$C", "ş");
1040 value = StringUtils.replace(value, "t$C", "ț");
1041 value = StringUtils.replace(value, "S>U$C", "Ş");
1042 value = StringUtils.replace(value, "a$O", "å");
1043 value = StringUtils.replace(value, "A>U$O", "Å");
1044 value = StringUtils.replace(value, "u$O", "ů");
1045 value = StringUtils.replace(value, "g$B", "ğ");
1046 value = StringUtils.replace(value, "g$B", "ĕ");
1047 value = StringUtils.replace(value, "a$B", "ă");
1048 value = StringUtils.replace(value, "l$/", "ł");
1049 value = StringUtils.replace(value, ">i", "ı");
1050 value = StringUtils.replace(value, "i$U", "ï");
1051 // Special-cases
1052 value = StringUtils.replace(value, "&yacute", "ý");
1053 value = StringUtils.replace(value, ">L", "Ł"); // corrected rule
1054 value = StringUtils.replace(value, "E>U$D", "З");
1055 value = StringUtils.replace(value, "S>U$E", "Ş");
1056 value = StringUtils.replace(value, "s$E", "ş");
1057
1058 value = StringUtils.replace(value, "c$k", "č");
1059 value = StringUtils.replace(value, " U$K", " Š");
1060
1061 value = StringUtils.replace(value, "O>U>!", "Ø");
1062 value = StringUtils.replace(value, "o>!", "ø");
1063 value = StringUtils.replace(value, "S$K", "Ŝ");
1064 value = StringUtils.replace(value, ">l", "ğ");
1065
1066 value = StringUtils.replace(value, "§B>i", "ł");
1067
1068
1069
1070 return value;
1071 }
1072
1073
1074 /**
1075 * Stores taxa records in DB
1076 */
1077 @Override
1078 protected void firstPass(SimpleExcelTaxonImportState<CONFIG> state) {
1079
1080 String lineNumber = "L#" + state.getCurrentLine() + ": ";
1081 logger.setLevel(Level.DEBUG);
1082 HashMap<String, String> record = state.getOriginalRecord();
1083 logger.debug(lineNumber + record.toString());
1084
1085 Set<String> keys = record.keySet();
1086 for (String key: keys) {
1087 if (! expectedKeys.contains(key)){
1088 logger.warn(lineNumber + "Unexpected Key: " + key);
1089 }
1090 }
1091
1092 String reg_id = record.get(REGISTRATIONNO_PK);
1093
1094 //higherTaxon
1095 String higherTaxaString = record.get(HIGHERTAXON);
1096 boolean isFossil = false;
1097 if(higherTaxaString.startsWith("FOSSIL ")){
1098 higherTaxaString = higherTaxaString.replace("FOSSIL ", "");
1099 isFossil = true;
1100 }
1101 TaxonNode higherTaxon = getHigherTaxon(higherTaxaString, (IAPTImportState)state);
1102
1103 //Taxon
1104 Taxon taxon = makeTaxon(record, state, higherTaxon, isFossil);
1105 if (taxon == null){
1106 logger.warn(lineNumber + "taxon could not be created and is null");
1107 return;
1108 }
1109 ((IAPTImportState)state).setCurrentTaxon(taxon);
1110
1111 // Registration
1112 IAPTRegData regData = makeIAPTRegData(state);
1113 ObjectMapper mapper = new ObjectMapper();
1114 try {
1115 String regdataJson = mapper.writeValueAsString(regData);
1116 Extension.NewInstance(taxon.getName(), regdataJson, getExtensionTypeIAPTRegData());
1117 getNameService().save(taxon.getName());
1118 } catch (JsonProcessingException e) {
1119 logger.error("Error on converting IAPTRegData", e);
1120 }
1121
1122 logger.info("#of imported Genera: " + ((IAPTImportState) state).getGenusTaxonMap().size());
1123 return;
1124 }
1125
1126 private ExtensionType getExtensionTypeIAPTRegData() {
1127 if(extensionTypeIAPTRegData == null){
1128 extensionTypeIAPTRegData = ExtensionType.NewInstance("IAPTRegData.json", "IAPTRegData.json", "");
1129 getTermService().save(extensionTypeIAPTRegData);
1130 }
1131 return extensionTypeIAPTRegData;
1132 }
1133
1134 private IAPTRegData makeIAPTRegData(SimpleExcelTaxonImportState<CONFIG> state) {
1135
1136 HashMap<String, String> record = state.getOriginalRecord();
1137 String registrationStr = getValue(record, REGISTRATION);
1138 String regDateStr = getValue(record, REGDATE);
1139 String regStr = getValue(record, REGISTRATION, true);
1140
1141 String dateStr = null;
1142 String office = null;
1143 Integer regID = null;
1144 Integer formNo = null;
1145
1146 Matcher m = registrationPattern.matcher(registrationStr);
1147 if(m.matches()){
1148 dateStr = m.group("regdate");
1149 if(parseDate( regStr, dateStr) == null){
1150 // check for valid dates
1151 logger.warn(csvReportLine(regStr, REGISTRATION + ": could not parse date", dateStr, " in ", registrationStr));
1152 };
1153 office = m.group("office");
1154 regID = Integer.valueOf(m.group("regid"));
1155 try {
1156 formNo = Integer.valueOf(m.group("formNo"));
1157 } catch(IllegalArgumentException e){
1158 // ignore
1159 }
1160 } else {
1161 logger.warn(csvReportLine(regStr, REGISTRATION + ": could not be parsed", registrationStr));
1162 }
1163 IAPTRegData regData = new IAPTRegData(dateStr, office, regID, formNo);
1164 return regData;
1165 }
1166
1167 private TaxonNode getHigherTaxon(String higherTaxaString, IAPTImportState state) {
1168 String[] higherTaxaNames = higherTaxaString.toLowerCase().replaceAll("[\\[\\]]", "").split(":");
1169 TaxonNode higherTaxonNode = null;
1170
1171 ITaxonTreeNode rootNode = getClassificationRootNode(state);
1172 for (String htn : higherTaxaNames) {
1173 htn = StringUtils.capitalize(htn.trim());
1174 Taxon higherTaxon = state.getHigherTaxon(htn);
1175 if (higherTaxon != null){
1176 higherTaxonNode = higherTaxon.getTaxonNodes().iterator().next();
1177 }else{
1178 BotanicalName name = makeHigherTaxonName(state, htn);
1179 Reference sec = state.getSecReference();
1180 higherTaxon = Taxon.NewInstance(name, sec);
1181 getTaxonService().save(higherTaxon);
1182 higherTaxonNode = rootNode.addChildTaxon(higherTaxon, sec, null);
1183 state.putHigherTaxon(htn, higherTaxon);
1184 getClassificationService().saveTreeNode(higherTaxonNode);
1185 }
1186 rootNode = higherTaxonNode;
1187 }
1188 return higherTaxonNode;
1189 }
1190
1191 private BotanicalName makeHigherTaxonName(IAPTImportState state, String name) {
1192
1193 Rank rank = guessRank(name);
1194
1195 BotanicalName taxonName = BotanicalName.NewInstance(rank);
1196 taxonName.addSource(makeOriginalSource(state));
1197 taxonName.setGenusOrUninomial(StringUtils.capitalize(name));
1198 return taxonName;
1199 }
1200
1201 private Rank guessRank(String name) {
1202
1203 // normalize
1204 name = name.replaceAll("\\(.*\\)", "").trim();
1205
1206 if(name.matches("^Plantae$|^Fungi$")){
1207 return Rank.KINGDOM();
1208 } else if(name.matches("^Incertae sedis$|^No group assigned$")){
1209 return rankFamilyIncertisSedis();
1210 } else if(name.matches(".*phyta$|.*mycota$")){
1211 return Rank.PHYLUM();
1212 } else if(name.matches(".*phytina$|.*mycotina$")){
1213 return Rank.SUBPHYLUM();
1214 } else if(name.matches("Gymnospermae$|.*ones$")){ // Monocotyledones, Dicotyledones
1215 return rankUnrankedSupraGeneric();
1216 } else if(name.matches(".*opsida$|.*phyceae$|.*mycetes$|.*ones$|^Musci$|^Hepaticae$")){
1217 return Rank.CLASS();
1218 } else if(name.matches(".*idae$|.*phycidae$|.*mycetidae$")){
1219 return Rank.SUBCLASS();
1220 } else if(name.matches(".*ales$")){
1221 return Rank.ORDER();
1222 } else if(name.matches(".*ineae$")){
1223 return Rank.SUBORDER();
1224 } else if(name.matches(".*aceae$")){
1225 return Rank.FAMILY();
1226 } else if(name.matches(".*oideae$")){
1227 return Rank.SUBFAMILY();
1228 } else
1229 // if(name.matches(".*eae$")){
1230 // return Rank.TRIBE();
1231 // } else
1232 if(name.matches(".*inae$")){
1233 return Rank.SUBTRIBE();
1234 } else if(name.matches(".*ae$")){
1235 return Rank.FAMILY();
1236 }
1237 return Rank.UNKNOWN_RANK();
1238 }
1239
1240 private Rank rankUnrankedSupraGeneric() {
1241
1242 if(rankUnrankedSupraGeneric == null){
1243 rankUnrankedSupraGeneric = Rank.NewInstance(RankClass.Suprageneric, "Unranked supra generic", " ", " ");
1244 getTermService().save(rankUnrankedSupraGeneric);
1245 }
1246 return rankUnrankedSupraGeneric;
1247 }
1248
1249 private Rank rankFamilyIncertisSedis() {
1250
1251 if(familyIncertisSedis == null){
1252 familyIncertisSedis = Rank.NewInstance(RankClass.Suprageneric, "Family incertis sedis", " ", " ");
1253 getTermService().save(familyIncertisSedis);
1254 }
1255 return familyIncertisSedis;
1256 }
1257
1258 private AnnotationType annotationTypeCaveats(){
1259 if(annotationTypeCaveats == null){
1260 annotationTypeCaveats = AnnotationType.NewInstance("Caveats", "Caveats", "");
1261 getTermService().save(annotationTypeCaveats);
1262 }
1263 return annotationTypeCaveats;
1264 }
1265
1266
1267 /**
1268 * @param state
1269 * @return
1270 */
1271 private IdentifiableSource makeOriginalSource(IAPTImportState state) {
1272 return IdentifiableSource.NewDataImportInstance("line: " + state.getCurrentLine(), null, state.getConfig().getSourceReference());
1273 }
1274
1275
1276 private Reference makeReference(IAPTImportState state, UUID uuidRef) {
1277 Reference ref = state.getReference(uuidRef);
1278 if (ref == null){
1279 ref = getReferenceService().find(uuidRef);
1280 state.putReference(uuidRef, ref);
1281 }
1282 return ref;
1283 }
1284
1285 private MarkerType markerTypeFossil(){
1286 if(this.markerTypeFossil == null){
1287 markerTypeFossil = MarkerType.NewInstance("isFossilTaxon", "isFossil", null);
1288 getTermService().save(this.markerTypeFossil);
1289 }
1290 return markerTypeFossil;
1291 }
1292
1293 private String csvReportLine(String regId, String message, String ... fields){
1294 StringBuilder out = new StringBuilder("regID#");
1295 out.append(regId).append(",\"").append(message).append('"');
1296
1297 for(String f : fields){
1298 out.append(",\"").append(f).append('"');
1299 }
1300 return out.toString();
1301 }
1302
1303
1304 }