ref #6026 FieldUnit locality working
[cdmlib-apps.git] / app-import / src / main / java / eu / etaxonomy / cdm / io / iapt / IAPTExcelImport.java
1 /**
2 * Copyright (C) 2007 EDIT
3 * European Distributed Institute of Taxonomy
4 * http://www.e-taxonomy.eu
5 *
6 * The contents of this file are subject to the Mozilla Public License Version 1.1
7 * See LICENSE.TXT at the top of this package for the full license terms.
8 */
9
10 package eu.etaxonomy.cdm.io.iapt;
11
12 import eu.etaxonomy.cdm.api.facade.DerivedUnitFacade;
13 import eu.etaxonomy.cdm.common.CdmUtils;
14 import eu.etaxonomy.cdm.io.mexico.SimpleExcelTaxonImport;
15 import eu.etaxonomy.cdm.io.mexico.SimpleExcelTaxonImportState;
16 import eu.etaxonomy.cdm.model.agent.Institution;
17 import eu.etaxonomy.cdm.model.agent.Person;
18 import eu.etaxonomy.cdm.model.agent.TeamOrPersonBase;
19 import eu.etaxonomy.cdm.model.common.*;
20 import eu.etaxonomy.cdm.model.name.*;
21 import eu.etaxonomy.cdm.model.occurrence.*;
22 import eu.etaxonomy.cdm.model.occurrence.Collection;
23 import eu.etaxonomy.cdm.model.reference.Reference;
24 import eu.etaxonomy.cdm.model.reference.ReferenceType;
25 import eu.etaxonomy.cdm.model.taxon.*;
26 import eu.etaxonomy.cdm.strategy.parser.NonViralNameParserImpl;
27 import org.apache.commons.lang.ArrayUtils;
28 import org.apache.commons.lang.StringEscapeUtils;
29 import org.apache.commons.lang.StringUtils;
30 import org.apache.log4j.Level;
31 import org.apache.log4j.Logger;
32 import org.joda.time.DateTimeFieldType;
33 import org.joda.time.Partial;
34 import org.joda.time.format.DateTimeFormat;
35 import org.joda.time.format.DateTimeFormatter;
36 import org.springframework.stereotype.Component;
37
38 import java.util.*;
39 import java.util.regex.Matcher;
40 import java.util.regex.Pattern;
41
42 /**
43 * @author a.mueller
44 * @created 05.01.2016
45 */
46
47 @Component("iAPTExcelImport")
48 public class IAPTExcelImport<CONFIG extends IAPTImportConfigurator> extends SimpleExcelTaxonImport<CONFIG> {
49 private static final long serialVersionUID = -747486709409732371L;
50 private static final Logger logger = Logger.getLogger(IAPTExcelImport.class);
51 public static final String ANNOTATION_MARKER_STRING = "[*]";
52
53
54 private static UUID ROOT_UUID = UUID.fromString("4137fd2a-20f6-4e70-80b9-f296daf51d82");
55
56 private static NonViralNameParserImpl nameParser = NonViralNameParserImpl.NewInstance();
57
58 private final static String REGISTRATIONNO_PK= "RegistrationNo_Pk";
59 private final static String HIGHERTAXON= "HigherTaxon";
60 private final static String FULLNAME= "FullName";
61 private final static String AUTHORSSPELLING= "AuthorsSpelling";
62 private final static String LITSTRING= "LitString";
63 private final static String REGISTRATION= "Registration";
64 private final static String TYPE= "Type";
65 private final static String CAVEATS= "Caveats";
66 private final static String FULLBASIONYM= "FullBasionym";
67 private final static String FULLSYNSUBST= "FullSynSubst";
68 private final static String NOTESTXT= "NotesTxt";
69 private final static String REGDATE= "RegDate";
70 private final static String NAMESTRING= "NameString";
71 private final static String BASIONYMSTRING= "BasionymString";
72 private final static String SYNSUBSTSTR= "SynSubstStr";
73 private final static String AUTHORSTRING= "AuthorString";
74
75 private static List<String> expectedKeys= Arrays.asList(new String[]{
76 REGISTRATIONNO_PK, HIGHERTAXON, FULLNAME, AUTHORSSPELLING, LITSTRING, REGISTRATION, TYPE, CAVEATS, FULLBASIONYM, FULLSYNSUBST, NOTESTXT, REGDATE, NAMESTRING, BASIONYMSTRING, SYNSUBSTSTR, AUTHORSTRING});
77
/**
 * Splits a nomenclatural reference string into the named groups <code>title</code>,
 * <code>detail</code>, <code>date</code> and <code>issue</code>: everything up to the last ": " is
 * the title, the following run without '.' or ':' is the detail, the text after the next '.'
 * is the date and the trailing parenthesised text is the issue.
 * <p>
 * NOTE(review): the parenthesised issue part is not optional in this expression, so strings
 * without a trailing "(...)" do not match at all - confirm that this is intended.
 */
private static final Pattern nomRefTokenizeP = Pattern.compile("^(?<title>.*):\\s(?<detail>[^\\.:]+)\\.(?<date>.*?)(?:\\s\\((?<issue>[^\\)]*)\\)\\s*)\\.?$");
79 private static final Pattern[] datePatterns = new Pattern[]{
80 // NOTE:
81 // The order of the patterns is extremely important!!!
82 //
83 // all patterns cover the years 1700 - 1999
84 Pattern.compile("^(?<year>1[7,8,9][0-9]{2})$"), // only year, like '1969'
85 Pattern.compile("^(?<monthName>\\p{L}+\\.?)\\s(?<day>[0-9]{1,2})(?:st|rd|th)?\\.?,?\\s(?<year>(?:1[7,8,9])?[0-9]{2})$"), // full date like April 12, 1969 or april 12th 1999
86 Pattern.compile("^(?<monthName>\\p{L}+\\.?),?\\s?(?<year>(?:1[7,8,9])?[0-9]{2})$"), // April 99 or April, 1999 or Apr. 12
87 Pattern.compile("^(?<day>[0-9]{1,2})([\\.\\-/])(\\s?)(?<month>[0-1]?[0-9])\\2\\3(?<year>(?:1[7,8,9])?[0-9]{2})$"), // full date like 12.04.1969 or 12. 04. 1969 or 12/04/1969 or 12-04-1969
88 Pattern.compile("^(?<day>[0-9]{1,2})([\\.\\-/])(?<month>[IVX]{1,2})\\2(?<year>(?:1[7,8,9])?[0-9]{2})$"), // full date like 12-VI-1969
89 Pattern.compile("^(?:(?<day>[0-9]{1,2})(?:\\sde)\\s)(?<monthName>\\p{L}+)\\sde\\s(?<year>(?:1[7,8,9])?[0-9]{2})$"), // full and partial date like 12 de Enero de 1999 or Enero de 1999
90 Pattern.compile("^(?<month>[0-1]?[0-9])([\\.\\-/])(?<year>(?:1[7,8,9])?[0-9]{2})$"), // partial date like 04.1969 or 04/1969 or 04-1969
91 Pattern.compile("^(?<year>(?:1[7,8,9])?[0-9]{2})([\\.\\-/])(?<month>[0-1]?[0-9])$"),// partial date like 1999-04
92 Pattern.compile("^(?<month>[IVX]{1,2})([\\.\\-/])(?<year>(?:1[7,8,9])?[0-9]{2})$"), // partial date like VI-1969
93 Pattern.compile("^(?<day>[0-9]{1,2})(?:[\\./]|th|rd|st)?\\s(?<monthName>\\p{L}+\\.?),?\\s?(?<year>(?:1[7,8,9])?[0-9]{2})$"), // full date like 12. April 1969 or april 1999 or 22 Dec.1999
94 };
/**
 * Splits the content of the 'Type' column into up to three parts. The named groups
 * <code>fieldUnit</code>, <code>holotype</code> and <code>isotype</code> carry the same names as
 * the constants of the TypesName enum, which are used to read them. The text after "Type: " is
 * the field unit part; the "Holotype:" and "Isotype...:" parts are both optional.
 */
private static final Pattern typeSplitPattern = Pattern.compile("^(?:\"*[Tt]ype: (?<fieldUnit>.*?))(?:[Hh]olotype:(?<holotype>.*?)\\.?)?(?:[Ii]sotype[^:]*:(?<isotype>.*)\\.?)?\\.?$");

// Finds the collector information in the field unit string. Two alternatives:
//  - a parenthesised "(leg. ...)" at the very end: groups fullStr1 (1) and data1 (2)
//  - a trailing " leg. ..." without parentheses:   groups fullStr2 (3) and data2 (4)
private static final Pattern collectorPattern = Pattern.compile(".*?(?<fullStr1>\\(leg\\.\\s+(?<data1>[^\\)]*)\\))|.*?(?<fullStr2>\\sleg\\.\\s+(?<data2>.*?)\\.?)$");
// Splits the collector information at the first comma into 'collector' and 'detail'
private static final Pattern collectionDataPattern = Pattern.compile("^(?<collector>[^,]*),\\s?(?<detail>.*?)\\.?$");
// A detail string starting with "No. " or "no. " is taken as the collectors field number
private static final Pattern collectorsNumber = Pattern.compile("^([nN]o\\.\\s.*)$");

// AccessionNumbers: , #.*, n°:?, 96/3293, No..*, -?\w{1,3}-[0-9\-/]*
// Matches a text that consists of nothing but an accession number: an optional prefix
// ("n°", "#", "No.") followed by word characters, '-' and '/' only. Note that the empty
// string and plain upper case codes match as well.
private static final Pattern accessionNumberOnlyPattern = Pattern.compile("^(?<accNumber>(?:n°\\:?\\s?|#|No\\.?\\s?)?[\\d\\w\\-/]*)$");

/**
 * Formats of a single specimen type entry, tried in the given order. Depending on the format
 * the named groups <code>colCode</code>, <code>institute</code>, <code>subCollection</code> and
 * <code>accNumber</code> are available.
 */
private static final Pattern[] specimenTypePatterns = new Pattern[]{
    Pattern.compile("^(?<colCode>[A-Z]+|CPC Micropaleontology Lab\\.?)\\s+(?:\\((?<institute>.*[^\\)])\\))(?<accNumber>.*)?$"), // like: GAUF (Gansu Agricultural University) No. 1207-1222
    Pattern.compile("^(?<colCode>[A-Z]+|CPC Micropaleontology Lab\\.?)\\s+(?:Coll\\.\\s(?<subCollection>[^\\.,;]*)(.))(?<accNumber>.*)?$"), // like KASSEL Coll. Krasske, Praep. DII 78
    Pattern.compile("^(?:Coll\\.\\s(?<subCollection>[^\\.,;]*)(.))(?<institute>.*?)(?<accNumber>Praep\\..*)?$"), // like Coll. Lange-Bertalot, Bot. Inst., Univ. Frankfurt/Main, Germany Praep. Neukaledonien OTL 62
    Pattern.compile("^(?<colCode>[A-Z]+)(?:\\s+(?<accNumber>.*))?$"), // identifies the Collection code and takes the rest as accessionNumber if any
};
110
111 private static Map<String, Integer> monthFromNameMap = new HashMap<>();
112
113 static {
114 String[] ck = new String[]{"leden", "únor", "březen", "duben", "květen", "červen", "červenec ", "srpen", "září", "říjen", "listopad", "prosinec"};
115 String[] fr = new String[]{"janvier", "février", "mars", "avril", "mai", "juin", "juillet", "août", "septembre", "octobre", "novembre", "décembre"};
116 String[] de = new String[]{"januar", "februar", "märz", "april", "mai", "juni", "juli", "august", "september", "oktober", "november", "dezember"};
117 String[] en = new String[]{"january", "february", "march", "april", "may", "june", "july", "august", "september", "october", "november", "december"};
118 String[] it = new String[]{"gennaio", "febbraio", "marzo", "aprile", "maggio", "giugno", "luglio", "agosto", "settembre", "ottobre", "novembre", "dicembre"};
119 String[] sp = new String[]{"enero", "febrero", "marzo", "abril", "mayo", "junio", "julio", "agosto", "septiembre", "octubre", "noviembre", "diciembre"};
120 String[] de_abbrev = new String[]{"jan.", "feb.", "märz", "apr.", "mai", "jun.", "jul.", "aug.", "sept.", "okt.", "nov.", "dez."};
121 String[] en_abbrev = new String[]{"jan.", "feb.", "mar.", "apr.", "may", "jun.", "jul.", "aug.", "sep.", "oct.", "nov.", "dec."};
122 String[] port = new String[]{"Janeiro", "Fevereiro", "Março", "Abril", "Maio", "Junho", "Julho", "Agosto", "Setembro", "Outubro", "Novembro", "Dezembro"};
123 String[] rom_num = new String[]{"i", "ii", "iii", "iv", "v", "vi", "vii", "viii", "ix", "x", "xi", "xii"};
124
125 String[][] perLang = new String[][]{ck, de, fr, en, it, sp, port, de_abbrev, en_abbrev, rom_num};
126
127 for (String[] months: perLang) {
128 for(int m = 1; m < 13; m++){
129 monthFromNameMap.put(months[m - 1].toLowerCase(), m);
130 }
131 }
132
133 // special cases
134 monthFromNameMap.put("mar", 3);
135 monthFromNameMap.put("dec", 12);
136 monthFromNameMap.put("Februari", 2);
137 }
138
139
// Formats a date as four digit year; used when the title of the nomenclatural reference is rebuilt
DateTimeFormatter formatterYear = DateTimeFormat.forPattern("yyyy");

// Collections created so far by this importer instance, keyed by collection code and institute name
private Map<String, Collection> collectionMap = new HashMap<>();
143
144
145 enum TypesName {
146 fieldUnit, holotype, isotype;
147
148 public SpecimenTypeDesignationStatus status(){
149 switch (this) {
150 case holotype:
151 return SpecimenTypeDesignationStatus.HOLOTYPE();
152 case isotype:
153 return SpecimenTypeDesignationStatus.ISOTYPE();
154 default:
155 return null;
156 }
157 }
158 }
159
// Terms needed by the import, all initially null.
// NOTE(review): presumably filled lazily by accessor methods such as markerTypeFossil() and
// annotationTypeCaveats(), which are called in this class but defined outside of this section - confirm.
private MarkerType markerTypeFossil = null;
private Rank rankUnrankedSupraGeneric = null;
private Rank familyIncertisSedis = null;
private AnnotationType annotationTypeCaveats = null;
164
165 private Taxon makeTaxon(HashMap<String, String> record, SimpleExcelTaxonImportState<CONFIG> state,
166 TaxonNode higherTaxonNode, boolean isFossil) {
167
168 String regNumber = getValue(record, REGISTRATIONNO_PK, false);
169 String regStr = getValue(record, REGISTRATION, true);
170 String titleCacheStr = getValue(record, FULLNAME, true);
171 String nameStr = getValue(record, NAMESTRING, true);
172 String authorStr = getValue(record, AUTHORSTRING, true);
173 String nomRefStr = getValue(record, LITSTRING, true);
174 String authorsSpelling = getValue(record, AUTHORSSPELLING, true);
175 String notesTxt = getValue(record, NOTESTXT, true);
176 String caveats = getValue(record, CAVEATS, true);
177 String fullSynSubstStr = getValue(record, FULLSYNSUBST, true);
178 String fullBasionymStr = getValue(record, FULLBASIONYM, true);
179 String basionymNameStr = getValue(record, FULLBASIONYM, true);
180 String synSubstStr = getValue(record, SYNSUBSTSTR, true);
181 String typeStr = getValue(record, TYPE, true);
182
183
184 String nomRefTitle = null;
185 String nomRefDetail;
186 String nomRefPupDate = null;
187 String nomRefIssue = null;
188 Partial pupDate = null;
189
190 // preprocess nomRef: separate citation, reference detail, publishing date
191 if(!StringUtils.isEmpty(nomRefStr)){
192 nomRefStr = nomRefStr.trim();
193 Matcher m = nomRefTokenizeP.matcher(nomRefStr);
194 if(m.matches()){
195 nomRefTitle = m.group("title");
196 nomRefDetail = m.group("detail");
197 nomRefPupDate = m.group("date").trim();
198 nomRefIssue = m.group("issue");
199
200 pupDate = parseDate(regNumber, nomRefPupDate);
201 if (pupDate != null) {
202 nomRefTitle = nomRefTitle + ": " + nomRefDetail + ". " + pupDate.toString(formatterYear) + ".";
203 } else {
204 logger.warn(csvReportLine(regNumber, "Pub date", nomRefPupDate, "in", nomRefStr, "not parsable"));
205 }
206 } else {
207 nomRefTitle = nomRefStr;
208 }
209 }
210
211 BotanicalName taxonName = makeBotanicalName(state, regNumber, titleCacheStr, nameStr, authorStr, nomRefTitle);
212
213 // always add the original strings of parsed data as annotation
214 taxonName.addAnnotation(Annotation.NewInstance("imported and parsed data strings:" +
215 "\n - '" + LITSTRING + "': "+ nomRefStr +
216 "\n - '" + TYPE + "': " + typeStr +
217 "\n - '" + REGISTRATION + "': " + regStr
218 , AnnotationType.TECHNICAL(), Language.DEFAULT()));
219
220 if(pupDate != null) {
221 taxonName.getNomenclaturalReference().setDatePublished(TimePeriod.NewInstance(pupDate));
222 }
223 if(nomRefIssue != null) {
224 taxonName.getNomenclaturalReference().setType(ReferenceType.Book);
225 ((Reference)taxonName.getNomenclaturalReference()).setVolume(nomRefIssue);
226 }
227
228 if(!StringUtils.isEmpty(notesTxt)){
229 notesTxt = notesTxt.replace("Notes: ", "").trim();
230 taxonName.addAnnotation(Annotation.NewInstance(notesTxt, AnnotationType.EDITORIAL(), Language.DEFAULT()));
231 }
232 if(!StringUtils.isEmpty(caveats)){
233 caveats = caveats.replace("Caveats: ", "").trim();
234 taxonName.addAnnotation(Annotation.NewInstance(caveats, annotationTypeCaveats(), Language.DEFAULT()));
235 }
236
237 getNameService().save(taxonName);
238
239 // Namerelations
240 if(!StringUtils.isEmpty(authorsSpelling)){
241 authorsSpelling = authorsSpelling.replaceFirst("Author's spelling:", "").replaceAll("\"", "").trim();
242
243 String[] authorSpellingTokens = StringUtils.split(authorsSpelling, " ");
244 String[] nameStrTokens = StringUtils.split(nameStr, " ");
245
246 ArrayUtils.reverse(authorSpellingTokens);
247 ArrayUtils.reverse(nameStrTokens);
248
249 for (int i = 0; i < nameStrTokens.length; i++){
250 if(i < authorSpellingTokens.length){
251 nameStrTokens[i] = authorSpellingTokens[i];
252 }
253 }
254 ArrayUtils.reverse(nameStrTokens);
255
256 String misspelledNameStr = StringUtils.join (nameStrTokens, ' ');
257 // build the fullnameString of the misspelled name
258 misspelledNameStr = taxonName.getTitleCache().replace(nameStr, misspelledNameStr);
259
260 TaxonNameBase misspelledName = (BotanicalName) nameParser.parseReferencedName(misspelledNameStr, NomenclaturalCode.ICNAFP, null);
261 misspelledName.addRelationshipToName(taxonName, NameRelationshipType.MISSPELLING(), null);
262 getNameService().save(misspelledName);
263 }
264
265 // Replaced Synonyms
266 if(!StringUtils.isEmpty(fullSynSubstStr)){
267 fullSynSubstStr = fullSynSubstStr.replace("Syn. subst.: ", "");
268 BotanicalName replacedSynonymName = makeBotanicalName(state, regNumber, fullSynSubstStr, synSubstStr, null, null);
269 replacedSynonymName.addReplacedSynonym(taxonName, null, null, null);
270 getNameService().save(replacedSynonymName);
271 }
272
273 Reference sec = state.getConfig().getSecReference();
274 Taxon taxon = Taxon.NewInstance(taxonName, sec);
275
276 // Basionym
277 if(fullBasionymStr != null){
278 fullBasionymStr = fullBasionymStr.replaceAll("^\\w*:\\s", ""); // Strip off the leading 'Basionym: "
279 BotanicalName basionym = makeBotanicalName(state, regNumber, fullBasionymStr, basionymNameStr, null, null);
280 getNameService().save(basionym);
281 taxonName.addBasionym(basionym);
282
283 Synonym syn = Synonym.NewInstance(basionym, sec);
284 taxon.addSynonym(syn, SynonymRelationshipType.HOMOTYPIC_SYNONYM_OF());
285 getTaxonService().save(syn);
286 }
287
288 // Markers
289 if(isFossil){
290 taxon.addMarker(Marker.NewInstance(markerTypeFossil(), true));
291 }
292
293 // Types
294 if(!StringUtils.isEmpty(typeStr)){
295 makeTypeData(typeStr, taxonName, regNumber, state);
296 }
297
298 getTaxonService().save(taxon);
299 if(higherTaxonNode != null){
300 higherTaxonNode.addChildTaxon(taxon, null, null);
301 getTaxonNodeService().save(higherTaxonNode);
302 }
303
304 return taxon;
305 }
306
307 private void makeTypeData(String typeStr, BotanicalName taxonName, String regNumber, SimpleExcelTaxonImportState<CONFIG> state) {
308
309 Matcher m = typeSplitPattern.matcher(typeStr);
310
311 if(m.matches()){
312 String fieldUnitStr = m.group(TypesName.fieldUnit.name());
313 // boolean isFieldUnit = typeStr.matches(".*([°']|\\d+\\s?m\\s|\\d+\\s?km\\s).*"); // check for location or unit m, km // makes no sense!!!!
314 FieldUnit fieldUnit = parseFieldUnit(fieldUnitStr, regNumber, state);
315 if(fieldUnit == null) {
316 // create a field unit with only a titleCache using the fieldUnitStr substring
317 logger.warn(csvReportLine(regNumber, "Type: fieldUnitStr can not be parsed", fieldUnitStr));
318 fieldUnit = FieldUnit.NewInstance();
319 fieldUnit.setTitleCache(fieldUnitStr, true);
320 getOccurrenceService().save(fieldUnit);
321 }
322 getOccurrenceService().save(fieldUnit);
323
324 // all others ..
325 addSpecimenTypes(taxonName, fieldUnit, m.group(TypesName.holotype.name()), TypesName.holotype, false, regNumber);
326 addSpecimenTypes(taxonName, fieldUnit, m.group(TypesName.isotype.name()), TypesName.isotype, true, regNumber);
327
328 } else {
329 // create a field unit with only a titleCache using the full typeStr
330 FieldUnit fieldUnit = FieldUnit.NewInstance();
331 fieldUnit.setTitleCache(typeStr, true);
332 getOccurrenceService().save(fieldUnit);
333 logger.warn(csvReportLine(regNumber, "Type: field 'Type' can not be parsed", typeStr));
334 }
335 getNameService().save(taxonName);
336 }
337
338 /**
339 * Currently only parses the collector, fieldNumber and the collection date.
340 *
341 * @param fieldUnitStr
342 * @param regNumber
343 * @param state
344 * @return null if the fieldUnitStr could not be parsed
345 */
346 private FieldUnit parseFieldUnit(String fieldUnitStr, String regNumber, SimpleExcelTaxonImportState<CONFIG> state) {
347
348 FieldUnit fieldUnit = null;
349
350 Matcher m1 = collectorPattern.matcher(fieldUnitStr);
351 if(m1.matches()){
352
353 String collectorData = m1.group(2); // like (leg. Metzeltin, 30. 9. 1996)
354 String removal = m1.group(1);
355 if(collectorData == null){
356 collectorData = m1.group(4); // like leg. Metzeltin, 30. 9. 1996
357 removal = m1.group(3);
358 }
359 if(collectorData == null){
360 return null;
361 }
362
363 // the fieldUnitStr is parsable
364 // remove all collectorData from the fieldUnitStr and use the rest as locality
365 String locality = fieldUnitStr.replace(removal, "");
366
367 String collectorStr = null;
368 String detailStr = null;
369 Partial date = null;
370 String fieldNumber = null;
371
372 Matcher m2 = collectionDataPattern.matcher(collectorData);
373 if(m2.matches()){
374 collectorStr = m2.group("collector");
375 detailStr = m2.group("detail");
376
377 // Try to make sense of the detailStr
378 if(detailStr != null){
379 detailStr = detailStr.trim();
380 // 1. try to parse as date
381 date = parseDate(regNumber, detailStr);
382 if(date == null){
383 // 2. try to parse as number
384 if(collectorsNumber.matcher(detailStr).matches()){
385 fieldNumber = detailStr;
386 }
387 }
388 }
389 if(date == null && fieldNumber == null){
390 // detailed parsing not possible, so need fo fallback
391 collectorStr = collectorData;
392 }
393 }
394
395 if(collectorStr == null) {
396 collectorStr = collectorData;
397 }
398
399 fieldUnit = FieldUnit.NewInstance();
400 GatheringEvent ge = GatheringEvent.NewInstance();
401 ge.setLocality(LanguageString.NewInstance(locality, Language.UNKNOWN_LANGUAGE()));
402
403 TeamOrPersonBase agent = state.getAgentBase(collectorStr);
404 if(agent == null) {
405 agent = Person.NewTitledInstance(collectorStr);
406 getAgentService().save(agent);
407 state.putAgentBase(collectorStr, agent);
408 }
409 ge.setCollector(agent);
410
411 if(date != null){
412 ge.setGatheringDate(date);
413 }
414
415 getEventBaseService().save(ge);
416 fieldUnit.setGatheringEvent(ge);
417
418 if(fieldNumber != null) {
419 fieldUnit.setFieldNumber(fieldNumber);
420 }
421 getOccurrenceService().save(fieldUnit);
422
423 }
424
425 return fieldUnit;
426 }
427
428 private Partial parseDate(String regNumber, String dateStr) {
429
430 Partial pupDate = null;
431 boolean parseError = false;
432
433 String day = null;
434 String month = null;
435 String monthName = null;
436 String year = null;
437
438 for(Pattern p : datePatterns){
439 Matcher m2 = p.matcher(dateStr);
440 if(m2.matches()){
441 try {
442 year = m2.group("year");
443 } catch (IllegalArgumentException e){
444 // named capture group not found
445 }
446 try {
447 month = m2.group("month");
448 } catch (IllegalArgumentException e){
449 // named capture group not found
450 }
451
452 try {
453 monthName = m2.group("monthName");
454 month = monthFromName(monthName, regNumber);
455 if(month == null){
456 parseError = true;
457 }
458 } catch (IllegalArgumentException e){
459 // named capture group not found
460 }
461 try {
462 day = m2.group("day");
463 } catch (IllegalArgumentException e){
464 // named capture group not found
465 }
466
467 if(year != null){
468 if (year.length() == 2) {
469 // it is an abbreviated year from the 19** years
470 year = "19" + year;
471 }
472 break;
473 } else {
474 parseError = true;
475 }
476 }
477 }
478 if(year == null){
479 parseError = true;
480 }
481 List<DateTimeFieldType> types = new ArrayList<>();
482 List<Integer> values = new ArrayList<>();
483 if(!parseError) {
484 types.add(DateTimeFieldType.year());
485 values.add(Integer.parseInt(year));
486 if (month != null) {
487 types.add(DateTimeFieldType.monthOfYear());
488 values.add(Integer.parseInt(month));
489 }
490 if (day != null) {
491 types.add(DateTimeFieldType.dayOfMonth());
492 values.add(Integer.parseInt(day));
493 }
494 pupDate = new Partial(types.toArray(new DateTimeFieldType[types.size()]), ArrayUtils.toPrimitive(values.toArray(new Integer[values.size()])));
495 }
496 return pupDate;
497 }
498
499 private String monthFromName(String monthName, String regNumber) {
500
501 Integer month = monthFromNameMap.get(monthName.toLowerCase());
502 if(month == null){
503 logger.warn(csvReportLine(regNumber, "Unknown month name", monthName));
504 return null;
505 } else {
506 return month.toString();
507 }
508 }
509
510
511 private void addSpecimenTypes(BotanicalName taxonName, FieldUnit fieldUnit, String typeStr, TypesName typeName, boolean multiple, String regNumber){
512
513 if(StringUtils.isEmpty(typeStr)){
514 return;
515 }
516 typeStr = typeStr.trim().replaceAll("\\.$", "");
517
518 Collection collection = null;
519 DerivedUnit specimen = null;
520
521 List<DerivedUnit> specimens = new ArrayList<>();
522 if(multiple){
523 String[] tokens = typeStr.split("\\s?,\\s?");
524 for (String t : tokens) {
525 // command to list all complex parsabel types:
526 // csvcut -t -c RegistrationNo_Pk,Type iapt.csv | csvgrep -c Type -m "Holotype" | egrep -o 'Holotype:\s([A-Z]*\s)[^.]*?'
527 // csvcut -t -c RegistrationNo_Pk,Type iapt.csv | csvgrep -c Type -m "Holotype" | egrep -o 'Isotype[^:]*:\s([A-Z]*\s)[^.]*?'
528
529 if(!t.isEmpty()){
530 // trying to parse the string
531 specimen = parseSpecimenType(fieldUnit, typeName, collection, t, regNumber);
532 if(specimen != null){
533 specimens.add(specimen);
534 } else {
535 // parsing was not successful make simple specimen
536 specimens.add(makeSpecimenType(fieldUnit, t));
537 }
538 }
539 }
540 } else {
541 specimen = parseSpecimenType(fieldUnit, typeName, collection, typeStr, regNumber);
542 if(specimen != null) {
543 specimens.add(specimen);
544 // remember current collection
545 collection = specimen.getCollection();
546 } else {
547 // parsing was not successful make simple specimen
548 specimens.add(makeSpecimenType(fieldUnit, typeStr));
549 }
550 }
551
552 for(DerivedUnit s : specimens){
553 taxonName.addSpecimenTypeDesignation(s, typeName.status(), null, null, null, false, true);
554 }
555 }
556
557 private DerivedUnit makeSpecimenType(FieldUnit fieldUnit, String titleCache) {
558 DerivedUnit specimen;DerivedUnitFacade facade = DerivedUnitFacade.NewInstance(SpecimenOrObservationType.PreservedSpecimen, fieldUnit);
559 facade.setTitleCache(titleCache.trim(), true);
560 specimen = facade.innerDerivedUnit();
561 return specimen;
562 }
563
564 /**
565 *
566 * @param fieldUnit
567 * @param typeName
568 * @param collection
569 * @param text
570 * @param regNumber
571 * @return
572 */
573 private DerivedUnit parseSpecimenType(FieldUnit fieldUnit, TypesName typeName, Collection collection, String text, String regNumber) {
574
575 DerivedUnit specimen = null;
576
577 String collectionCode = null;
578 String subCollectionStr = null;
579 String instituteStr = null;
580 String accessionNumber = null;
581
582 boolean unusualAccessionNumber = false;
583
584 text = text.trim();
585
586 // 1. For Isotypes often the accession number is noted alone if the
587 // preceeding entry has a collection code.
588 if(typeName .equals(TypesName.isotype) && collection != null){
589 Matcher m = accessionNumberOnlyPattern.matcher(text);
590 if(m.matches()){
591 try {
592 accessionNumber = m.group("accNumber");
593 specimen = makeSpecimenType(fieldUnit, collection, accessionNumber);
594 } catch (IllegalArgumentException e){
595 // match group acc_number not found
596 }
597 }
598 }
599
600 //2. try it the 'normal' way
601 if(specimen == null) {
602 for (Pattern p : specimenTypePatterns) {
603 Matcher m = p.matcher(text);
604 if (m.matches()) {
605 // collection code is mandatory
606 try {
607 collectionCode = m.group("colCode");
608 } catch (IllegalArgumentException e){
609 // match group colCode not found
610 }
611 try {
612 subCollectionStr = m.group("subCollection");
613 } catch (IllegalArgumentException e){
614 // match group subCollection not found
615 }
616 try {
617 instituteStr = m.group("institute");
618 } catch (IllegalArgumentException e){
619 // match group col_name not found
620 }
621 try {
622 accessionNumber = m.group("accNumber");
623
624 // try to improve the accessionNumber
625 if(accessionNumber!= null) {
626 accessionNumber = accessionNumber.trim();
627 Matcher m2 = accessionNumberOnlyPattern.matcher(accessionNumber);
628 String betterAccessionNumber = null;
629 if (m2.matches()) {
630 try {
631 betterAccessionNumber = m.group("accNumber");
632 } catch (IllegalArgumentException e) {
633 // match group acc_number not found
634 }
635 }
636 if (betterAccessionNumber != null) {
637 accessionNumber = betterAccessionNumber;
638 } else {
639 unusualAccessionNumber = true;
640 }
641 }
642
643 } catch (IllegalArgumentException e){
644 // match group acc_number not found
645 }
646
647 if(collectionCode == null && instituteStr == null){
648 logger.warn(csvReportLine(regNumber, "Type: neither 'collectionCode' nor 'institute' found in ", text));
649 continue;
650 }
651 collection = getCollection(collectionCode, instituteStr, subCollectionStr);
652 specimen = makeSpecimenType(fieldUnit, collection, accessionNumber);
653 break;
654 }
655 }
656 }
657 if(specimen == null) {
658 logger.warn(csvReportLine(regNumber, "Type: Could not parse specimen", typeName.name().toString(), text));
659 }
660 if(unusualAccessionNumber){
661 logger.warn(csvReportLine(regNumber, "Type: Unusual accession number", typeName.name().toString(), text, accessionNumber));
662 }
663 return specimen;
664 }
665
666 private DerivedUnit makeSpecimenType(FieldUnit fieldUnit, Collection collection, String accessionNumber) {
667
668 DerivedUnitFacade facade = DerivedUnitFacade.NewInstance(SpecimenOrObservationType.PreservedSpecimen, fieldUnit);
669 facade.setCollection(collection);
670 if(accessionNumber != null){
671 facade.setAccessionNumber(accessionNumber);
672 }
673 return facade.innerDerivedUnit();
674 }
675
676 private BotanicalName makeBotanicalName(SimpleExcelTaxonImportState<CONFIG> state, String regNumber, String titleCacheStr, String nameStr,
677 String authorStr, String nomRefTitle) {
678
679 BotanicalName taxonName;// cache field for the taxonName.titleCache
680 String taxonNameTitleCache = null;
681 Map<String, AnnotationType> nameAnnotations = new HashMap<>();
682
683 // TitleCache preprocessing
684 if(titleCacheStr.endsWith(ANNOTATION_MARKER_STRING) || (authorStr != null && authorStr.endsWith(ANNOTATION_MARKER_STRING))){
685 nameAnnotations.put("Author abbreviation not checked.", AnnotationType.EDITORIAL());
686 titleCacheStr = titleCacheStr.replace(ANNOTATION_MARKER_STRING, "").trim();
687 if(authorStr != null) {
688 authorStr = authorStr.replace(ANNOTATION_MARKER_STRING, "").trim();
689 }
690 }
691
692 // parse the full taxon name
693 if(!StringUtils.isEmpty(nomRefTitle)){
694 String referenceSeparator = nomRefTitle.startsWith("in ") ? " " : ", ";
695 String taxonFullNameStr = titleCacheStr + referenceSeparator + nomRefTitle;
696 logger.debug(":::::" + taxonFullNameStr);
697 taxonName = (BotanicalName) nameParser.parseReferencedName(taxonFullNameStr, NomenclaturalCode.ICNAFP, null);
698 } else {
699 taxonName = (BotanicalName) nameParser.parseFullName(titleCacheStr, NomenclaturalCode.ICNAFP, null);
700 }
701
702 taxonNameTitleCache = taxonName.getTitleCache().trim();
703 if (taxonName.isProtectedTitleCache()) {
704 logger.warn(csvReportLine(regNumber, "Name could not be parsed", titleCacheStr));
705 } else {
706
707 boolean doRestoreTitleCacheStr = false;
708
709 // Check if titleCache and nameCache are plausible
710 String titleCacheCompareStr = titleCacheStr;
711 String nameCache = taxonName.getNameCache();
712 String nameCompareStr = nameStr;
713 if(taxonName.isBinomHybrid()){
714 titleCacheCompareStr = titleCacheCompareStr.replace(" x ", " ×");
715 nameCompareStr = nameCompareStr.replace(" x ", " ×");
716 }
717 if(taxonName.isMonomHybrid()){
718 titleCacheCompareStr = titleCacheCompareStr.replaceAll("^X ", "× ");
719 nameCompareStr = nameCompareStr.replace("^X ", "× ");
720 }
721 if(authorStr != null && authorStr.contains(" et ")){
722 titleCacheCompareStr = titleCacheCompareStr.replaceAll(" et ", " & ");
723 }
724 if (!taxonNameTitleCache.equals(titleCacheCompareStr)) {
725 logger.warn(csvReportLine(regNumber, "The generated titleCache differs from the imported string", taxonNameTitleCache, " != ", titleCacheStr, " ==> original titleCacheStr has been restored"));
726 doRestoreTitleCacheStr = true;
727 }
728 if (!nameCache.trim().equals(nameCompareStr)) {
729 logger.warn(csvReportLine(regNumber, "The parsed nameCache differs from field '" + NAMESTRING + "'", nameCache, " != ", nameCompareStr));
730 }
731
732 // Author
733 //nameParser.handleAuthors(taxonName, titleCacheStr, authorStr);
734 //if (!titleCacheStr.equals(taxonName.getTitleCache())) {
735 // logger.warn(regNumber + ": titleCache has changed after setting authors, will restore original titleCacheStr");
736 // doRestoreTitleCacheStr = true;
737 //}
738
739 if(doRestoreTitleCacheStr){
740 taxonName.setTitleCache(titleCacheStr, true);
741 }
742
743 // deduplicate
744 replaceAuthorNamesAndNomRef(state, taxonName);
745 }
746
747 // Annotations
748 if(!nameAnnotations.isEmpty()){
749 for(String text : nameAnnotations.keySet()){
750 taxonName.addAnnotation(Annotation.NewInstance(text, nameAnnotations.get(text), Language.DEFAULT()));
751 }
752 getNameService().save(taxonName);
753 }
754 return taxonName;
755 }
756
757 /**
758 * @param state
759 * @return
760 */
761 private TaxonNode getClassificationRootNode(IAPTImportState state) {
762
763 // Classification classification = state.getClassification();
764 // if (classification == null){
765 // IAPTImportConfigurator config = state.getConfig();
766 // classification = Classification.NewInstance(state.getConfig().getClassificationName());
767 // classification.setUuid(config.getClassificationUuid());
768 // classification.setReference(config.getSecReference());
769 // classification = getClassificationService().find(state.getConfig().getClassificationUuid());
770 // }
771 TaxonNode rootNode = state.getRootNode();
772 if (rootNode == null){
773 rootNode = getTaxonNodeService().find(ROOT_UUID);
774 }
775 if (rootNode == null){
776 Classification classification = state.getClassification();
777 if (classification == null){
778 Reference sec = state.getSecReference();
779 String classificationName = state.getConfig().getClassificationName();
780 Language language = Language.DEFAULT();
781 classification = Classification.NewInstance(classificationName, sec, language);
782 state.setClassification(classification);
783 classification.setUuid(state.getConfig().getClassificationUuid());
784 classification.getRootNode().setUuid(ROOT_UUID);
785 getClassificationService().save(classification);
786 }
787 rootNode = classification.getRootNode();
788 state.setRootNode(rootNode);
789 }
790 return rootNode;
791 }
792
793 private Collection getCollection(String collectionCode, String instituteStr, String subCollectionStr){
794
795 Collection superCollection = null;
796 if(subCollectionStr != null){
797 superCollection = getCollection(collectionCode, instituteStr, null);
798 collectionCode = subCollectionStr;
799 instituteStr = null;
800 }
801
802 final String key = collectionCode + "-#i:" + StringUtils.defaultString(instituteStr);
803
804 Collection collection = collectionMap.get(key);
805
806 if(collection == null) {
807 collection = Collection.NewInstance();
808 collection.setCode(collectionCode);
809 if(instituteStr != null){
810 collection.setInstitute(Institution.NewNamedInstance(instituteStr));
811 }
812 if(superCollection != null){
813 collection.setSuperCollection(superCollection);
814 }
815 collectionMap.put(key, collection);
816 getCollectionService().save(collection);
817 }
818
819 return collection;
820 }
821
822
823 /**
824 * @param record
825 * @param originalKey
826 * @param doUnescapeHtmlEntities
827 * @return
828 */
829 private String getValue(HashMap<String, String> record, String originalKey, boolean doUnescapeHtmlEntities) {
830 String value = record.get(originalKey);
831
832 value = fixCharacters(value);
833
834 if (! StringUtils.isBlank(value)) {
835 if (logger.isDebugEnabled()) {
836 logger.debug(originalKey + ": " + value);
837 }
838 value = CdmUtils.removeDuplicateWhitespace(value.trim()).toString();
839 if(doUnescapeHtmlEntities){
840 value = StringEscapeUtils.unescapeHtml(value);
841 }
842 return value.trim();
843 }else{
844 return null;
845 }
846 }
847
848 /**
849 * Fixes broken characters.
850 * For details see
851 * http://dev.e-taxonomy.eu/redmine/issues/6035
852 *
853 * @param value
854 * @return
855 */
856 private String fixCharacters(String value) {
857
858 value = StringUtils.replace(value, "s$K", "š");
859 value = StringUtils.replace(value, "n$K", "ň");
860 value = StringUtils.replace(value, "e$K", "ě");
861 value = StringUtils.replace(value, "r$K", "ř");
862 value = StringUtils.replace(value, "c$K", "č");
863 value = StringUtils.replace(value, "z$K", "ž");
864 value = StringUtils.replace(value, "S>U$K", "Š");
865 value = StringUtils.replace(value, "C>U$K", "Č");
866 value = StringUtils.replace(value, "R>U$K", "Ř");
867 value = StringUtils.replace(value, "Z>U$K", "Ž");
868 value = StringUtils.replace(value, "g$K", "ǧ");
869 value = StringUtils.replace(value, "s$A", "ś");
870 value = StringUtils.replace(value, "n$A", "ń");
871 value = StringUtils.replace(value, "c$A", "ć");
872 value = StringUtils.replace(value, "e$E", "ę");
873 value = StringUtils.replace(value, "o$H", "õ");
874 value = StringUtils.replace(value, "s$C", "ş");
875 value = StringUtils.replace(value, "t$C", "ț");
876 value = StringUtils.replace(value, "S>U$C", "Ş");
877 value = StringUtils.replace(value, "a$O", "å");
878 value = StringUtils.replace(value, "A>U$O", "Å");
879 value = StringUtils.replace(value, "u$O", "ů");
880 value = StringUtils.replace(value, "g$B", "ğ");
881 value = StringUtils.replace(value, "g$B", "ĕ");
882 value = StringUtils.replace(value, "a$B", "ă");
883 value = StringUtils.replace(value, "l$/", "ł");
884 value = StringUtils.replace(value, ">i", "ı");
885 value = StringUtils.replace(value, "i$U", "ï");
886 // Special-cases
887 value = StringUtils.replace(value, "&yacute", "ý");
888 value = StringUtils.replace(value, ">L", "Ł"); // corrected rule
889 value = StringUtils.replace(value, "E>U$D", "З");
890 value = StringUtils.replace(value, "S>U$E", "Ş");
891 value = StringUtils.replace(value, "s$E", "ş");
892
893 value = StringUtils.replace(value, "c$k", "č");
894 value = StringUtils.replace(value, " U$K", " Š");
895
896 return value;
897 }
898
899
900 /**
901 * Stores taxa records in DB
902 */
903 @Override
904 protected void firstPass(SimpleExcelTaxonImportState<CONFIG> state) {
905
906 String lineNumber = "L#" + state.getCurrentLine() + ": ";
907 logger.setLevel(Level.DEBUG);
908 HashMap<String, String> record = state.getOriginalRecord();
909 logger.debug(lineNumber + record.toString());
910
911 Set<String> keys = record.keySet();
912 for (String key: keys) {
913 if (! expectedKeys.contains(key)){
914 logger.warn(lineNumber + "Unexpected Key: " + key);
915 }
916 }
917
918 String reg_id = record.get(REGISTRATIONNO_PK);
919
920 //higherTaxon
921 String higherTaxaString = record.get(HIGHERTAXON);
922 boolean isFossil = false;
923 if(higherTaxaString.startsWith("FOSSIL ")){
924 higherTaxaString = higherTaxaString.replace("FOSSIL ", "");
925 isFossil = true;
926 }
927 TaxonNode higherTaxon = getHigherTaxon(higherTaxaString, (IAPTImportState)state);
928
929 //Taxon
930 Taxon taxon = makeTaxon(record, state, higherTaxon, isFossil);
931 if (taxon == null){
932 logger.warn(lineNumber + "taxon could not be created and is null");
933 return;
934 }
935 ((IAPTImportState)state).setCurrentTaxon(taxon);
936
937
938 return;
939 }
940
941 private TaxonNode getHigherTaxon(String higherTaxaString, IAPTImportState state) {
942 String[] higherTaxaNames = higherTaxaString.toLowerCase().replaceAll("[\\[\\]]", "").split(":");
943 TaxonNode higherTaxonNode = null;
944
945 ITaxonTreeNode rootNode = getClassificationRootNode(state);
946 for (String htn : higherTaxaNames) {
947 htn = StringUtils.capitalize(htn.trim());
948 Taxon higherTaxon = state.getHigherTaxon(htn);
949 if (higherTaxon != null){
950 higherTaxonNode = higherTaxon.getTaxonNodes().iterator().next();
951 }else{
952 BotanicalName name = makeHigherTaxonName(state, htn);
953 Reference sec = state.getSecReference();
954 higherTaxon = Taxon.NewInstance(name, sec);
955 getTaxonService().save(higherTaxon);
956 higherTaxonNode = rootNode.addChildTaxon(higherTaxon, sec, null);
957 state.putHigherTaxon(htn, higherTaxon);
958 getClassificationService().saveTreeNode(higherTaxonNode);
959 }
960 rootNode = higherTaxonNode;
961 }
962 return higherTaxonNode;
963 }
964
965 private BotanicalName makeHigherTaxonName(IAPTImportState state, String name) {
966
967 Rank rank = guessRank(name);
968
969 BotanicalName taxonName = BotanicalName.NewInstance(rank);
970 taxonName.addSource(makeOriginalSource(state));
971 taxonName.setGenusOrUninomial(StringUtils.capitalize(name));
972 return taxonName;
973 }
974
975 private Rank guessRank(String name) {
976
977 // normalize
978 name = name.replaceAll("\\(.*\\)", "").trim();
979
980 if(name.matches("^Plantae$|^Fungi$")){
981 return Rank.KINGDOM();
982 } else if(name.matches("^Incertae sedis$|^No group assigned$")){
983 return rankFamilyIncertisSedis();
984 } else if(name.matches(".*phyta$|.*mycota$")){
985 return Rank.SECTION_BOTANY();
986 } else if(name.matches(".*phytina$|.*mycotina$")){
987 return Rank.SUBSECTION_BOTANY();
988 } else if(name.matches("Gymnospermae$|.*ones$")){ // Monocotyledones, Dicotyledones
989 return rankUnrankedSupraGeneric();
990 } else if(name.matches(".*opsida$|.*phyceae$|.*mycetes$|.*ones$|^Musci$|^Hepaticae$")){
991 return Rank.CLASS();
992 } else if(name.matches(".*idae$|.*phycidae$|.*mycetidae$")){
993 return Rank.SUBCLASS();
994 } else if(name.matches(".*ales$")){
995 return Rank.ORDER();
996 } else if(name.matches(".*ineae$")){
997 return Rank.SUBORDER();
998 } else if(name.matches(".*aceae$")){
999 return Rank.FAMILY();
1000 } else if(name.matches(".*oideae$")){
1001 return Rank.SUBFAMILY();
1002 } else
1003 // if(name.matches(".*eae$")){
1004 // return Rank.TRIBE();
1005 // } else
1006 if(name.matches(".*inae$")){
1007 return Rank.SUBTRIBE();
1008 } else if(name.matches(".*ae$")){
1009 return Rank.FAMILY();
1010 }
1011 return Rank.UNKNOWN_RANK();
1012 }
1013
1014 private Rank rankUnrankedSupraGeneric() {
1015
1016 if(rankUnrankedSupraGeneric == null){
1017 rankUnrankedSupraGeneric = Rank.NewInstance(RankClass.Suprageneric, "Unranked supra generic", " ", " ");
1018 getTermService().save(rankUnrankedSupraGeneric);
1019 }
1020 return rankUnrankedSupraGeneric;
1021 }
1022
1023 private Rank rankFamilyIncertisSedis() {
1024
1025 if(familyIncertisSedis == null){
1026 familyIncertisSedis = Rank.NewInstance(RankClass.Suprageneric, "Family incertis sedis", " ", " ");
1027 getTermService().save(familyIncertisSedis);
1028 }
1029 return familyIncertisSedis;
1030 }
1031
1032 private AnnotationType annotationTypeCaveats(){
1033 if(annotationTypeCaveats == null){
1034 annotationTypeCaveats = AnnotationType.NewInstance("Caveats", "Caveats", "");
1035 getTermService().save(annotationTypeCaveats);
1036 }
1037 return annotationTypeCaveats;
1038 }
1039
1040
1041 /**
1042 * @param state
1043 * @return
1044 */
1045 private IdentifiableSource makeOriginalSource(IAPTImportState state) {
1046 return IdentifiableSource.NewDataImportInstance("line: " + state.getCurrentLine(), null, state.getConfig().getSourceReference());
1047 }
1048
1049
1050 private Reference makeReference(IAPTImportState state, UUID uuidRef) {
1051 Reference ref = state.getReference(uuidRef);
1052 if (ref == null){
1053 ref = getReferenceService().find(uuidRef);
1054 state.putReference(uuidRef, ref);
1055 }
1056 return ref;
1057 }
1058
1059 private MarkerType markerTypeFossil(){
1060 if(this.markerTypeFossil == null){
1061 markerTypeFossil = MarkerType.NewInstance("isFossilTaxon", "isFossil", null);
1062 getTermService().save(this.markerTypeFossil);
1063 }
1064 return markerTypeFossil;
1065 }
1066
1067 private String csvReportLine(String regId, String message, String ... fields){
1068 StringBuilder out = new StringBuilder("regID#");
1069 out.append(regId).append(",\"").append(message).append('"');
1070
1071 for(String f : fields){
1072 out.append(",\"").append(f).append('"');
1073 }
1074 return out.toString();
1075 }
1076
1077
1078 }