ref #6026 better type parsing
[cdmlib-apps.git] / app-import / src / main / java / eu / etaxonomy / cdm / io / iapt / IAPTExcelImport.java
1 /**
2 * Copyright (C) 2007 EDIT
3 * European Distributed Institute of Taxonomy
4 * http://www.e-taxonomy.eu
5 *
6 * The contents of this file are subject to the Mozilla Public License Version 1.1
7 * See LICENSE.TXT at the top of this package for the full license terms.
8 */
9
10 package eu.etaxonomy.cdm.io.iapt;
11
12 import eu.etaxonomy.cdm.api.facade.DerivedUnitFacade;
13 import eu.etaxonomy.cdm.common.CdmUtils;
14 import eu.etaxonomy.cdm.io.mexico.SimpleExcelTaxonImport;
15 import eu.etaxonomy.cdm.io.mexico.SimpleExcelTaxonImportState;
16 import eu.etaxonomy.cdm.model.agent.Institution;
17 import eu.etaxonomy.cdm.model.agent.Person;
18 import eu.etaxonomy.cdm.model.agent.TeamOrPersonBase;
19 import eu.etaxonomy.cdm.model.common.*;
20 import eu.etaxonomy.cdm.model.name.*;
21 import eu.etaxonomy.cdm.model.occurrence.*;
22 import eu.etaxonomy.cdm.model.occurrence.Collection;
23 import eu.etaxonomy.cdm.model.reference.Reference;
24 import eu.etaxonomy.cdm.model.taxon.*;
25 import eu.etaxonomy.cdm.strategy.parser.NonViralNameParserImpl;
26 import org.apache.commons.lang.ArrayUtils;
27 import org.apache.commons.lang.StringEscapeUtils;
28 import org.apache.commons.lang.StringUtils;
29 import org.apache.log4j.Level;
30 import org.apache.log4j.Logger;
31 import org.joda.time.DateTimeFieldType;
32 import org.joda.time.Partial;
33 import org.joda.time.format.DateTimeFormat;
34 import org.joda.time.format.DateTimeFormatter;
35 import org.springframework.stereotype.Component;
36
37 import java.util.*;
38 import java.util.regex.Matcher;
39 import java.util.regex.Pattern;
40
41 /**
42 * @author a.mueller
43 * @created 05.01.2016
44 */
45
46 @Component("iAPTExcelImport")
47 public class IAPTExcelImport<CONFIG extends IAPTImportConfigurator> extends SimpleExcelTaxonImport<CONFIG> {
private static final long serialVersionUID = -747486709409732371L;
private static final Logger logger = Logger.getLogger(IAPTExcelImport.class);
// Marker suffix in imported name/author strings. makeBotanicalName() strips it and
// adds the editorial annotation "Author abbreviation not checked." instead.
public static final String ANNOTATION_MARKER_STRING = "[*]";


// UUID used to find the classification root node, and assigned to the root node of a newly created classification.
private static UUID ROOT_UUID = UUID.fromString("4137fd2a-20f6-4e70-80b9-f296daf51d82");

// Parser used to turn the imported name strings into BotanicalName instances.
private static NonViralNameParserImpl nameParser = NonViralNameParserImpl.NewInstance();

// Keys under which the values of one import record are looked up (see getValue()).
private final static String REGISTRATIONNO_PK= "RegistrationNo_Pk";
private final static String HIGHERTAXON= "HigherTaxon";
private final static String FULLNAME= "FullName";
private final static String AUTHORSSPELLING= "AuthorsSpelling";
private final static String LITSTRING= "LitString";
private final static String REGISTRATION= "Registration";
private final static String TYPE= "Type";
private final static String CAVEATS= "Caveats";
private final static String FULLBASIONYM= "FullBasionym";
private final static String FULLSYNSUBST= "FullSynSubst";
private final static String NOTESTXT= "NotesTxt";
private final static String REGDATE= "RegDate";
private final static String NAMESTRING= "NameString";
private final static String BASIONYMSTRING= "BasionymString";
private final static String SYNSUBSTSTR= "SynSubstStr";
private final static String AUTHORSTRING= "AuthorString";

// List of all record keys declared above.
private static List<String> expectedKeys= Arrays.asList(new String[]{
        REGISTRATIONNO_PK, HIGHERTAXON, FULLNAME, AUTHORSSPELLING, LITSTRING, REGISTRATION, TYPE, CAVEATS, FULLBASIONYM, FULLSYNSUBST, NOTESTXT, REGDATE, NAMESTRING, BASIONYMSTRING, SYNSUBSTSTR, AUTHORSTRING});

// Splits a nomenclatural reference string of the form "<title>: <detail>.<date>[.]" into
// title (group 1), detail (group 2) and publication date (group 3); used in makeTaxon().
private static final Pattern nomRefTokenizeP = Pattern.compile("^(.*):\\s([^\\.:]+)\\.(.*?)\\.?$");
78 private static final Pattern[] datePatterns = new Pattern[]{
79 // NOTE:
80 // The order of the patterns is extremely important!!!
81 //
82 // all patterns cover the years 1700 - 1999
83 Pattern.compile("^(?<year>1[7,8,9][0-9]{2})$"), // only year, like '1969'
84 Pattern.compile("^(?<monthName>\\p{L}+\\.?)\\s(?<day>[0-9]{1,2})(?:st|rd|th)?\\.?,?\\s(?<year>(?:1[7,8,9])?[0-9]{2})$"), // full date like April 12, 1969 or april 12th 1999
85 Pattern.compile("^(?<monthName>\\p{L}+\\.?),?\\s?(?<year>(?:1[7,8,9])?[0-9]{2})$"), // April 99 or April, 1999 or Apr. 12
86 Pattern.compile("^(?<day>[0-9]{1,2})([\\.\\-/])(\\s?)(?<month>[0-1]?[0-9])\\2\\3(?<year>(?:1[7,8,9])?[0-9]{2})$"), // full date like 12.04.1969 or 12. 04. 1969 or 12/04/1969 or 12-04-1969
87 Pattern.compile("^(?<day>[0-9]{1,2})([\\.\\-/])(?<month>[IVX]{1,2})\\2(?<year>(?:1[7,8,9])?[0-9]{2})$"), // full date like 12-VI-1969
88 Pattern.compile("^(?:(?<day>[0-9]{1,2})(?:\\sde)\\s)(?<monthName>\\p{L}+)\\sde\\s(?<year>(?:1[7,8,9])?[0-9]{2})$"), // full and partial date like 12 de Enero de 1999 or Enero de 1999
89 Pattern.compile("^(?<month>[0-1]?[0-9])([\\.\\-/])(?<year>(?:1[7,8,9])?[0-9]{2})$"), // partial date like 04.1969 or 04/1969 or 04-1969
90 Pattern.compile("^(?<year>(?:1[7,8,9])?[0-9]{2})([\\.\\-/])(?<month>[0-1]?[0-9])$"),// partial date like 1999-04
91 Pattern.compile("^(?<month>[IVX]{1,2})([\\.\\-/])(?<year>(?:1[7,8,9])?[0-9]{2})$"), // partial date like VI-1969
92 Pattern.compile("^(?<day>[0-9]{1,2})(?:[\\./]|th|rd|st)?\\s(?<monthName>\\p{L}+\\.?),?\\s?(?<year>(?:1[7,8,9])?[0-9]{2})$"), // full date like 12. April 1969 or april 1999 or 22 Dec.1999
93 };
// Splits the type string into the part following 'Type: ' (group 'fieldUnit') and the optional
// 'Holotype:' and 'Isotype...:' parts (groups 'holotype' and 'isotype'). The group names equal
// the TypesName constants, which makeTypeData() uses to read the groups.
private static final Pattern typeSplitPattern = Pattern.compile("^(?:\"*[Tt]ype: (?<fieldUnit>.*?))(?:[Hh]olotype:(?<holotype>.*?)\\.?)?(?:[Ii]sotype[^:]*:(?<isotype>.*)\\.?)?\\.?$");

// Extracts the text following 'leg.': either inside parentheses (group 1) or up to the end of the string (group 2).
private static final Pattern collectorPattern = Pattern.compile(".*?\\(leg\\.\\s+([^\\)]*)\\)|.*?\\sleg\\.\\s+(.*?)\\.?$");
// Splits the text found by collectorPattern at the first comma into 'collector' and 'detail'.
private static final Pattern collectionDataPattern = Pattern.compile("^(?<collector>[^,]*),\\s?(?<detail>.*?)\\.?$");
// A detail string starting with 'No. ' or 'no. ' is taken as field number by parseFieldUnit().
private static final Pattern collectorsNumber = Pattern.compile("^([nN]o\\.\\s.*)$");

// AccessionNumbers: , #.*, n°:?, 96/3293, No..*, -?\w{1,3}-[0-9\-/]*
// Matches a string consisting only of an (optionally prefixed) accession number.
private static final Pattern accessionNumberOnlyPattern = Pattern.compile("^(?<accNumber>(?:n°\\:?\\s?|#|No\\.?\\s?)?[\\d\\w\\-/]*)$");

// Patterns tried in this order by parseSpecimenType(); the first matching pattern is used.
// Groups: 'colCode', 'institute', 'subCollection', 'accNumber' (not every pattern defines all of them).
private static final Pattern[] specimenTypePatterns = new Pattern[]{
        Pattern.compile("^(?<colCode>[A-Z]+|CPC Micropaleontology Lab\\.?)\\s+(?:\\((?<institute>.*[^\\)])\\))(?<accNumber>.*)?$"), // like: GAUF (Gansu Agricultural University) No. 1207-1222
        Pattern.compile("^(?<colCode>[A-Z]+|CPC Micropaleontology Lab\\.?)\\s+(?:Coll\\.\\s(?<subCollection>[^\\.,;]*)(.))(?<accNumber>.*)?$"), // like KASSEL Coll. Krasske, Praep. DII 78
        Pattern.compile("^(?:Coll\\.\\s(?<subCollection>[^\\.,;]*)(.))(?<institute>.*?)(?<accNumber>Praep\\..*)?$"), // like Coll. Lange-Bertalot, Bot. Inst., Univ. Frankfurt/Main, Germany Praep. Neukaledonien OTL 62
        Pattern.compile("^(?<colCode>[A-Z]+)(?:\\s+(?<accNumber>.*))?$"), // identifies the Collection code and takes the rest as accessionNumber if any
};

// Maps lower case month names, abbreviations and roman numerals to the month number (1 - 12);
// filled by the static initializer, read by monthFromName().
private static Map<String, Integer> monthFromNameMap = new HashMap<>();
111
112 static {
113 String[] ck = new String[]{"leden", "únor", "březen", "duben", "květen", "červen", "červenec ", "srpen", "září", "říjen", "listopad", "prosinec"};
114 String[] fr = new String[]{"janvier", "février", "mars", "avril", "mai", "juin", "juillet", "août", "septembre", "octobre", "novembre", "décembre"};
115 String[] de = new String[]{"januar", "februar", "märz", "april", "mai", "juni", "juli", "august", "september", "oktober", "november", "dezember"};
116 String[] en = new String[]{"january", "february", "march", "april", "may", "june", "july", "august", "september", "october", "november", "december"};
117 String[] it = new String[]{"gennaio", "febbraio", "marzo", "aprile", "maggio", "giugno", "luglio", "agosto", "settembre", "ottobre", "novembre", "dicembre"};
118 String[] sp = new String[]{"enero", "febrero", "marzo", "abril", "mayo", "junio", "julio", "agosto", "septiembre", "octubre", "noviembre", "diciembre"};
119 String[] de_abbrev = new String[]{"jan.", "feb.", "märz", "apr.", "mai", "jun.", "jul.", "aug.", "sept.", "okt.", "nov.", "dez."};
120 String[] en_abbrev = new String[]{"jan.", "feb.", "mar.", "apr.", "may", "jun.", "jul.", "aug.", "sep.", "oct.", "nov.", "dec."};
121 String[] port = new String[]{"Janeiro", "Fevereiro", "Março", "Abril", "Maio", "Junho", "Julho", "Agosto", "Setembro", "Outubro", "Novembro", "Dezembro"};
122 String[] rom_num = new String[]{"i", "ii", "iii", "iv", "v", "vi", "vii", "viii", "ix", "x", "xi", "xii"};
123
124 String[][] perLang = new String[][]{ck, de, fr, en, it, sp, port, de_abbrev, en_abbrev, rom_num};
125
126 for (String[] months: perLang) {
127 for(int m = 1; m < 13; m++){
128 monthFromNameMap.put(months[m - 1].toLowerCase(), m);
129 }
130 }
131
132 // special cases
133 monthFromNameMap.put("mar", 3);
134 monthFromNameMap.put("dec", 12);
135 monthFromNameMap.put("Februari", 2);
136 }
137
138
// Formats a date as four digit year; makeTaxon() uses it to append the publication year to the reference title.
DateTimeFormatter formatterYear = DateTimeFormat.forPattern("yyyy");

// Collections created by getCollection(), keyed by '<collection code>-#i:<institute>'.
private Map<String, Collection> collectionMap = new HashMap<>();
142
143
144 enum TypesName {
145 fieldUnit, holotype, isotype;
146
147 public SpecimenTypeDesignationStatus status(){
148 switch (this) {
149 case holotype:
150 return SpecimenTypeDesignationStatus.HOLOTYPE();
151 case isotype:
152 return SpecimenTypeDesignationStatus.ISOTYPE();
153 default:
154 return null;
155 }
156 }
157 }
158
// Term instances held per import instance, all initially null.
// NOTE(review): makeTaxon() obtains the fossil marker type and the caveats annotation type through
// the methods markerTypeFossil() and annotationTypeCaveats(); presumably those methods fill these
// fields on first use - confirm against their implementation.
private MarkerType markerTypeFossil = null;
private Rank rankUnrankedSupraGeneric = null;
private Rank familyIncertisSedis = null;
private AnnotationType annotationTypeCaveats = null;
163
164 private Taxon makeTaxon(HashMap<String, String> record, SimpleExcelTaxonImportState<CONFIG> state,
165 TaxonNode higherTaxonNode, boolean isFossil) {
166
167 String line = state.getCurrentLine() + ": ";
168
169 String regNumber = getValue(record, REGISTRATIONNO_PK, false);
170 String regStr = getValue(record, REGISTRATION, true);
171 String titleCacheStr = getValue(record, FULLNAME, true);
172 String nameStr = getValue(record, NAMESTRING, true);
173 String authorStr = getValue(record, AUTHORSTRING, true);
174 String nomRefStr = getValue(record, LITSTRING, true);
175 String authorsSpelling = getValue(record, AUTHORSSPELLING, true);
176 String notesTxt = getValue(record, NOTESTXT, true);
177 String caveats = getValue(record, CAVEATS, true);
178 String fullSynSubstStr = getValue(record, FULLSYNSUBST, true);
179 String synSubstStr = getValue(record, SYNSUBSTSTR, true);
180 String typeStr = getValue(record, TYPE, true);
181
182
183 String nomRefTitle = null;
184 String nomRefDetail;
185 String nomRefPupDate = null;
186 Partial pupDate = null;
187
188 // preprocess nomRef: separate citation, reference detail, publishing date
189 if(!StringUtils.isEmpty(nomRefStr)){
190 nomRefStr = nomRefStr.trim();
191 Matcher m = nomRefTokenizeP.matcher(nomRefStr);
192 if(m.matches()){
193 nomRefTitle = m.group(1);
194 nomRefDetail = m.group(2);
195 nomRefPupDate = m.group(3).trim();
196
197 pupDate = parseDate(regNumber, nomRefPupDate);
198 if (pupDate != null) {
199 nomRefTitle = nomRefTitle + ": " + nomRefDetail + ". " + pupDate.toString(formatterYear) + ".";
200 } else {
201 logger.warn(csvReportLine(regNumber, "Pub date", nomRefPupDate, "in", nomRefStr, "not parsable"));
202 }
203 } else {
204 nomRefTitle = nomRefStr;
205 }
206 }
207
208 BotanicalName taxonName = makeBotanicalName(state, regNumber, titleCacheStr, nameStr, authorStr, nomRefTitle);
209
210 // always add the original strings of parsed data as annotation
211 taxonName.addAnnotation(Annotation.NewInstance("imported and parsed data strings:" +
212 "\n - '" + LITSTRING + "': "+ nomRefStr +
213 "\n - '" + TYPE + "': " + typeStr +
214 "\n - '" + REGISTRATION + "': " + regStr
215 , AnnotationType.TECHNICAL(), Language.DEFAULT()));
216
217 if(pupDate != null) {
218 taxonName.getNomenclaturalReference().setDatePublished(TimePeriod.NewInstance(pupDate));
219 }
220
221 if(!StringUtils.isEmpty(notesTxt)){
222 notesTxt = notesTxt.replace("Notes: ", "").trim();
223 taxonName.addAnnotation(Annotation.NewInstance(notesTxt, AnnotationType.EDITORIAL(), Language.DEFAULT()));
224 }
225 if(!StringUtils.isEmpty(caveats)){
226 caveats = caveats.replace("Caveats: ", "").trim();
227 taxonName.addAnnotation(Annotation.NewInstance(caveats, annotationTypeCaveats(), Language.DEFAULT()));
228 }
229 //
230
231 // Namerelations
232 if(!StringUtils.isEmpty(authorsSpelling)){
233 authorsSpelling = authorsSpelling.replaceFirst("Author's spelling:", "").replaceAll("\"", "").trim();
234
235 String[] authorSpellingTokens = StringUtils.split(authorsSpelling, " ");
236 String[] nameStrTokens = StringUtils.split(nameStr, " ");
237
238 ArrayUtils.reverse(authorSpellingTokens);
239 ArrayUtils.reverse(nameStrTokens);
240
241 for (int i = 0; i < nameStrTokens.length; i++){
242 if(i < authorSpellingTokens.length){
243 nameStrTokens[i] = authorSpellingTokens[i];
244 }
245 }
246 ArrayUtils.reverse(nameStrTokens);
247
248 String misspelledNameStr = StringUtils.join (nameStrTokens, ' ');
249 // build the fullnameString of the misspelled name
250 misspelledNameStr = taxonName.getTitleCache().replace(nameStr, misspelledNameStr);
251
252 TaxonNameBase misspelledName = (BotanicalName) nameParser.parseReferencedName(misspelledNameStr, NomenclaturalCode.ICNAFP, null);
253 misspelledName.addRelationshipToName(taxonName, NameRelationshipType.MISSPELLING(), null);
254 getNameService().save(misspelledName);
255 }
256
257 // Replaced Synonyms
258 if(!StringUtils.isEmpty(fullSynSubstStr)){
259 fullSynSubstStr = fullSynSubstStr.replace("Syn. subst.: ", "");
260 BotanicalName replacedSynonymName = makeBotanicalName(state, regNumber, fullSynSubstStr, synSubstStr, null, null);
261 replacedSynonymName.addReplacedSynonym(taxonName, null, null, null);
262 getNameService().save(replacedSynonymName);
263 }
264
265 Reference sec = state.getConfig().getSecReference();
266 Taxon taxon = Taxon.NewInstance(taxonName, sec);
267
268 // Markers
269 if(isFossil){
270 taxon.addMarker(Marker.NewInstance(markerTypeFossil(), true));
271 }
272
273 // Types
274 if(!StringUtils.isEmpty(typeStr)){
275 makeTypeData(typeStr, taxonName, regNumber, state);
276 }
277
278 getTaxonService().save(taxon);
279 if(higherTaxonNode != null){
280 higherTaxonNode.addChildTaxon(taxon, null, null);
281 getTaxonNodeService().save(higherTaxonNode);
282 }
283
284 return taxon;
285
286 }
287
288 private void makeTypeData(String typeStr, BotanicalName taxonName, String regNumber, SimpleExcelTaxonImportState<CONFIG> state) {
289
290 Matcher m = typeSplitPattern.matcher(typeStr);
291
292 if(m.matches()){
293 String fieldUnitStr = m.group(TypesName.fieldUnit.name());
294 // boolean isFieldUnit = typeStr.matches(".*([°']|\\d+\\s?m\\s|\\d+\\s?km\\s).*"); // check for location or unit m, km // makes no sense!!!!
295 FieldUnit fieldUnit = parseFieldUnit(fieldUnitStr, regNumber, state);
296 if(fieldUnit == null) {
297 // create a field unit with only a titleCache using the fieldUnitStr substring
298 fieldUnit = FieldUnit.NewInstance();
299 fieldUnit.setTitleCache(fieldUnitStr, true);
300 getOccurrenceService().save(fieldUnit);
301 }
302 getOccurrenceService().save(fieldUnit);
303
304 // all others ..
305 addSpecimenTypes(taxonName, fieldUnit, m.group(TypesName.holotype.name()), TypesName.holotype, false, regNumber);
306 addSpecimenTypes(taxonName, fieldUnit, m.group(TypesName.isotype.name()), TypesName.isotype, true, regNumber);
307
308 } else {
309 // create a field unit with only a titleCache using the full typeStr
310 FieldUnit fieldUnit = FieldUnit.NewInstance();
311 fieldUnit.setTitleCache(typeStr, true);
312 getOccurrenceService().save(fieldUnit);
313 logger.warn(csvReportLine(regNumber, "Type field can not be parsed", typeStr));
314 }
315 getNameService().save(taxonName);
316 }
317
318 /**
319 * Currently only parses the collector, fieldNumber and the collection date.
320 *
321 * @param fieldUnitStr
322 * @param regNumber
323 * @param state
324 * @return null if the fieldUnitStr could not be parsed
325 */
326 private FieldUnit parseFieldUnit(String fieldUnitStr, String regNumber, SimpleExcelTaxonImportState<CONFIG> state) {
327
328 FieldUnit fieldUnit = null;
329
330 Matcher m1 = collectorPattern.matcher(fieldUnitStr);
331 if(m1.matches()){
332 String collectionData = m1.group(1); // like (leg. Metzeltin, 30. 9. 1996)
333 if(collectionData == null){
334 collectionData = m1.group(2); // like leg. Metzeltin, 30. 9. 1996
335 }
336 if(collectionData == null){
337 return null;
338 }
339
340 String collectorStr = null;
341 String detailStr = null;
342 Partial date = null;
343 String fieldNumber = null;
344
345 Matcher m2 = collectionDataPattern.matcher(collectionData);
346 if(m2.matches()){
347 collectorStr = m2.group("collector");
348 detailStr = m2.group("detail");
349
350 // Try to make sense of the detailStr
351 if(detailStr != null){
352 detailStr = detailStr.trim();
353 // 1. try to parse as date
354 date = parseDate(regNumber, detailStr);
355 if(date == null){
356 // 2. try to parse as number
357 if(collectorsNumber.matcher(detailStr).matches()){
358 fieldNumber = detailStr;
359 }
360 }
361 }
362 if(date == null && fieldNumber == null){
363 // detailed parsing not possible, so need fo fallback
364 collectorStr = collectionData;
365 }
366 }
367
368 if(collectorStr != null) {
369 fieldUnit = FieldUnit.NewInstance();
370 GatheringEvent ge = GatheringEvent.NewInstance();
371
372 TeamOrPersonBase agent = state.getAgentBase(collectorStr);
373 if(agent == null) {
374 agent = Person.NewTitledInstance(collectorStr);
375 getAgentService().save(agent);
376 state.putAgentBase(collectorStr, agent);
377 }
378 ge.setCollector(agent);
379
380 if(date != null){
381 ge.setGatheringDate(date);
382 }
383
384 getEventBaseService().save(ge);
385 fieldUnit.setGatheringEvent(ge);
386
387 if(fieldNumber != null) {
388 fieldUnit.setFieldNumber(fieldNumber);
389 }
390 getOccurrenceService().save(fieldUnit);
391 }
392 }
393
394 return fieldUnit;
395 }
396
397 private Partial parseDate(String regNumber, String dateStr) {
398
399 Partial pupDate = null;
400 boolean parseError = false;
401
402 String day = null;
403 String month = null;
404 String monthName = null;
405 String year = null;
406
407 for(Pattern p : datePatterns){
408 Matcher m2 = p.matcher(dateStr);
409 if(m2.matches()){
410 try {
411 year = m2.group("year");
412 } catch (IllegalArgumentException e){
413 // named capture group not found
414 }
415 try {
416 month = m2.group("month");
417 } catch (IllegalArgumentException e){
418 // named capture group not found
419 }
420
421 try {
422 monthName = m2.group("monthName");
423 month = monthFromName(monthName, regNumber);
424 if(month == null){
425 parseError = true;
426 }
427 } catch (IllegalArgumentException e){
428 // named capture group not found
429 }
430 try {
431 day = m2.group("day");
432 } catch (IllegalArgumentException e){
433 // named capture group not found
434 }
435
436 if(year != null){
437 if (year.length() == 2) {
438 // it is an abbreviated year from the 19** years
439 year = "19" + year;
440 }
441 break;
442 } else {
443 parseError = true;
444 }
445 }
446 }
447 if(year == null){
448 parseError = true;
449 }
450 List<DateTimeFieldType> types = new ArrayList<>();
451 List<Integer> values = new ArrayList<>();
452 if(!parseError) {
453 types.add(DateTimeFieldType.year());
454 values.add(Integer.parseInt(year));
455 if (month != null) {
456 types.add(DateTimeFieldType.monthOfYear());
457 values.add(Integer.parseInt(month));
458 }
459 if (day != null) {
460 types.add(DateTimeFieldType.dayOfMonth());
461 values.add(Integer.parseInt(day));
462 }
463 pupDate = new Partial(types.toArray(new DateTimeFieldType[types.size()]), ArrayUtils.toPrimitive(values.toArray(new Integer[values.size()])));
464 }
465 return pupDate;
466 }
467
468 private String monthFromName(String monthName, String regNumber) {
469
470 Integer month = monthFromNameMap.get(monthName.toLowerCase());
471 if(month == null){
472 logger.warn(csvReportLine(regNumber, "Unknown month name", monthName));
473 return null;
474 } else {
475 return month.toString();
476 }
477 }
478
479
480 private void addSpecimenTypes(BotanicalName taxonName, FieldUnit fieldUnit, String typeStr, TypesName typeName, boolean multiple, String regNumber){
481
482 if(StringUtils.isEmpty(typeStr)){
483 return;
484 }
485 typeStr = typeStr.trim().replaceAll("\\.$", "");
486
487 Collection collection = null;
488 DerivedUnit specimen = null;
489
490 List<DerivedUnit> specimens = new ArrayList<>();
491 if(multiple){
492 String[] tokens = typeStr.split("\\s?,\\s?");
493 for (String t : tokens) {
494 // command to list all complex parsabel types:
495 // csvcut -t -c RegistrationNo_Pk,Type iapt.csv | csvgrep -c Type -m "Holotype" | egrep -o 'Holotype:\s([A-Z]*\s)[^.]*?'
496 // csvcut -t -c RegistrationNo_Pk,Type iapt.csv | csvgrep -c Type -m "Holotype" | egrep -o 'Isotype[^:]*:\s([A-Z]*\s)[^.]*?'
497
498 if(!t.isEmpty()){
499 // trying to parse the string
500 specimen = parseSpecimenType(fieldUnit, typeName, collection, t, regNumber);
501 if(specimen != null){
502 specimens.add(specimen);
503 } else {
504 // parsing was not successful make simple specimen
505 specimens.add(makeSpecimenType(fieldUnit, t));
506 }
507 }
508 }
509 } else {
510 specimen = parseSpecimenType(fieldUnit, typeName, collection, typeStr, regNumber);
511 if(specimen != null) {
512 specimens.add(specimen);
513 // remember current collection
514 collection = specimen.getCollection();
515 } else {
516 // parsing was not successful make simple specimen
517 specimens.add(makeSpecimenType(fieldUnit, typeStr));
518 }
519 }
520
521 for(DerivedUnit s : specimens){
522 taxonName.addSpecimenTypeDesignation(s, typeName.status(), null, null, null, false, true);
523 }
524 }
525
526 private DerivedUnit makeSpecimenType(FieldUnit fieldUnit, String titleCache) {
527 DerivedUnit specimen;DerivedUnitFacade facade = DerivedUnitFacade.NewInstance(SpecimenOrObservationType.PreservedSpecimen, fieldUnit);
528 facade.setTitleCache(titleCache.trim(), true);
529 specimen = facade.innerDerivedUnit();
530 return specimen;
531 }
532
533 /**
534 *
535 * @param fieldUnit
536 * @param typeName
537 * @param collection
538 * @param text
539 * @param regNumber
540 * @return
541 */
542 private DerivedUnit parseSpecimenType(FieldUnit fieldUnit, TypesName typeName, Collection collection, String text, String regNumber) {
543
544 DerivedUnit specimen = null;
545
546 String collectionCode = null;
547 String subCollectionStr = null;
548 String instituteStr = null;
549 String accessionNumber = null;
550
551 boolean unusualAccessionNumber = false;
552
553 text = text.trim();
554
555 // 1. For Isotypes often the accession number is noted alone if the
556 // preceeding entry has a collection code.
557 if(typeName .equals(TypesName.isotype) && collection != null){
558 Matcher m = accessionNumberOnlyPattern.matcher(text);
559 if(m.matches()){
560 try {
561 accessionNumber = m.group("accNumber");
562 specimen = makeSpecimenType(fieldUnit, collection, accessionNumber);
563 } catch (IllegalArgumentException e){
564 // match group acc_number not found
565 }
566 }
567 }
568
569 //2. try it the 'normal' way
570 if(specimen == null) {
571 for (Pattern p : specimenTypePatterns) {
572 Matcher m = p.matcher(text);
573 if (m.matches()) {
574 // collection code is mandatory
575 try {
576 collectionCode = m.group("colCode");
577 } catch (IllegalArgumentException e){
578 // match group colCode not found
579 }
580 try {
581 subCollectionStr = m.group("subCollection");
582 } catch (IllegalArgumentException e){
583 // match group subCollection not found
584 }
585 try {
586 instituteStr = m.group("institute");
587 } catch (IllegalArgumentException e){
588 // match group col_name not found
589 }
590 try {
591 accessionNumber = m.group("accNumber");
592
593 // try to improve the accessionNumber
594 if(accessionNumber!= null) {
595 accessionNumber = accessionNumber.trim();
596 Matcher m2 = accessionNumberOnlyPattern.matcher(accessionNumber);
597 String betterAccessionNumber = null;
598 if (m2.matches()) {
599 try {
600 betterAccessionNumber = m.group("accNumber");
601 } catch (IllegalArgumentException e) {
602 // match group acc_number not found
603 }
604 }
605 if (betterAccessionNumber != null) {
606 accessionNumber = betterAccessionNumber;
607 } else {
608 unusualAccessionNumber = true;
609 }
610 }
611
612 } catch (IllegalArgumentException e){
613 // match group acc_number not found
614 }
615
616 if(collectionCode == null && instituteStr == null){
617 logger.warn(csvReportLine(regNumber, "neither 'collectionCode' nor 'institute' found in ", text));
618 continue;
619 }
620 collection = getCollection(collectionCode, instituteStr, subCollectionStr);
621 specimen = makeSpecimenType(fieldUnit, collection, accessionNumber);
622 break;
623 }
624 }
625 }
626 if(specimen == null) {
627 logger.warn(csvReportLine(regNumber, "Could not parse specimen fieldUnit", typeName.name().toString(), text));
628 }
629 if(unusualAccessionNumber){
630 logger.warn(csvReportLine(regNumber, "Unusual accession number", typeName.name().toString(), text, accessionNumber));
631 }
632 return specimen;
633 }
634
635 private DerivedUnit makeSpecimenType(FieldUnit fieldUnit, Collection collection, String accessionNumber) {
636
637 DerivedUnitFacade facade = DerivedUnitFacade.NewInstance(SpecimenOrObservationType.PreservedSpecimen, fieldUnit);
638 facade.setCollection(collection);
639 if(accessionNumber != null){
640 facade.setAccessionNumber(accessionNumber);
641 }
642 return facade.innerDerivedUnit();
643 }
644
645 private BotanicalName makeBotanicalName(SimpleExcelTaxonImportState<CONFIG> state, String regNumber, String titleCacheStr, String nameStr,
646 String authorStr, String nomRefTitle) {
647
648 BotanicalName taxonName;// cache field for the taxonName.titleCache
649 String taxonNameTitleCache = null;
650 Map<String, AnnotationType> nameAnnotations = new HashMap<>();
651
652 // TitleCache preprocessing
653 if(titleCacheStr.endsWith(ANNOTATION_MARKER_STRING) || (authorStr != null && authorStr.endsWith(ANNOTATION_MARKER_STRING))){
654 nameAnnotations.put("Author abbreviation not checked.", AnnotationType.EDITORIAL());
655 titleCacheStr = titleCacheStr.replace(ANNOTATION_MARKER_STRING, "").trim();
656 authorStr = authorStr.replace(ANNOTATION_MARKER_STRING, "").trim();
657 }
658
659 // parse the full taxon name
660 if(!StringUtils.isEmpty(nomRefTitle)){
661 String referenceSeparator = nomRefTitle.startsWith("in ") ? " " : ", ";
662 String taxonFullNameStr = titleCacheStr + referenceSeparator + nomRefTitle;
663 logger.debug(":::::" + taxonFullNameStr);
664 taxonName = (BotanicalName) nameParser.parseReferencedName(taxonFullNameStr, NomenclaturalCode.ICNAFP, null);
665 } else {
666 taxonName = (BotanicalName) nameParser.parseFullName(titleCacheStr, NomenclaturalCode.ICNAFP, null);
667 }
668
669 taxonNameTitleCache = taxonName.getTitleCache().trim();
670 if (taxonName.isProtectedTitleCache()) {
671 logger.warn(csvReportLine(regNumber, "Name could not be parsed", titleCacheStr));
672 } else {
673
674 boolean doRestoreTitleCacheStr = false;
675
676 // Check if titleCache and nameCache are plausible
677 String titleCacheCompareStr = titleCacheStr;
678 String nameCache = taxonName.getNameCache();
679 String nameCompareStr = nameStr;
680 if(taxonName.isBinomHybrid()){
681 titleCacheCompareStr = titleCacheCompareStr.replace(" x ", " ×");
682 nameCompareStr = nameCompareStr.replace(" x ", " ×");
683 }
684 if(taxonName.isMonomHybrid()){
685 titleCacheCompareStr = titleCacheCompareStr.replaceAll("^X ", "× ");
686 nameCompareStr = nameCompareStr.replace("^X ", "× ");
687 }
688 if(authorStr != null && authorStr.contains(" et ")){
689 titleCacheCompareStr = titleCacheCompareStr.replaceAll(" et ", " & ");
690 }
691 if (!taxonNameTitleCache.equals(titleCacheCompareStr)) {
692 logger.warn(csvReportLine(regNumber, "The generated titleCache differs from the imported string", taxonNameTitleCache, " != ", titleCacheStr, " ==> original titleCacheStr has been restored"));
693 doRestoreTitleCacheStr = true;
694 }
695 if (!nameCache.trim().equals(nameCompareStr)) {
696 logger.warn(csvReportLine(regNumber, "The parsed nameCache differs from field '" + NAMESTRING + "'", nameCache, " != ", nameCompareStr));
697 }
698
699 // Author
700 //nameParser.handleAuthors(taxonName, titleCacheStr, authorStr);
701 //if (!titleCacheStr.equals(taxonName.getTitleCache())) {
702 // logger.warn(regNumber + ": titleCache has changed after setting authors, will restore original titleCacheStr");
703 // doRestoreTitleCacheStr = true;
704 //}
705
706 if(doRestoreTitleCacheStr){
707 taxonName.setTitleCache(titleCacheStr, true);
708 }
709
710 // deduplicate
711 replaceAuthorNamesAndNomRef(state, taxonName);
712 }
713
714 // Annotations
715 if(!nameAnnotations.isEmpty()){
716 for(String text : nameAnnotations.keySet()){
717 taxonName.addAnnotation(Annotation.NewInstance(text, nameAnnotations.get(text), Language.DEFAULT()));
718 }
719 getNameService().save(taxonName);
720 }
721 return taxonName;
722 }
723
724 /**
725 * @param state
726 * @return
727 */
728 private TaxonNode getClassificationRootNode(IAPTImportState state) {
729
730 // Classification classification = state.getClassification();
731 // if (classification == null){
732 // IAPTImportConfigurator config = state.getConfig();
733 // classification = Classification.NewInstance(state.getConfig().getClassificationName());
734 // classification.setUuid(config.getClassificationUuid());
735 // classification.setReference(config.getSecReference());
736 // classification = getClassificationService().find(state.getConfig().getClassificationUuid());
737 // }
738 TaxonNode rootNode = state.getRootNode();
739 if (rootNode == null){
740 rootNode = getTaxonNodeService().find(ROOT_UUID);
741 }
742 if (rootNode == null){
743 Classification classification = state.getClassification();
744 if (classification == null){
745 Reference sec = state.getSecReference();
746 String classificationName = state.getConfig().getClassificationName();
747 Language language = Language.DEFAULT();
748 classification = Classification.NewInstance(classificationName, sec, language);
749 state.setClassification(classification);
750 classification.setUuid(state.getConfig().getClassificationUuid());
751 classification.getRootNode().setUuid(ROOT_UUID);
752 getClassificationService().save(classification);
753 }
754 rootNode = classification.getRootNode();
755 state.setRootNode(rootNode);
756 }
757 return rootNode;
758 }
759
760 private Collection getCollection(String collectionCode, String instituteStr, String subCollectionStr){
761
762 Collection superCollection = null;
763 if(subCollectionStr != null){
764 superCollection = getCollection(collectionCode, instituteStr, null);
765 collectionCode = subCollectionStr;
766 instituteStr = null;
767 }
768
769 final String key = collectionCode + "-#i:" + StringUtils.defaultString(instituteStr);
770
771 Collection collection = collectionMap.get(key);
772
773 if(collection == null) {
774 collection = Collection.NewInstance();
775 collection.setCode(collectionCode);
776 if(instituteStr != null){
777 collection.setInstitute(Institution.NewNamedInstance(instituteStr));
778 }
779 if(superCollection != null){
780 collection.setSuperCollection(superCollection);
781 }
782 collectionMap.put(key, collection);
783 getCollectionService().save(collection);
784 }
785
786 return collection;
787 }
788
789
790 /**
791 * @param record
792 * @param originalKey
793 * @param doUnescapeHtmlEntities
794 * @return
795 */
796 private String getValue(HashMap<String, String> record, String originalKey, boolean doUnescapeHtmlEntities) {
797 String value = record.get(originalKey);
798
799 value = fixCharacters(value);
800
801 if (! StringUtils.isBlank(value)) {
802 if (logger.isDebugEnabled()) {
803 logger.debug(originalKey + ": " + value);
804 }
805 value = CdmUtils.removeDuplicateWhitespace(value.trim()).toString();
806 if(doUnescapeHtmlEntities){
807 value = StringEscapeUtils.unescapeHtml(value);
808 }
809 return value.trim();
810 }else{
811 return null;
812 }
813 }
814
815 /**
816 * Fixes broken characters.
817 * For details see
818 * http://dev.e-taxonomy.eu/redmine/issues/6035
819 *
820 * @param value
821 * @return
822 */
823 private String fixCharacters(String value) {
824
825 value = StringUtils.replace(value, "s$K", "š");
826 value = StringUtils.replace(value, "n$K", "ň");
827 value = StringUtils.replace(value, "e$K", "ě");
828 value = StringUtils.replace(value, "r$K", "ř");
829 value = StringUtils.replace(value, "c$K", "č");
830 value = StringUtils.replace(value, "z$K", "ž");
831 value = StringUtils.replace(value, "S>U$K", "Š");
832 value = StringUtils.replace(value, "C>U$K", "Č");
833 value = StringUtils.replace(value, "R>U$K", "Ř");
834 value = StringUtils.replace(value, "Z>U$K", "Ž");
835 value = StringUtils.replace(value, "g$K", "ǧ");
836 value = StringUtils.replace(value, "s$A", "ś");
837 value = StringUtils.replace(value, "n$A", "ń");
838 value = StringUtils.replace(value, "c$A", "ć");
839 value = StringUtils.replace(value, "e$E", "ę");
840 value = StringUtils.replace(value, "o$H", "õ");
841 value = StringUtils.replace(value, "s$C", "ş");
842 value = StringUtils.replace(value, "t$C", "ț");
843 value = StringUtils.replace(value, "S>U$C", "Ş");
844 value = StringUtils.replace(value, "a$O", "å");
845 value = StringUtils.replace(value, "A>U$O", "Å");
846 value = StringUtils.replace(value, "u$O", "ů");
847 value = StringUtils.replace(value, "g$B", "ğ");
848 value = StringUtils.replace(value, "g$B", "ĕ");
849 value = StringUtils.replace(value, "a$B", "ă");
850 value = StringUtils.replace(value, "l$/", "ł");
851 value = StringUtils.replace(value, ">i", "ı");
852 value = StringUtils.replace(value, "i$U", "ï");
853 // Special-cases
854 value = StringUtils.replace(value, "&yacute", "ý");
855 value = StringUtils.replace(value, ">L", "Ł"); // corrected rule
856 value = StringUtils.replace(value, "E>U$D", "З");
857 value = StringUtils.replace(value, "S>U$E", "Ş");
858 value = StringUtils.replace(value, "s$E", "ş");
859
860 value = StringUtils.replace(value, "c$k", "č");
861 value = StringUtils.replace(value, " U$K", " Š");
862
863 return value;
864 }
865
866
867 /**
868 * Stores taxa records in DB
869 */
870 @Override
871 protected void firstPass(SimpleExcelTaxonImportState<CONFIG> state) {
872
873 String lineNumber = "L#" + state.getCurrentLine() + ": ";
874 logger.setLevel(Level.DEBUG);
875 HashMap<String, String> record = state.getOriginalRecord();
876 logger.debug(lineNumber + record.toString());
877
878 Set<String> keys = record.keySet();
879 for (String key: keys) {
880 if (! expectedKeys.contains(key)){
881 logger.warn(lineNumber + "Unexpected Key: " + key);
882 }
883 }
884
885 String reg_id = record.get(REGISTRATIONNO_PK);
886
887 //higherTaxon
888 String higherTaxaString = record.get(HIGHERTAXON);
889 boolean isFossil = false;
890 if(higherTaxaString.startsWith("FOSSIL ")){
891 higherTaxaString = higherTaxaString.replace("FOSSIL ", "");
892 isFossil = true;
893 }
894 TaxonNode higherTaxon = getHigherTaxon(higherTaxaString, (IAPTImportState)state);
895
896 //Taxon
897 Taxon taxon = makeTaxon(record, state, higherTaxon, isFossil);
898 if (taxon == null){
899 logger.warn(lineNumber + "taxon could not be created and is null");
900 return;
901 }
902 ((IAPTImportState)state).setCurrentTaxon(taxon);
903
904
905 return;
906 }
907
908 private TaxonNode getHigherTaxon(String higherTaxaString, IAPTImportState state) {
909 String[] higherTaxaNames = higherTaxaString.toLowerCase().replaceAll("[\\[\\]]", "").split(":");
910 TaxonNode higherTaxonNode = null;
911
912 ITaxonTreeNode rootNode = getClassificationRootNode(state);
913 for (String htn : higherTaxaNames) {
914 htn = StringUtils.capitalize(htn.trim());
915 Taxon higherTaxon = state.getHigherTaxon(htn);
916 if (higherTaxon != null){
917 higherTaxonNode = higherTaxon.getTaxonNodes().iterator().next();
918 }else{
919 BotanicalName name = makeHigherTaxonName(state, htn);
920 Reference sec = state.getSecReference();
921 higherTaxon = Taxon.NewInstance(name, sec);
922 getTaxonService().save(higherTaxon);
923 higherTaxonNode = rootNode.addChildTaxon(higherTaxon, sec, null);
924 state.putHigherTaxon(htn, higherTaxon);
925 getClassificationService().saveTreeNode(higherTaxonNode);
926 }
927 rootNode = higherTaxonNode;
928 }
929 return higherTaxonNode;
930 }
931
932 private BotanicalName makeHigherTaxonName(IAPTImportState state, String name) {
933
934 Rank rank = guessRank(name);
935
936 BotanicalName taxonName = BotanicalName.NewInstance(rank);
937 taxonName.addSource(makeOriginalSource(state));
938 taxonName.setGenusOrUninomial(StringUtils.capitalize(name));
939 return taxonName;
940 }
941
942 private Rank guessRank(String name) {
943
944 // normalize
945 name = name.replaceAll("\\(.*\\)", "").trim();
946
947 if(name.matches("^Plantae$|^Fungi$")){
948 return Rank.KINGDOM();
949 } else if(name.matches("^Incertae sedis$|^No group assigned$")){
950 return rankFamilyIncertisSedis();
951 } else if(name.matches(".*phyta$|.*mycota$")){
952 return Rank.SECTION_BOTANY();
953 } else if(name.matches(".*phytina$|.*mycotina$")){
954 return Rank.SUBSECTION_BOTANY();
955 } else if(name.matches("Gymnospermae$|.*ones$")){ // Monocotyledones, Dicotyledones
956 return rankUnrankedSupraGeneric();
957 } else if(name.matches(".*opsida$|.*phyceae$|.*mycetes$|.*ones$|^Musci$|^Hepaticae$")){
958 return Rank.CLASS();
959 } else if(name.matches(".*idae$|.*phycidae$|.*mycetidae$")){
960 return Rank.SUBCLASS();
961 } else if(name.matches(".*ales$")){
962 return Rank.ORDER();
963 } else if(name.matches(".*ineae$")){
964 return Rank.SUBORDER();
965 } else if(name.matches(".*aceae$")){
966 return Rank.FAMILY();
967 } else if(name.matches(".*oideae$")){
968 return Rank.SUBFAMILY();
969 } else
970 // if(name.matches(".*eae$")){
971 // return Rank.TRIBE();
972 // } else
973 if(name.matches(".*inae$")){
974 return Rank.SUBTRIBE();
975 } else if(name.matches(".*ae$")){
976 return Rank.FAMILY();
977 }
978 return Rank.UNKNOWN_RANK();
979 }
980
981 private Rank rankUnrankedSupraGeneric() {
982
983 if(rankUnrankedSupraGeneric == null){
984 rankUnrankedSupraGeneric = Rank.NewInstance(RankClass.Suprageneric, "Unranked supra generic", " ", " ");
985 getTermService().save(rankUnrankedSupraGeneric);
986 }
987 return rankUnrankedSupraGeneric;
988 }
989
990 private Rank rankFamilyIncertisSedis() {
991
992 if(familyIncertisSedis == null){
993 familyIncertisSedis = Rank.NewInstance(RankClass.Suprageneric, "Family incertis sedis", " ", " ");
994 getTermService().save(familyIncertisSedis);
995 }
996 return familyIncertisSedis;
997 }
998
999 private AnnotationType annotationTypeCaveats(){
1000 if(annotationTypeCaveats == null){
1001 annotationTypeCaveats = AnnotationType.NewInstance("Caveats", "Caveats", "");
1002 getTermService().save(annotationTypeCaveats);
1003 }
1004 return annotationTypeCaveats;
1005 }
1006
1007
1008 /**
1009 * @param state
1010 * @return
1011 */
1012 private IdentifiableSource makeOriginalSource(IAPTImportState state) {
1013 return IdentifiableSource.NewDataImportInstance("line: " + state.getCurrentLine(), null, state.getConfig().getSourceReference());
1014 }
1015
1016
1017 private Reference makeReference(IAPTImportState state, UUID uuidRef) {
1018 Reference ref = state.getReference(uuidRef);
1019 if (ref == null){
1020 ref = getReferenceService().find(uuidRef);
1021 state.putReference(uuidRef, ref);
1022 }
1023 return ref;
1024 }
1025
1026 private MarkerType markerTypeFossil(){
1027 if(this.markerTypeFossil == null){
1028 markerTypeFossil = MarkerType.NewInstance("isFossilTaxon", "isFossil", null);
1029 getTermService().save(this.markerTypeFossil);
1030 }
1031 return markerTypeFossil;
1032 }
1033
1034 private String csvReportLine(String regId, String message, String ... fields){
1035 StringBuilder out = new StringBuilder("regID#");
1036 out.append(regId).append(",\"").append(message).append('"');
1037
1038 for(String f : fields){
1039 out.append(",\"").append(f).append('"');
1040 }
1041 return out.toString();
1042 }
1043
1044
1045 }