14 |
14 |
import eu.etaxonomy.cdm.io.mexico.SimpleExcelTaxonImport;
|
15 |
15 |
import eu.etaxonomy.cdm.io.mexico.SimpleExcelTaxonImportState;
|
16 |
16 |
import eu.etaxonomy.cdm.model.agent.Institution;
|
|
17 |
import eu.etaxonomy.cdm.model.agent.Person;
|
|
18 |
import eu.etaxonomy.cdm.model.agent.TeamOrPersonBase;
|
17 |
19 |
import eu.etaxonomy.cdm.model.common.*;
|
18 |
20 |
import eu.etaxonomy.cdm.model.name.*;
|
19 |
21 |
import eu.etaxonomy.cdm.model.occurrence.*;
|
... | ... | |
73 |
75 |
REGISTRATIONNO_PK, HIGHERTAXON, FULLNAME, AUTHORSSPELLING, LITSTRING, REGISTRATION, TYPE, CAVEATS, FULLBASIONYM, FULLSYNSUBST, NOTESTXT, REGDATE, NAMESTRING, BASIONYMSTRING, SYNSUBSTSTR, AUTHORSTRING});
|
74 |
76 |
|
75 |
77 |
/**
 * Tokenizes a nomenclatural reference string of the form "text: detail.rest".
 * Group 1 is greedy and therefore takes everything before the last colon (followed by
 * one whitespace character) that still lets the rest match; group 2 is the text up to
 * the next '.', which must not contain '.' or ':'; group 3 is the remainder without an
 * optional trailing '.'.
 * NOTE(review): groups 2 and 3 appear to be read as reference detail and publication
 * date string - confirm against the matcher usage.
 */
private static final Pattern nomRefTokenizeP = Pattern.compile("^(.*):\\s([^\\.:]+)\\.(.*?)\\.?$");
|
76 |
|
private static final Pattern[] nomRefPubDatePs = new Pattern[]{
|
|
78 |
private static final Pattern[] datePatterns = new Pattern[]{
|
77 |
79 |
// NOTE:
|
78 |
80 |
// The order of the patterns is extremely important!!!
|
79 |
81 |
//
|
80 |
82 |
// all patterns cover the years 1700 - 1999
|
81 |
83 |
Pattern.compile("^(?<year>1[7,8,9][0-9]{2})$"), // only year, like '1969'
|
82 |
84 |
Pattern.compile("^(?<monthName>\\p{L}+\\.?)\\s(?<day>[0-9]{1,2})(?:st|rd|th)?\\.?,?\\s(?<year>(?:1[7,8,9])?[0-9]{2})$"), // full date like April 12, 1969 or april 12th 1999
|
83 |
|
Pattern.compile("^(?<monthName>\\p{L}+\\.?),?\\s(?<year>(?:1[7,8,9])?[0-9]{2})$"), // April 99 or April, 1999 or Apr. 12
|
84 |
|
Pattern.compile("^(?<day>[0-9]{1,2})([\\.\\-/])(?<month>[0-1]?[0-9])\\2(?<year>(?:1[7,8,9])?[0-9]{2})$"), // full date like 12.04.1969 or 12/04/1969 or 12-04-1969
|
|
85 |
Pattern.compile("^(?<monthName>\\p{L}+\\.?),?\\s?(?<year>(?:1[7,8,9])?[0-9]{2})$"), // April 99 or April, 1999 or Apr. 12
|
|
86 |
Pattern.compile("^(?<day>[0-9]{1,2})([\\.\\-/])(\\s?)(?<month>[0-1]?[0-9])\\2\\3(?<year>(?:1[7,8,9])?[0-9]{2})$"), // full date like 12.04.1969 or 12. 04. 1969 or 12/04/1969 or 12-04-1969
|
85 |
87 |
Pattern.compile("^(?<day>[0-9]{1,2})([\\.\\-/])(?<month>[IVX]{1,2})\\2(?<year>(?:1[7,8,9])?[0-9]{2})$"), // full date like 12-VI-1969
|
86 |
|
Pattern.compile("^(?:(?<day>[0-9]{1,2})\\sde\\s)(?<monthName>\\p{L}+)\\sde\\s(?<year>(?:1[7,8,9])?[0-9]{2})$"), // full and partial date like 12 de Enero de 1999 or Enero de 1999
|
|
88 |
Pattern.compile("^(?:(?<day>[0-9]{1,2})(?:\\sde)\\s)(?<monthName>\\p{L}+)\\sde\\s(?<year>(?:1[7,8,9])?[0-9]{2})$"), // full and partial date like 12 de Enero de 1999 or Enero de 1999
|
87 |
89 |
Pattern.compile("^(?<month>[0-1]?[0-9])([\\.\\-/])(?<year>(?:1[7,8,9])?[0-9]{2})$"), // partial date like 04.1969 or 04/1969 or 04-1969
|
88 |
90 |
Pattern.compile("^(?<year>(?:1[7,8,9])?[0-9]{2})([\\.\\-/])(?<month>[0-1]?[0-9])$"),// partial date like 1999-04
|
89 |
|
Pattern.compile("^(?<day>[0-9]{1,2})(?:[\\./]|th|rd)?\\s(?<monthName>\\p{L}+\\.?),?\\s(?<year>(?:1[7,8,9])?[0-9]{2})$"), // full date like 12. April 1969 or april 1999 or 22 Dec.1999
|
|
91 |
Pattern.compile("^(?<month>[IVX]{1,2})([\\.\\-/])(?<year>(?:1[7,8,9])?[0-9]{2})$"), // partial date like VI-1969
|
|
92 |
Pattern.compile("^(?<day>[0-9]{1,2})(?:[\\./]|th|rd|st)?\\s(?<monthName>\\p{L}+\\.?),?\\s?(?<year>(?:1[7,8,9])?[0-9]{2})$"), // full date like 12. April 1969 or april 1999 or 22 Dec.1999
|
90 |
93 |
};
|
91 |
|
private static final Pattern typeSplitPattern = Pattern.compile("^(?:\"*[Tt]ype: (?<type>.*?))(?:[Hh]olotype:(?<holotype>.*?)\\.?)?(?:[Ii]sotype[^:]*:(?<isotype>.*)\\.?)?\\.?$");
|
|
94 |
/**
 * Splits the type string into up to three parts: the mandatory part introduced by
 * "Type: " or "type: " (optionally preceded by double quotes), captured as group
 * <code>fieldUnit</code>, followed by an optional "Holotype:" part (group
 * <code>holotype</code>) and an optional "Isotype...:" part (group <code>isotype</code>).
 * The group names are looked up via <code>TypesName.name()</code>, so they have to stay
 * in sync with the constants of that enum.
 */
private static final Pattern typeSplitPattern = Pattern.compile("^(?:\"*[Tt]ype: (?<fieldUnit>.*?))(?:[Hh]olotype:(?<holotype>.*?)\\.?)?(?:[Ii]sotype[^:]*:(?<isotype>.*)\\.?)?\\.?$");
|
|
95 |
|
|
96 |
/**
 * Finds the collection data which is introduced by "leg.". Group 1 holds the content of
 * a parenthesis like "(leg. Metzeltin, 30. 9. 1996)", group 2 holds the text following
 * a "leg." which is preceded by whitespace, up to the end of the string and without an
 * optional trailing '.'. Only one of the two groups is set for a given match.
 */
private static final Pattern collectorPattern = Pattern.compile(".*?\\(leg\\.\\s+([^\\)]*)\\)|.*?\\sleg\\.\\s+(.*?)\\.?$");
|
|
97 |
/**
 * Splits collection data at the first comma: group <code>collector</code> is the text
 * before the comma, group <code>detail</code> is the text after it (one optional
 * whitespace character skipped) without an optional trailing '.'.
 */
private static final Pattern collectionDataPattern = Pattern.compile("^(?<collector>[^,]*),\\s?(?<detail>.*?)\\.?$");
|
|
98 |
/**
 * Recognizes a collectors number: a string starting with "No." or "no." followed by a
 * whitespace character and arbitrary text. The whole string is captured as group 1.
 */
private static final Pattern collectorsNumber = Pattern.compile("^([nN]o\\.\\s.*)$");
|
92 |
99 |
|
93 |
100 |
// AccessionNumbers: , #.*, n°:?, 96/3293, No..*, -?\w{1,3}-[0-9\-/]*
|
94 |
101 |
/**
 * Matches a string which consists only of an accession number: an optional prefix
 * ("n°", "n°:", "#", "No" or "No.", the textual prefixes optionally followed by one
 * whitespace character) and any number of word characters, '-' or '/'. The complete
 * string is captured as group <code>accNumber</code>; note that the empty string
 * matches as well.
 */
private static final Pattern accessionNumberOnlyPattern = Pattern.compile("^(?<accNumber>(?:n°\\:?\\s?|#|No\\.?\\s?)?[\\d\\w\\-/]*)$");
|
... | ... | |
96 |
103 |
private static final Pattern[] specimenTypePatterns = new Pattern[]{
|
97 |
104 |
Pattern.compile("^(?<colCode>[A-Z]+|CPC Micropaleontology Lab\\.?)\\s+(?:\\((?<institute>.*[^\\)])\\))(?<accNumber>.*)?$"), // like: GAUF (Gansu Agricultural University) No. 1207-1222
|
98 |
105 |
Pattern.compile("^(?<colCode>[A-Z]+|CPC Micropaleontology Lab\\.?)\\s+(?:Coll\\.\\s(?<subCollection>[^\\.,;]*)(.))(?<accNumber>.*)?$"), // like KASSEL Coll. Krasske, Praep. DII 78
|
99 |
|
Pattern.compile("^(?:Coll\\.\\s(?<subCollection>[^\\.,;]*)(.))(?<institute>.*)\\2(?<accNumber>.*)?$"), // like Coll. Lange-Bertalot, Bot. Inst., Univ. Frankfurt/Main, Germany Praep. Neukaledonien OTL 62
|
|
106 |
Pattern.compile("^(?:Coll\\.\\s(?<subCollection>[^\\.,;]*)(.))(?<institute>.*?)(?<accNumber>Praep\\..*)?$"), // like Coll. Lange-Bertalot, Bot. Inst., Univ. Frankfurt/Main, Germany Praep. Neukaledonien OTL 62
|
100 |
107 |
Pattern.compile("^(?<colCode>[A-Z]+)(?:\\s+(?<accNumber>.*))?$"), // identifies the Collection code and takes the rest as accessionNumber if any
|
101 |
108 |
};
|
102 |
109 |
|
... | ... | |
135 |
142 |
|
136 |
143 |
|
137 |
144 |
enum TypesName {
|
138 |
|
type, holotype, isotype;
|
|
145 |
fieldUnit, holotype, isotype;
|
139 |
146 |
|
140 |
147 |
public SpecimenTypeDesignationStatus status(){
|
141 |
148 |
switch (this) {
|
... | ... | |
187 |
194 |
nomRefDetail = m.group(2);
|
188 |
195 |
nomRefPupDate = m.group(3).trim();
|
189 |
196 |
|
190 |
|
pupDate = parsePubDate(regNumber, nomRefStr, nomRefPupDate);
|
|
197 |
pupDate = parseDate(regNumber, nomRefPupDate);
|
191 |
198 |
if (pupDate != null) {
|
192 |
199 |
nomRefTitle = nomRefTitle + ": " + nomRefDetail + ". " + pupDate.toString(formatterYear) + ".";
|
|
200 |
} else {
|
|
201 |
logger.warn(csvReportLine(regNumber, "Pub date", nomRefPupDate, "in", nomRefStr, "not parsable"));
|
193 |
202 |
}
|
194 |
203 |
} else {
|
195 |
204 |
nomRefTitle = nomRefStr;
|
... | ... | |
263 |
272 |
|
264 |
273 |
// Types
|
265 |
274 |
if(!StringUtils.isEmpty(typeStr)){
|
266 |
|
makeTypeData(typeStr, taxonName, regNumber);
|
|
275 |
makeTypeData(typeStr, taxonName, regNumber, state);
|
267 |
276 |
}
|
268 |
277 |
|
269 |
278 |
getTaxonService().save(taxon);
|
... | ... | |
276 |
285 |
|
277 |
286 |
}
|
278 |
287 |
|
279 |
|
private void makeTypeData(String typeStr, BotanicalName taxonName, String regNumber) {
|
|
288 |
private void makeTypeData(String typeStr, BotanicalName taxonName, String regNumber, SimpleExcelTaxonImportState<CONFIG> state) {
|
280 |
289 |
|
281 |
290 |
Matcher m = typeSplitPattern.matcher(typeStr);
|
282 |
291 |
|
283 |
292 |
if(m.matches()){
|
284 |
|
String typeString = m.group(TypesName.type.name());
|
285 |
|
boolean isFieldUnit = typeStr.matches(".*([°']|\\d+\\s?m\\s|\\d+\\s?km\\s).*"); // check for location or unit m, km
|
286 |
|
|
287 |
|
if(isFieldUnit) {
|
288 |
|
// type as fieldUnit
|
289 |
|
FieldUnit fu = FieldUnit.NewInstance();
|
290 |
|
fu.setTitleCache(typeString, true);
|
291 |
|
getOccurrenceService().save(fu);
|
292 |
|
|
293 |
|
// all others ..
|
294 |
|
addSpecimenTypes(taxonName, fu, m.group(TypesName.holotype.name()), TypesName.holotype, false, regNumber);
|
295 |
|
addSpecimenTypes(taxonName, fu, m.group(TypesName.isotype.name()), TypesName.isotype, true, regNumber);
|
296 |
|
} else {
|
297 |
|
TaxonNameBase typeName = nameParser.parseFullName(typeString);
|
298 |
|
taxonName.addNameTypeDesignation(typeName, null, null, null, NameTypeDesignationStatus.AUTOMATIC(), true, true, true, true);
|
|
293 |
String fieldUnitStr = m.group(TypesName.fieldUnit.name());
|
|
294 |
// boolean isFieldUnit = typeStr.matches(".*([°']|\\d+\\s?m\\s|\\d+\\s?km\\s).*"); // check for location or unit m, km // makes no sense!!!!
|
|
295 |
FieldUnit fieldUnit = parseFieldUnit(fieldUnitStr, regNumber, state);
|
|
296 |
if(fieldUnit == null) {
|
|
297 |
// create a field unit with only a titleCache using the fieldUnitStr substring
|
|
298 |
fieldUnit = FieldUnit.NewInstance();
|
|
299 |
fieldUnit.setTitleCache(fieldUnitStr, true);
|
|
300 |
getOccurrenceService().save(fieldUnit);
|
299 |
301 |
}
|
|
302 |
getOccurrenceService().save(fieldUnit);
|
|
303 |
|
|
304 |
// all others ..
|
|
305 |
addSpecimenTypes(taxonName, fieldUnit, m.group(TypesName.holotype.name()), TypesName.holotype, false, regNumber);
|
|
306 |
addSpecimenTypes(taxonName, fieldUnit, m.group(TypesName.isotype.name()), TypesName.isotype, true, regNumber);
|
|
307 |
|
|
308 |
} else {
|
|
309 |
// create a field unit with only a titleCache using the full typeStr
|
|
310 |
FieldUnit fieldUnit = FieldUnit.NewInstance();
|
|
311 |
fieldUnit.setTitleCache(typeStr, true);
|
|
312 |
getOccurrenceService().save(fieldUnit);
|
|
313 |
logger.warn(csvReportLine(regNumber, "Type field can not be parsed", typeStr));
|
300 |
314 |
}
|
301 |
315 |
getNameService().save(taxonName);
|
302 |
316 |
}
|
303 |
317 |
|
304 |
|
private Partial parsePubDate(String regNumber, String nomRefStr, String nomRefPupDate) {
|
|
318 |
/**
|
|
319 |
* Currently only parses the collector, fieldNumber and the collection date.
|
|
320 |
*
|
|
321 |
* @param fieldUnitStr
|
|
322 |
* @param regNumber
|
|
323 |
* @param state
|
|
324 |
* @return null if the fieldUnitStr could not be parsed
|
|
325 |
*/
|
|
326 |
private FieldUnit parseFieldUnit(String fieldUnitStr, String regNumber, SimpleExcelTaxonImportState<CONFIG> state) {
|
|
327 |
|
|
328 |
FieldUnit fieldUnit = null;
|
|
329 |
|
|
330 |
Matcher m1 = collectorPattern.matcher(fieldUnitStr);
|
|
331 |
if(m1.matches()){
|
|
332 |
String collectionData = m1.group(1); // like (leg. Metzeltin, 30. 9. 1996)
|
|
333 |
if(collectionData == null){
|
|
334 |
collectionData = m1.group(2); // like leg. Metzeltin, 30. 9. 1996
|
|
335 |
}
|
|
336 |
if(collectionData == null){
|
|
337 |
return null;
|
|
338 |
}
|
|
339 |
|
|
340 |
String collectorStr = null;
|
|
341 |
String detailStr = null;
|
|
342 |
Partial date = null;
|
|
343 |
String fieldNumber = null;
|
|
344 |
|
|
345 |
Matcher m2 = collectionDataPattern.matcher(collectionData);
|
|
346 |
if(m2.matches()){
|
|
347 |
collectorStr = m2.group("collector");
|
|
348 |
detailStr = m2.group("detail");
|
|
349 |
|
|
350 |
// Try to make sense of the detailStr
|
|
351 |
if(detailStr != null){
|
|
352 |
detailStr = detailStr.trim();
|
|
353 |
// 1. try to parse as date
|
|
354 |
date = parseDate(regNumber, detailStr);
|
|
355 |
if(date == null){
|
|
356 |
// 2. try to parse as number
|
|
357 |
if(collectorsNumber.matcher(detailStr).matches()){
|
|
358 |
fieldNumber = detailStr;
|
|
359 |
}
|
|
360 |
}
|
|
361 |
}
|
|
362 |
if(date == null && fieldNumber == null){
|
|
363 |
// detailed parsing not possible, so need fo fallback
|
|
364 |
collectorStr = collectionData;
|
|
365 |
}
|
|
366 |
}
|
|
367 |
|
|
368 |
if(collectorStr != null) {
|
|
369 |
fieldUnit = FieldUnit.NewInstance();
|
|
370 |
GatheringEvent ge = GatheringEvent.NewInstance();
|
|
371 |
|
|
372 |
TeamOrPersonBase agent = state.getAgentBase(collectorStr);
|
|
373 |
if(agent == null) {
|
|
374 |
agent = Person.NewTitledInstance(collectorStr);
|
|
375 |
getAgentService().save(agent);
|
|
376 |
state.putAgentBase(collectorStr, agent);
|
|
377 |
}
|
|
378 |
ge.setCollector(agent);
|
|
379 |
|
|
380 |
if(date != null){
|
|
381 |
ge.setGatheringDate(date);
|
|
382 |
}
|
|
383 |
|
|
384 |
getEventBaseService().save(ge);
|
|
385 |
fieldUnit.setGatheringEvent(ge);
|
|
386 |
|
|
387 |
if(fieldNumber != null) {
|
|
388 |
fieldUnit.setFieldNumber(fieldNumber);
|
|
389 |
}
|
|
390 |
getOccurrenceService().save(fieldUnit);
|
|
391 |
}
|
|
392 |
}
|
|
393 |
|
|
394 |
return fieldUnit;
|
|
395 |
}
|
|
396 |
|
|
397 |
private Partial parseDate(String regNumber, String dateStr) {
|
305 |
398 |
|
306 |
399 |
Partial pupDate = null;
|
307 |
400 |
boolean parseError = false;
|
308 |
|
String nomRefPupDay = null;
|
309 |
|
String nomRefPupMonth = null;
|
310 |
|
String nomRefPupMonthName = null;
|
311 |
|
String nomRefPupYear = null;
|
312 |
401 |
|
|
402 |
String day = null;
|
|
403 |
String month = null;
|
|
404 |
String monthName = null;
|
|
405 |
String year = null;
|
313 |
406 |
|
314 |
|
// nomRefDetail.replaceAll("[\\:\\.\\s]", ""); // TODO integrate into nomRefTokenizeP
|
315 |
|
for(Pattern p : nomRefPubDatePs){
|
316 |
|
Matcher m2 = p.matcher(nomRefPupDate);
|
|
407 |
for(Pattern p : datePatterns){
|
|
408 |
Matcher m2 = p.matcher(dateStr);
|
317 |
409 |
if(m2.matches()){
|
318 |
410 |
try {
|
319 |
|
nomRefPupYear = m2.group("year");
|
|
411 |
year = m2.group("year");
|
320 |
412 |
} catch (IllegalArgumentException e){
|
321 |
413 |
// named capture group not found
|
322 |
414 |
}
|
323 |
415 |
try {
|
324 |
|
nomRefPupMonth = m2.group("month");
|
|
416 |
month = m2.group("month");
|
325 |
417 |
} catch (IllegalArgumentException e){
|
326 |
418 |
// named capture group not found
|
327 |
419 |
}
|
|
420 |
|
328 |
421 |
try {
|
329 |
|
nomRefPupMonthName = m2.group("monthName");
|
330 |
|
nomRefPupMonth = monthFromName(nomRefPupMonthName, regNumber);
|
331 |
|
if(nomRefPupMonth == null){
|
|
422 |
monthName = m2.group("monthName");
|
|
423 |
month = monthFromName(monthName, regNumber);
|
|
424 |
if(month == null){
|
332 |
425 |
parseError = true;
|
333 |
426 |
}
|
334 |
427 |
} catch (IllegalArgumentException e){
|
335 |
428 |
// named capture group not found
|
336 |
429 |
}
|
337 |
430 |
try {
|
338 |
|
nomRefPupDay = m2.group("day");
|
|
431 |
day = m2.group("day");
|
339 |
432 |
} catch (IllegalArgumentException e){
|
340 |
433 |
// named capture group not found
|
341 |
434 |
}
|
342 |
435 |
|
343 |
|
if(nomRefPupYear == null){
|
344 |
|
logger.error("nomRefPupYear in " + nomRefStr + " is NULL" );
|
|
436 |
if(year != null){
|
|
437 |
if (year.length() == 2) {
|
|
438 |
// it is an abbreviated year from the 19** years
|
|
439 |
year = "19" + year;
|
|
440 |
}
|
|
441 |
break;
|
|
442 |
} else {
|
345 |
443 |
parseError = true;
|
346 |
444 |
}
|
347 |
|
if(nomRefPupYear.length() == 2 ){
|
348 |
|
// it is an abbreviated year from the 19** years
|
349 |
|
nomRefPupYear = "19" + nomRefPupYear;
|
350 |
|
}
|
351 |
|
|
352 |
|
break;
|
353 |
445 |
}
|
354 |
446 |
}
|
355 |
|
if(nomRefPupYear == null){
|
356 |
|
logger.warn(csvReportLine(regNumber, "Pub date", nomRefPupDate, "in", nomRefStr, "not parsable"));
|
|
447 |
if(year == null){
|
357 |
448 |
parseError = true;
|
358 |
449 |
}
|
359 |
450 |
List<DateTimeFieldType> types = new ArrayList<>();
|
360 |
451 |
List<Integer> values = new ArrayList<>();
|
361 |
452 |
if(!parseError) {
|
362 |
453 |
types.add(DateTimeFieldType.year());
|
363 |
|
values.add(Integer.parseInt(nomRefPupYear));
|
364 |
|
if (nomRefPupMonth != null) {
|
|
454 |
values.add(Integer.parseInt(year));
|
|
455 |
if (month != null) {
|
365 |
456 |
types.add(DateTimeFieldType.monthOfYear());
|
366 |
|
values.add(Integer.parseInt(nomRefPupMonth));
|
|
457 |
values.add(Integer.parseInt(month));
|
367 |
458 |
}
|
368 |
|
if (nomRefPupDay != null) {
|
|
459 |
if (day != null) {
|
369 |
460 |
types.add(DateTimeFieldType.dayOfMonth());
|
370 |
|
values.add(Integer.parseInt(nomRefPupDay));
|
|
461 |
values.add(Integer.parseInt(day));
|
371 |
462 |
}
|
372 |
463 |
pupDate = new Partial(types.toArray(new DateTimeFieldType[types.size()]), ArrayUtils.toPrimitive(values.toArray(new Integer[values.size()])));
|
373 |
464 |
}
|
... | ... | |
484 |
575 |
try {
|
485 |
576 |
collectionCode = m.group("colCode");
|
486 |
577 |
} catch (IllegalArgumentException e){
|
487 |
|
logger.warn(csvReportLine(regNumber, "match group colCode not found"));
|
488 |
|
continue;
|
|
578 |
// match group colCode not found
|
489 |
579 |
}
|
490 |
580 |
try {
|
491 |
581 |
subCollectionStr = m.group("subCollection");
|
... | ... | |
523 |
613 |
// match group acc_number not found
|
524 |
614 |
}
|
525 |
615 |
|
|
616 |
if(collectionCode == null && instituteStr == null){
|
|
617 |
logger.warn(csvReportLine(regNumber, "neither 'collectionCode' nor 'institute' found in ", text));
|
|
618 |
continue;
|
|
619 |
}
|
526 |
620 |
collection = getCollection(collectionCode, instituteStr, subCollectionStr);
|
527 |
621 |
specimen = makeSpecimenType(fieldUnit, collection, accessionNumber);
|
528 |
622 |
break;
|
... | ... | |
530 |
624 |
}
|
531 |
625 |
}
|
532 |
626 |
if(specimen == null) {
|
533 |
|
logger.warn(csvReportLine(regNumber, "Could not parse specimen type", typeName.name().toString(), text));
|
|
627 |
logger.warn(csvReportLine(regNumber, "Could not parse specimen fieldUnit", typeName.name().toString(), text));
|
534 |
628 |
}
|
535 |
629 |
if(unusualAccessionNumber){
|
536 |
630 |
logger.warn(csvReportLine(regNumber, "Unusual accession number", typeName.name().toString(), text, accessionNumber));
|
... | ... | |
542 |
636 |
|
543 |
637 |
DerivedUnitFacade facade = DerivedUnitFacade.NewInstance(SpecimenOrObservationType.PreservedSpecimen, fieldUnit);
|
544 |
638 |
facade.setCollection(collection);
|
545 |
|
facade.setAccessionNumber(accessionNumber);
|
|
639 |
if(accessionNumber != null){
|
|
640 |
facade.setAccessionNumber(accessionNumber);
|
|
641 |
}
|
546 |
642 |
return facade.innerDerivedUnit();
|
547 |
643 |
}
|
548 |
644 |
|
ref #6026 better type parsing