package eu.etaxonomy.cdm.io.iapt;
import eu.etaxonomy.cdm.api.facade.DerivedUnitFacade;
+import eu.etaxonomy.cdm.api.service.pager.Pager;
import eu.etaxonomy.cdm.common.CdmUtils;
import eu.etaxonomy.cdm.io.mexico.SimpleExcelTaxonImport;
import eu.etaxonomy.cdm.io.mexico.SimpleExcelTaxonImportState;
import eu.etaxonomy.cdm.model.occurrence.*;
import eu.etaxonomy.cdm.model.occurrence.Collection;
import eu.etaxonomy.cdm.model.reference.Reference;
+import eu.etaxonomy.cdm.model.reference.ReferenceFactory;
+import eu.etaxonomy.cdm.model.reference.ReferenceType;
import eu.etaxonomy.cdm.model.taxon.*;
import eu.etaxonomy.cdm.strategy.parser.NonViralNameParserImpl;
+import eu.etaxonomy.cdm.strategy.parser.ParserProblem;
import org.apache.commons.lang.ArrayUtils;
import org.apache.commons.lang.StringEscapeUtils;
import org.apache.commons.lang.StringUtils;
private static List<String> expectedKeys= Arrays.asList(new String[]{
REGISTRATIONNO_PK, HIGHERTAXON, FULLNAME, AUTHORSSPELLING, LITSTRING, REGISTRATION, TYPE, CAVEATS, FULLBASIONYM, FULLSYNSUBST, NOTESTXT, REGDATE, NAMESTRING, BASIONYMSTRING, SYNSUBSTSTR, AUTHORSTRING});
- private static final Pattern nomRefTokenizeP = Pattern.compile("^(.*):\\s([^\\.:]+)\\.(.*?)\\.?$");
+ private static final Pattern nomRefTokenizeP = Pattern.compile("^(?<title>.*):\\s(?<detail>[^\\.:]+)\\.(?<date>.*?)(?:\\s\\((?<issue>[^\\)]*)\\)\\s*)?\\.?$");
private static final Pattern[] datePatterns = new Pattern[]{
// NOTE:
// The order of the patterns is extremely important!!!
Pattern.compile("^(?<monthName>\\p{L}+\\.?)\\s(?<day>[0-9]{1,2})(?:st|rd|th)?\\.?,?\\s(?<year>(?:1[7,8,9])?[0-9]{2})$"), // full date like April 12, 1969 or april 12th 1999
Pattern.compile("^(?<monthName>\\p{L}+\\.?),?\\s?(?<year>(?:1[7,8,9])?[0-9]{2})$"), // April 99 or April, 1999 or Apr. 12
Pattern.compile("^(?<day>[0-9]{1,2})([\\.\\-/])(\\s?)(?<month>[0-1]?[0-9])\\2\\3(?<year>(?:1[7,8,9])?[0-9]{2})$"), // full date like 12.04.1969 or 12. 04. 1969 or 12/04/1969 or 12-04-1969
- Pattern.compile("^(?<day>[0-9]{1,2})([\\.\\-/])(?<month>[IVX]{1,2})\\2(?<year>(?:1[7,8,9])?[0-9]{2})$"), // full date like 12-VI-1969
- Pattern.compile("^(?:(?<day>[0-9]{1,2})(?:\\sde)\\s)(?<monthName>\\p{L}+)\\sde\\s(?<year>(?:1[7,8,9])?[0-9]{2})$"), // full and partial date like 12 de Enero de 1999 or Enero de 1999
+ Pattern.compile("^(?<day>[0-9]{1,2})([\\.\\-/])(?<monthName>[IVX]{1,2})\\2(?<year>(?:1[7,8,9])?[0-9]{2})$"), // full date like 12-VI-1969
+ Pattern.compile("^(?:(?<day>[0-9]{1,2})(?:\\sde)\\s)?(?<monthName>\\p{L}+)\\sde\\s(?<year>(?:1[7,8,9])?[0-9]{2})$"), // full and partial date like 12 de Enero de 1999 or Enero de 1999
Pattern.compile("^(?<month>[0-1]?[0-9])([\\.\\-/])(?<year>(?:1[7,8,9])?[0-9]{2})$"), // partial date like 04.1969 or 04/1969 or 04-1969
Pattern.compile("^(?<year>(?:1[7,8,9])?[0-9]{2})([\\.\\-/])(?<month>[0-1]?[0-9])$"),// partial date like 1999-04
- Pattern.compile("^(?<month>[IVX]{1,2})([\\.\\-/])(?<year>(?:1[7,8,9])?[0-9]{2})$"), // partial date like VI-1969
+ Pattern.compile("^(?<monthName>[IVX]{1,2})([\\.\\-/])(?<year>(?:1[7,8,9])?[0-9]{2})$"), // partial date like VI-1969
Pattern.compile("^(?<day>[0-9]{1,2})(?:[\\./]|th|rd|st)?\\s(?<monthName>\\p{L}+\\.?),?\\s?(?<year>(?:1[7,8,9])?[0-9]{2})$"), // full date like 12. April 1969 or april 1999 or 22 Dec.1999
};
- private static final Pattern typeSplitPattern = Pattern.compile("^(?:\"*[Tt]ype: (?<fieldUnit>.*?))(?:[Hh]olotype:(?<holotype>.*?)\\.?)?(?:[Ii]sotype[^:]*:(?<isotype>.*)\\.?)?\\.?$");
+ private static final Pattern typeSpecimenSplitPattern = Pattern.compile("^(?:\"*[Tt]ype: (?<fieldUnit>.*?))(?:[Hh]olotype:(?<holotype>.*?)\\.?)?(?:[Ii]sotype.*?[:\\(](?<isotype>.*)\\.?)?\\.?$");
- private static final Pattern collectorPattern = Pattern.compile(".*?\\(leg\\.\\s+([^\\)]*)\\)|.*?\\sleg\\.\\s+(.*?)\\.?$");
+ private static final Pattern typeNameBasionymPattern = Pattern.compile("\\([Bb]asionym\\s?\\:\\s?(?<basionymName>[^\\)]*).*$");
+ private static final Pattern typeNameNotePattern = Pattern.compile("\\[([^\\[]*)"); // matches the inner of '[...]'
+ // Splits the special name type string 'NOTE; NOTE: AGENT; NAME' into the named groups 'note', 'agent' and 'name',
+ // e.g.: not to be indicated (Art. H.9.1. Tokyo Code); stated parent genera: Hechtia Klotzsch; Deuterocohnia Mez
+ private static final Pattern typeNameSpecialSplitPattern = Pattern.compile("(?<note>.*\\;.*?)\\:(?<agent>.*?)\\;(?<name>.*)");
+
+ private static final Pattern collectorPattern = Pattern.compile(".*?(?<fullStr1>\\(leg\\.\\s+(?<data1>[^\\)]*)\\))|.*?(?<fullStr2>\\sleg\\.\\s+(?<data2>.*?)\\.?)$");
private static final Pattern collectionDataPattern = Pattern.compile("^(?<collector>[^,]*),\\s?(?<detail>.*?)\\.?$");
private static final Pattern collectorsNumber = Pattern.compile("^([nN]o\\.\\s.*)$");
private static final Pattern[] specimenTypePatterns = new Pattern[]{
Pattern.compile("^(?<colCode>[A-Z]+|CPC Micropaleontology Lab\\.?)\\s+(?:\\((?<institute>.*[^\\)])\\))(?<accNumber>.*)?$"), // like: GAUF (Gansu Agricultural University) No. 1207-1222
Pattern.compile("^(?<colCode>[A-Z]+|CPC Micropaleontology Lab\\.?)\\s+(?:Coll\\.\\s(?<subCollection>[^\\.,;]*)(.))(?<accNumber>.*)?$"), // like KASSEL Coll. Krasske, Praep. DII 78
- Pattern.compile("^(?:Coll\\.\\s(?<subCollection>[^\\.,;]*)(.))(?<institute>.*?)(?<accNumber>Praep\\..*)?$"), // like Coll. Lange-Bertalot, Bot. Inst., Univ. Frankfurt/Main, Germany Praep. Neukaledonien OTL 62
+ Pattern.compile("^(?:in\\s)?(?<institute>[Cc]oll\\.\\s.*?)(?:\\s+(?<accNumber>(Praep\\.|slide|No\\.|Inv\\. Nr\\.|Nr\\.).*))?$"), // like Coll. Lange-Bertalot, Bot. Inst., Univ. Frankfurt/Main, Germany Praep. Neukaledonien OTL 62
+ Pattern.compile("^(?<institute>Inst\\.\\s.*?)\\s+(?<accNumber>N\\s.*)?$"), // like Inst. Geological Sciences, Acad. Sci. Belarus, Minsk N 212 A
Pattern.compile("^(?<colCode>[A-Z]+)(?:\\s+(?<accNumber>.*))?$"), // identifies the Collection code and takes the rest as accessionNumber if any
};
private Rank familyIncertisSedis = null;
private AnnotationType annotationTypeCaveats = null;
+ private Reference bookVariedadesTradicionales = null;
+
+ /**
+ * HACK for simple unit testing: enabled when the system property TEST_MODE is set.
+ */
+ boolean _testMode = System.getProperty("TEST_MODE") != null;
+
private Taxon makeTaxon(HashMap<String, String> record, SimpleExcelTaxonImportState<CONFIG> state,
TaxonNode higherTaxonNode, boolean isFossil) {
- String line = state.getCurrentLine() + ": ";
-
String regNumber = getValue(record, REGISTRATIONNO_PK, false);
String regStr = getValue(record, REGISTRATION, true);
String titleCacheStr = getValue(record, FULLNAME, true);
String nomRefTitle = null;
String nomRefDetail;
String nomRefPupDate = null;
+ String nomRefIssue = null;
Partial pupDate = null;
+ boolean restoreOriginalReference = false;
+ boolean nameIsValid = true;
+
// preprocess nomRef: separate citation, reference detail, publishing date
if(!StringUtils.isEmpty(nomRefStr)){
nomRefStr = nomRefStr.trim();
+
+ // handle the special case which is hard to parse:
+ //
+ // Las variedades tradicionales de frutales de la Cuenca del Río Segura. Catálogo Etnobotánico (1): Frutos secos, oleaginosos, frutales de hueso, almendros y frutales de pepita: 154. 1997.
+ if(nomRefStr.startsWith("Las variedades tradicionales de frutales ")){
+
+ if(bookVariedadesTradicionales == null){
+ bookVariedadesTradicionales = ReferenceFactory.newBook();
+ bookVariedadesTradicionales.setTitle("Las variedades tradicionales de frutales de la Cuenca del Río Segura. Catálogo Etnobotánico (1): Frutos secos, oleaginosos, frutales de hueso, almendros y frutales de pepita");
+ bookVariedadesTradicionales.setDatePublished(TimePeriod.NewInstance(1997));
+ getReferenceService().save(bookVariedadesTradicionales);
+ }
+ nomRefStr = nomRefStr.replaceAll("^.*?\\:.*?\\:", "Las variedades tradicionales:");
+ restoreOriginalReference = true;
+ }
+
Matcher m = nomRefTokenizeP.matcher(nomRefStr);
if(m.matches()){
- nomRefTitle = m.group(1);
- nomRefDetail = m.group(2);
- nomRefPupDate = m.group(3).trim();
+ nomRefTitle = m.group("title");
+ nomRefDetail = m.group("detail");
+ nomRefPupDate = m.group("date").trim();
+ nomRefIssue = m.group("issue");
pupDate = parseDate(regNumber, nomRefPupDate);
if (pupDate != null) {
"\n - '" + REGISTRATION + "': " + regStr
, AnnotationType.TECHNICAL(), Language.DEFAULT()));
+ if(restoreOriginalReference){
+ taxonName.setNomenclaturalReference(bookVariedadesTradicionales);
+ }
if(pupDate != null) {
taxonName.getNomenclaturalReference().setDatePublished(TimePeriod.NewInstance(pupDate));
}
+ if(nomRefIssue != null) {
+ ((Reference)taxonName.getNomenclaturalReference()).setVolume(nomRefIssue);
+ }
+
if(!StringUtils.isEmpty(notesTxt)){
notesTxt = notesTxt.replace("Notes: ", "").trim();
taxonName.addAnnotation(Annotation.NewInstance(notesTxt, AnnotationType.EDITORIAL(), Language.DEFAULT()));
+ nameIsValid = false;
+
}
if(!StringUtils.isEmpty(caveats)){
caveats = caveats.replace("Caveats: ", "").trim();
taxonName.addAnnotation(Annotation.NewInstance(caveats, annotationTypeCaveats(), Language.DEFAULT()));
+ nameIsValid = false;
+ }
+
+ if(nameIsValid){
+ // The status is always considered valid if neither notes nor caveats are set
+ taxonName.addStatus(NomenclaturalStatus.NewInstance(NomenclaturalStatusType.VALID()));
}
getNameService().save(taxonName);
// Basionym
if(fullBasionymStr != null){
fullBasionymStr = fullBasionymStr.replaceAll("^\\w*:\\s", ""); // Strip off the leading 'Basionym: "
+ basionymNameStr = basionymNameStr.replaceAll("^\\w*:\\s", ""); // Strip off the leading 'Basionym: "
BotanicalName basionym = makeBotanicalName(state, regNumber, fullBasionymStr, basionymNameStr, null, null);
getNameService().save(basionym);
taxonName.addBasionym(basionym);
// Types
if(!StringUtils.isEmpty(typeStr)){
- makeTypeData(typeStr, taxonName, regNumber, state);
+
+ if(taxonName.getRank().isSpecies() || taxonName.getRank().isLower(Rank.SPECIES())) {
+ makeSpecimenTypeData(typeStr, taxonName, regNumber, state);
+ } else {
+ makeNameTypeData(typeStr, taxonName, regNumber, state);
+ }
}
getTaxonService().save(taxon);
+
+ if(taxonName.getRank().equals(Rank.SPECIES()) || taxonName.getRank().isLower(Rank.SPECIES())){
+ // try to find the genus, it should have been imported already, Genera are coming first in the import file
+ Taxon genus = ((IAPTImportState)state).getGenusTaxonMap().get(taxonName.getGenusOrUninomial());
+ if(genus != null){
+ higherTaxonNode = genus.getTaxonNodes().iterator().next();
+ } else {
+ logger.info(csvReportLine(regNumber, "Parent genus not found for", nameStr));
+ }
+ }
+
if(higherTaxonNode != null){
higherTaxonNode.addChildTaxon(taxon, null, null);
getTaxonNodeService().save(higherTaxonNode);
}
- return taxon;
+ if(taxonName.getRank().isGenus()){
+ ((IAPTImportState)state).getGenusTaxonMap().put(taxonName.getGenusOrUninomial(), taxon);
+ }
+ return taxon;
}
- private void makeTypeData(String typeStr, BotanicalName taxonName, String regNumber, SimpleExcelTaxonImportState<CONFIG> state) {
+ private void makeSpecimenTypeData(String typeStr, BotanicalName taxonName, String regNumber, SimpleExcelTaxonImportState<CONFIG> state) {
- Matcher m = typeSplitPattern.matcher(typeStr);
+ Matcher m = typeSpecimenSplitPattern.matcher(typeStr);
if(m.matches()){
String fieldUnitStr = m.group(TypesName.fieldUnit.name());
FieldUnit fieldUnit = parseFieldUnit(fieldUnitStr, regNumber, state);
if(fieldUnit == null) {
// create a field unit with only a titleCache using the fieldUnitStr substring
- logger.warn(csvReportLine(regNumber, "Type: fielUnitStr can not be parsed", fieldUnitStr));
+ logger.warn(csvReportLine(regNumber, "Type: fieldUnitStr can not be parsed", fieldUnitStr));
fieldUnit = FieldUnit.NewInstance();
fieldUnit.setTitleCache(fieldUnitStr, true);
getOccurrenceService().save(fieldUnit);
getNameService().save(taxonName);
}
+ /**
+  * Creates a name type designation for <code>taxonName</code> from the raw type string.
+  * <p>
+  * A leading "Type:" prefix is stripped first; nothing is done if no text remains.
+  * Two kinds of strings are handled:
+  * <ul>
+  * <li>the special case starting with "not to be indicated", which is split by
+  * <code>typeNameSpecialSplitPattern</code> into note, agent and name</li>
+  * <li>the generic case, from which an optional "(Basionym: ...)" part and an optional
+  * "[...]" note are cut off before the remaining text is parsed as the type name</li>
+  * </ul>
+  * The type name (and its basionym, if any) is parsed with the ICNAFP code, saved, and
+  * added to <code>taxonName</code> as name type designation. A note, if found, is stored
+  * as editorial annotation on that designation.
+  *
+  * @param typeStr the raw type string of the record, usually starting with "Type:"
+  * @param taxonName the name which receives the name type designation
+  * @param regNumber the registration number of the record, only used for log reporting
+  * @param state the import state (not read by this method)
+  */
+ private void makeNameTypeData(String typeStr, BotanicalName taxonName, String regNumber, SimpleExcelTaxonImportState<CONFIG> state) {
+
+     String nameStr = typeStr.replaceAll("^Type\\s?\\:\\s?", "");
+     if(nameStr.isEmpty()) {
+         return;
+     }
+
+     String basionymNameStr = null;
+     String noteStr = null;
+
+     Matcher m;
+
+     // the check must be done on nameStr: typeStr still carries the leading 'Type: '
+     if(nameStr.startsWith("not to be indicated")){
+         // Special case:
+         // Type: not to be indicated (Art. H.9.1. Tokyo Code); stated parent genera: Hechtia Klotzsch; Deuterocohnia Mez
+         m = typeNameSpecialSplitPattern.matcher(nameStr);
+         if(m.matches()){
+             nameStr = m.group("name");
+             noteStr = m.group("note");
+             String agentStr = m.group("agent");
+             // TODO better import of agent?
+             if(agentStr != null && !agentStr.trim().isEmpty()){
+                 noteStr = noteStr + ": " + agentStr.trim();
+             }
+         }
+     } else {
+         // Generic case
+         m = typeNameBasionymPattern.matcher(nameStr);
+         if (m.find()) {
+             basionymNameStr = m.group("basionymName");
+             if (basionymNameStr != null) {
+                 // cut the whole '(Basionym: ...' part off the name string
+                 nameStr = nameStr.replace(m.group(0), "");
+             }
+         }
+
+         m = typeNameNotePattern.matcher(nameStr);
+         if (m.find()) {
+             noteStr = m.group(1);
+             if (noteStr != null) {
+                 // cut the whole '[...' part off the name string
+                 nameStr = nameStr.replace(m.group(0), "");
+             }
+         }
+     }
+
+     // cutting off the basionym and note parts can leave surrounding white space behind
+     nameStr = nameStr.trim();
+
+     BotanicalName typeName = (BotanicalName) nameParser.parseFullName(nameStr, NomenclaturalCode.ICNAFP, null);
+
+     if(typeName.isProtectedTitleCache() || typeName.getNomenclaturalReference() != null && typeName.getNomenclaturalReference().isProtectedTitleCache()) {
+         logger.warn(csvReportLine(regNumber, "NameType not parsable", typeStr, nameStr));
+     }
+
+     if(basionymNameStr != null){
+         // the basionym must be parsed from its own string, not from the type name string
+         BotanicalName basionymName = (BotanicalName) nameParser.parseFullName(basionymNameStr.trim(), NomenclaturalCode.ICNAFP, null);
+         getNameService().save(basionymName);
+         typeName.addBasionym(basionymName);
+     }
+
+     getNameService().save(typeName);
+
+     // annotate the designation which is actually attached to the taxonName,
+     // a detached NameTypeDesignation instance would be lost together with its note
+     NameTypeDesignation nameTypeDesignation = taxonName.addNameTypeDesignation(typeName, null, null, null, null, false);
+     if(noteStr != null){
+         nameTypeDesignation.addAnnotation(Annotation.NewInstance(noteStr, AnnotationType.EDITORIAL(), Language.UNKNOWN_LANGUAGE()));
+     }
+
+ }
+
+
/**
* Currently only parses the collector, fieldNumber and the collection date.
*
Matcher m1 = collectorPattern.matcher(fieldUnitStr);
if(m1.matches()){
- String collectionData = m1.group(1); // like (leg. Metzeltin, 30. 9. 1996)
- if(collectionData == null){
- collectionData = m1.group(2); // like leg. Metzeltin, 30. 9. 1996
+
+ String collectorData = m1.group(2); // like (leg. Metzeltin, 30. 9. 1996)
+ String removal = m1.group(1);
+ if(collectorData == null){
+ collectorData = m1.group(4); // like leg. Metzeltin, 30. 9. 1996
+ removal = m1.group(3);
}
- if(collectionData == null){
+ if(collectorData == null){
return null;
}
+ // the fieldUnitStr is parsable
+ // remove all collectorData from the fieldUnitStr and use the rest as locality
+ String locality = fieldUnitStr.replace(removal, "");
+
String collectorStr = null;
String detailStr = null;
Partial date = null;
String fieldNumber = null;
- Matcher m2 = collectionDataPattern.matcher(collectionData);
+ Matcher m2 = collectionDataPattern.matcher(collectorData);
if(m2.matches()){
collectorStr = m2.group("collector");
detailStr = m2.group("detail");
}
if(date == null && fieldNumber == null){
// detailed parsing not possible, so need fo fallback
- collectorStr = collectionData;
+ collectorStr = collectorData;
}
}
- if(collectorStr != null) {
- fieldUnit = FieldUnit.NewInstance();
- GatheringEvent ge = GatheringEvent.NewInstance();
+ if(collectorStr == null) {
+ collectorStr = collectorData;
+ }
- TeamOrPersonBase agent = state.getAgentBase(collectorStr);
- if(agent == null) {
- agent = Person.NewTitledInstance(collectorStr);
- getAgentService().save(agent);
- state.putAgentBase(collectorStr, agent);
- }
- ge.setCollector(agent);
+ fieldUnit = FieldUnit.NewInstance();
+ GatheringEvent ge = GatheringEvent.NewInstance();
+ ge.setLocality(LanguageString.NewInstance(locality, Language.UNKNOWN_LANGUAGE()));
- if(date != null){
- ge.setGatheringDate(date);
- }
+ TeamOrPersonBase agent = state.getAgentBase(collectorStr);
+ if(agent == null) {
+ agent = Person.NewTitledInstance(collectorStr);
+ getAgentService().save(agent);
+ state.putAgentBase(collectorStr, agent);
+ }
+ ge.setCollector(agent);
- getEventBaseService().save(ge);
- fieldUnit.setGatheringEvent(ge);
+ if(date != null){
+ ge.setGatheringDate(date);
+ }
- if(fieldNumber != null) {
- fieldUnit.setFieldNumber(fieldNumber);
- }
- getOccurrenceService().save(fieldUnit);
+ getEventBaseService().save(ge);
+ fieldUnit.setGatheringEvent(ge);
+
+ if(fieldNumber != null) {
+ fieldUnit.setFieldNumber(fieldNumber);
}
+ getOccurrenceService().save(fieldUnit);
+
}
return fieldUnit;
}
- private Partial parseDate(String regNumber, String dateStr) {
+ protected Partial parseDate(String regNumber, String dateStr) {
Partial pupDate = null;
boolean parseError = false;
* @param regNumber
* @return
*/
- private DerivedUnit parseSpecimenType(FieldUnit fieldUnit, TypesName typeName, Collection collection, String text, String regNumber) {
+ protected DerivedUnit parseSpecimenType(FieldUnit fieldUnit, TypesName typeName, Collection collection, String text, String regNumber) {
DerivedUnit specimen = null;
String collectionCode = null;
+ String collectionTitle = null;
String subCollectionStr = null;
String instituteStr = null;
String accessionNumber = null;
for (Pattern p : specimenTypePatterns) {
Matcher m = p.matcher(text);
if (m.matches()) {
- // collection code is mandatory
+ // collection code or collectionTitle is mandatory
try {
collectionCode = m.group("colCode");
} catch (IllegalArgumentException e){
// match group colCode not found
}
+
try {
- subCollectionStr = m.group("subCollection");
+ instituteStr = m.group("institute");
} catch (IllegalArgumentException e){
- // match group subCollection not found
+ // match group institute not found
}
+
try {
- instituteStr = m.group("institute");
+ subCollectionStr = m.group("subCollection");
} catch (IllegalArgumentException e){
- // match group col_name not found
+ // match group subCollection not found
}
try {
accessionNumber = m.group("accNumber");
for(String text : nameAnnotations.keySet()){
taxonName.addAnnotation(Annotation.NewInstance(text, nameAnnotations.get(text), Language.DEFAULT()));
}
- getNameService().save(taxonName);
}
+
+ taxonName.addSource(OriginalSourceType.Import, regNumber, null, state.getConfig().getSourceReference(), null);
+
+ getNameService().save(taxonName);
+
return taxonName;
}
collection.setSuperCollection(superCollection);
}
collectionMap.put(key, collection);
- getCollectionService().save(collection);
+ if(!_testMode) {
+ getCollectionService().save(collection);
+ }
}
return collection;
value = StringUtils.replace(value, "c$k", "č");
value = StringUtils.replace(value, " U$K", " Š");
+ value = StringUtils.replace(value, "O>U>!", "Ø");
+ value = StringUtils.replace(value, "o>!", "ø");
+ value = StringUtils.replace(value, "S$K", "Ŝ");
+ value = StringUtils.replace(value, ">l", "ğ");
+
+ value = StringUtils.replace(value, "§B>i", "ł");
+
+
+
return value;
}
((IAPTImportState)state).setCurrentTaxon(taxon);
+ logger.info("#of imported Genera: " + ((IAPTImportState) state).getGenusTaxonMap().size());
return;
}
} else if(name.matches("^Incertae sedis$|^No group assigned$")){
return rankFamilyIncertisSedis();
} else if(name.matches(".*phyta$|.*mycota$")){
- return Rank.SECTION_BOTANY();
+ return Rank.PHYLUM();
} else if(name.matches(".*phytina$|.*mycotina$")){
- return Rank.SUBSECTION_BOTANY();
+ return Rank.SUBPHYLUM();
} else if(name.matches("Gymnospermae$|.*ones$")){ // Monocotyledones, Dicotyledones
return rankUnrankedSupraGeneric();
} else if(name.matches(".*opsida$|.*phyceae$|.*mycetes$|.*ones$|^Musci$|^Hepaticae$")){