ref #6026 'Coll. Lange-Bertalot..' as titleCache of Collection
[cdmlib-apps.git] / app-import / src / main / java / eu / etaxonomy / cdm / io / iapt / IAPTExcelImport.java
1 /**
2 * Copyright (C) 2007 EDIT
3 * European Distributed Institute of Taxonomy
4 * http://www.e-taxonomy.eu
5 *
6 * The contents of this file are subject to the Mozilla Public License Version 1.1
7 * See LICENSE.TXT at the top of this package for the full license terms.
8 */
9
10 package eu.etaxonomy.cdm.io.iapt;
11
12 import eu.etaxonomy.cdm.api.facade.DerivedUnitFacade;
13 import eu.etaxonomy.cdm.api.service.pager.Pager;
14 import eu.etaxonomy.cdm.common.CdmUtils;
15 import eu.etaxonomy.cdm.io.mexico.SimpleExcelTaxonImport;
16 import eu.etaxonomy.cdm.io.mexico.SimpleExcelTaxonImportState;
17 import eu.etaxonomy.cdm.model.agent.Institution;
18 import eu.etaxonomy.cdm.model.agent.Person;
19 import eu.etaxonomy.cdm.model.agent.TeamOrPersonBase;
20 import eu.etaxonomy.cdm.model.common.*;
21 import eu.etaxonomy.cdm.model.name.*;
22 import eu.etaxonomy.cdm.model.occurrence.*;
23 import eu.etaxonomy.cdm.model.occurrence.Collection;
24 import eu.etaxonomy.cdm.model.reference.Reference;
25 import eu.etaxonomy.cdm.model.reference.ReferenceFactory;
26 import eu.etaxonomy.cdm.model.reference.ReferenceType;
27 import eu.etaxonomy.cdm.model.taxon.*;
28 import eu.etaxonomy.cdm.strategy.parser.NonViralNameParserImpl;
29 import eu.etaxonomy.cdm.strategy.parser.ParserProblem;
30 import org.apache.commons.lang.ArrayUtils;
31 import org.apache.commons.lang.StringEscapeUtils;
32 import org.apache.commons.lang.StringUtils;
33 import org.apache.log4j.Level;
34 import org.apache.log4j.Logger;
35 import org.joda.time.DateTimeFieldType;
36 import org.joda.time.Partial;
37 import org.joda.time.format.DateTimeFormat;
38 import org.joda.time.format.DateTimeFormatter;
39 import org.springframework.stereotype.Component;
40
41 import java.util.*;
42 import java.util.regex.Matcher;
43 import java.util.regex.Pattern;
44
45 /**
46 * @author a.mueller
47 * @created 05.01.2016
48 */
49
50 @Component("iAPTExcelImport")
51 public class IAPTExcelImport<CONFIG extends IAPTImportConfigurator> extends SimpleExcelTaxonImport<CONFIG> {
52 private static final long serialVersionUID = -747486709409732371L;
53 private static final Logger logger = Logger.getLogger(IAPTExcelImport.class);
54 public static final String ANNOTATION_MARKER_STRING = "[*]";
55
56
57 private static UUID ROOT_UUID = UUID.fromString("4137fd2a-20f6-4e70-80b9-f296daf51d82");
58
59 private static NonViralNameParserImpl nameParser = NonViralNameParserImpl.NewInstance();
60
61 private final static String REGISTRATIONNO_PK= "RegistrationNo_Pk";
62 private final static String HIGHERTAXON= "HigherTaxon";
63 private final static String FULLNAME= "FullName";
64 private final static String AUTHORSSPELLING= "AuthorsSpelling";
65 private final static String LITSTRING= "LitString";
66 private final static String REGISTRATION= "Registration";
67 private final static String TYPE= "Type";
68 private final static String CAVEATS= "Caveats";
69 private final static String FULLBASIONYM= "FullBasionym";
70 private final static String FULLSYNSUBST= "FullSynSubst";
71 private final static String NOTESTXT= "NotesTxt";
72 private final static String REGDATE= "RegDate";
73 private final static String NAMESTRING= "NameString";
74 private final static String BASIONYMSTRING= "BasionymString";
75 private final static String SYNSUBSTSTR= "SynSubstStr";
76 private final static String AUTHORSTRING= "AuthorString";
77
78 private static List<String> expectedKeys= Arrays.asList(new String[]{
79 REGISTRATIONNO_PK, HIGHERTAXON, FULLNAME, AUTHORSSPELLING, LITSTRING, REGISTRATION, TYPE, CAVEATS, FULLBASIONYM, FULLSYNSUBST, NOTESTXT, REGDATE, NAMESTRING, BASIONYMSTRING, SYNSUBSTSTR, AUTHORSTRING});
80
81 private static final Pattern nomRefTokenizeP = Pattern.compile("^(?<title>.*):\\s(?<detail>[^\\.:]+)\\.(?<date>.*?)(?:\\s\\((?<issue>[^\\)]*)\\)\\s*)?\\.?$");
82 private static final Pattern[] datePatterns = new Pattern[]{
83 // NOTE:
84 // The order of the patterns is extremely important!!!
85 //
86 // all patterns cover the years 1700 - 1999
87 Pattern.compile("^(?<year>1[7,8,9][0-9]{2})$"), // only year, like '1969'
88 Pattern.compile("^(?<monthName>\\p{L}+\\.?)\\s(?<day>[0-9]{1,2})(?:st|rd|th)?\\.?,?\\s(?<year>(?:1[7,8,9])?[0-9]{2})$"), // full date like April 12, 1969 or april 12th 1999
89 Pattern.compile("^(?<monthName>\\p{L}+\\.?),?\\s?(?<year>(?:1[7,8,9])?[0-9]{2})$"), // April 99 or April, 1999 or Apr. 12
90 Pattern.compile("^(?<day>[0-9]{1,2})([\\.\\-/])(\\s?)(?<month>[0-1]?[0-9])\\2\\3(?<year>(?:1[7,8,9])?[0-9]{2})$"), // full date like 12.04.1969 or 12. 04. 1969 or 12/04/1969 or 12-04-1969
91 Pattern.compile("^(?<day>[0-9]{1,2})([\\.\\-/])(?<month>[IVX]{1,2})\\2(?<year>(?:1[7,8,9])?[0-9]{2})$"), // full date like 12-VI-1969
92 Pattern.compile("^(?:(?<day>[0-9]{1,2})(?:\\sde)\\s)(?<monthName>\\p{L}+)\\sde\\s(?<year>(?:1[7,8,9])?[0-9]{2})$"), // full and partial date like 12 de Enero de 1999 or Enero de 1999
93 Pattern.compile("^(?<month>[0-1]?[0-9])([\\.\\-/])(?<year>(?:1[7,8,9])?[0-9]{2})$"), // partial date like 04.1969 or 04/1969 or 04-1969
94 Pattern.compile("^(?<year>(?:1[7,8,9])?[0-9]{2})([\\.\\-/])(?<month>[0-1]?[0-9])$"),// partial date like 1999-04
95 Pattern.compile("^(?<month>[IVX]{1,2})([\\.\\-/])(?<year>(?:1[7,8,9])?[0-9]{2})$"), // partial date like VI-1969
96 Pattern.compile("^(?<day>[0-9]{1,2})(?:[\\./]|th|rd|st)?\\s(?<monthName>\\p{L}+\\.?),?\\s?(?<year>(?:1[7,8,9])?[0-9]{2})$"), // full date like 12. April 1969 or april 1999 or 22 Dec.1999
97 };
98 private static final Pattern typeSpecimenSplitPattern = Pattern.compile("^(?:\"*[Tt]ype: (?<fieldUnit>.*?))(?:[Hh]olotype:(?<holotype>.*?)\\.?)?(?:[Ii]sotype[^:]*:(?<isotype>.*)\\.?)?\\.?$");
99
100 private static final Pattern typeNameBasionymPattern = Pattern.compile("\\([Bb]asionym\\s?\\:\\s?(?<basionymName>[^\\)]*).*$");
101 private static final Pattern typeNameNotePattern = Pattern.compile("\\[([^\\[]*)"); // matches the inner of '[...]'
102 private static final Pattern typeNameSpecialSplitPattern = Pattern.compile("(?<note>.*\\;.*?)\\:(?<agent>)\\;(<name>.*)");
103
104 private static final Pattern collectorPattern = Pattern.compile(".*?(?<fullStr1>\\(leg\\.\\s+(?<data1>[^\\)]*)\\))|.*?(?<fullStr2>\\sleg\\.\\s+(?<data2>.*?)\\.?)$");
105 private static final Pattern collectionDataPattern = Pattern.compile("^(?<collector>[^,]*),\\s?(?<detail>.*?)\\.?$");
106 private static final Pattern collectorsNumber = Pattern.compile("^([nN]o\\.\\s.*)$");
107
108 // AccessionNumbers: , #.*, n°:?, 96/3293, No..*, -?\w{1,3}-[0-9\-/]*
109 private static final Pattern accessionNumberOnlyPattern = Pattern.compile("^(?<accNumber>(?:n°\\:?\\s?|#|No\\.?\\s?)?[\\d\\w\\-/]*)$");
110
111 private static final Pattern[] specimenTypePatterns = new Pattern[]{
112 Pattern.compile("^(?<colCode>[A-Z]+|CPC Micropaleontology Lab\\.?)\\s+(?:\\((?<institute>.*[^\\)])\\))(?<accNumber>.*)?$"), // like: GAUF (Gansu Agricultural University) No. 1207-1222
113 Pattern.compile("^(?<colCode>[A-Z]+|CPC Micropaleontology Lab\\.?)\\s+(?:Coll\\.\\s(?<subCollection>[^\\.,;]*)(.))(?<accNumber>.*)?$"), // like KASSEL Coll. Krasske, Praep. DII 78
114 Pattern.compile("^(?<Collection>:Coll\\.\\s.*?)\\s(?<accNumber>Praep\\..*)?$"), // like Coll. Lange-Bertalot, Bot. Inst., Univ. Frankfurt/Main, Germany Praep. Neukaledonien OTL 62
115 Pattern.compile("^(?<colCode>[A-Z]+)(?:\\s+(?<accNumber>.*))?$"), // identifies the Collection code and takes the rest as accessionNumber if any
116 };
117
118 private static Map<String, Integer> monthFromNameMap = new HashMap<>();
119
120 static {
121 String[] ck = new String[]{"leden", "únor", "březen", "duben", "květen", "červen", "červenec ", "srpen", "září", "říjen", "listopad", "prosinec"};
122 String[] fr = new String[]{"janvier", "février", "mars", "avril", "mai", "juin", "juillet", "août", "septembre", "octobre", "novembre", "décembre"};
123 String[] de = new String[]{"januar", "februar", "märz", "april", "mai", "juni", "juli", "august", "september", "oktober", "november", "dezember"};
124 String[] en = new String[]{"january", "february", "march", "april", "may", "june", "july", "august", "september", "october", "november", "december"};
125 String[] it = new String[]{"gennaio", "febbraio", "marzo", "aprile", "maggio", "giugno", "luglio", "agosto", "settembre", "ottobre", "novembre", "dicembre"};
126 String[] sp = new String[]{"enero", "febrero", "marzo", "abril", "mayo", "junio", "julio", "agosto", "septiembre", "octubre", "noviembre", "diciembre"};
127 String[] de_abbrev = new String[]{"jan.", "feb.", "märz", "apr.", "mai", "jun.", "jul.", "aug.", "sept.", "okt.", "nov.", "dez."};
128 String[] en_abbrev = new String[]{"jan.", "feb.", "mar.", "apr.", "may", "jun.", "jul.", "aug.", "sep.", "oct.", "nov.", "dec."};
129 String[] port = new String[]{"Janeiro", "Fevereiro", "Março", "Abril", "Maio", "Junho", "Julho", "Agosto", "Setembro", "Outubro", "Novembro", "Dezembro"};
130 String[] rom_num = new String[]{"i", "ii", "iii", "iv", "v", "vi", "vii", "viii", "ix", "x", "xi", "xii"};
131
132 String[][] perLang = new String[][]{ck, de, fr, en, it, sp, port, de_abbrev, en_abbrev, rom_num};
133
134 for (String[] months: perLang) {
135 for(int m = 1; m < 13; m++){
136 monthFromNameMap.put(months[m - 1].toLowerCase(), m);
137 }
138 }
139
140 // special cases
141 monthFromNameMap.put("mar", 3);
142 monthFromNameMap.put("dec", 12);
143 monthFromNameMap.put("Februari", 2);
144 }
145
146
147 DateTimeFormatter formatterYear = DateTimeFormat.forPattern("yyyy");
148
149 private Map<String, Collection> collectionMap = new HashMap<>();
150
151
152 enum TypesName {
153 fieldUnit, holotype, isotype;
154
155 public SpecimenTypeDesignationStatus status(){
156 switch (this) {
157 case holotype:
158 return SpecimenTypeDesignationStatus.HOLOTYPE();
159 case isotype:
160 return SpecimenTypeDesignationStatus.ISOTYPE();
161 default:
162 return null;
163 }
164 }
165 }
166
167 private MarkerType markerTypeFossil = null;
168 private Rank rankUnrankedSupraGeneric = null;
169 private Rank familyIncertisSedis = null;
170 private AnnotationType annotationTypeCaveats = null;
171
172 private Reference bookVariedadesTradicionales = null;
173
174 private Taxon makeTaxon(HashMap<String, String> record, SimpleExcelTaxonImportState<CONFIG> state,
175 TaxonNode higherTaxonNode, boolean isFossil) {
176
177 String regNumber = getValue(record, REGISTRATIONNO_PK, false);
178 String regStr = getValue(record, REGISTRATION, true);
179 String titleCacheStr = getValue(record, FULLNAME, true);
180 String nameStr = getValue(record, NAMESTRING, true);
181 String authorStr = getValue(record, AUTHORSTRING, true);
182 String nomRefStr = getValue(record, LITSTRING, true);
183 String authorsSpelling = getValue(record, AUTHORSSPELLING, true);
184 String notesTxt = getValue(record, NOTESTXT, true);
185 String caveats = getValue(record, CAVEATS, true);
186 String fullSynSubstStr = getValue(record, FULLSYNSUBST, true);
187 String fullBasionymStr = getValue(record, FULLBASIONYM, true);
188 String basionymNameStr = getValue(record, FULLBASIONYM, true);
189 String synSubstStr = getValue(record, SYNSUBSTSTR, true);
190 String typeStr = getValue(record, TYPE, true);
191
192
193 String nomRefTitle = null;
194 String nomRefDetail;
195 String nomRefPupDate = null;
196 String nomRefIssue = null;
197 Partial pupDate = null;
198
199 boolean restoreOriginalReference = false;
200 boolean nameIsValid = true;
201
202 // preprocess nomRef: separate citation, reference detail, publishing date
203 if(!StringUtils.isEmpty(nomRefStr)){
204 nomRefStr = nomRefStr.trim();
205
206 // handle the special case which is hard to parse:
207 //
208 // Las variedades tradicionales de frutales de la Cuenca del Río Segura. Catálogo Etnobotánico (1): Frutos secos, oleaginosos, frutales de hueso, almendros y frutales de pepita: 154. 1997.
209 if(nomRefStr.startsWith("Las variedades tradicionales de frutales ")){
210
211 if(bookVariedadesTradicionales == null){
212 bookVariedadesTradicionales = ReferenceFactory.newBook();
213 bookVariedadesTradicionales.setTitle("Las variedades tradicionales de frutales de la Cuenca del Río Segura. Catálogo Etnobotánico (1): Frutos secos, oleaginosos, frutales de hueso, almendros y frutales de pepita");
214 bookVariedadesTradicionales.setDatePublished(TimePeriod.NewInstance(1997));
215 getReferenceService().save(bookVariedadesTradicionales);
216 }
217 nomRefStr = nomRefStr.replaceAll("^.*?\\:.*?\\:", "Las variedades tradicionales:");
218 restoreOriginalReference = true;
219 }
220
221 Matcher m = nomRefTokenizeP.matcher(nomRefStr);
222 if(m.matches()){
223 nomRefTitle = m.group("title");
224 nomRefDetail = m.group("detail");
225 nomRefPupDate = m.group("date").trim();
226 nomRefIssue = m.group("issue");
227
228 pupDate = parseDate(regNumber, nomRefPupDate);
229 if (pupDate != null) {
230 nomRefTitle = nomRefTitle + ": " + nomRefDetail + ". " + pupDate.toString(formatterYear) + ".";
231 } else {
232 logger.warn(csvReportLine(regNumber, "Pub date", nomRefPupDate, "in", nomRefStr, "not parsable"));
233 }
234 } else {
235 nomRefTitle = nomRefStr;
236 }
237 }
238
239 BotanicalName taxonName = makeBotanicalName(state, regNumber, titleCacheStr, nameStr, authorStr, nomRefTitle);
240
241 // always add the original strings of parsed data as annotation
242 taxonName.addAnnotation(Annotation.NewInstance("imported and parsed data strings:" +
243 "\n - '" + LITSTRING + "': "+ nomRefStr +
244 "\n - '" + TYPE + "': " + typeStr +
245 "\n - '" + REGISTRATION + "': " + regStr
246 , AnnotationType.TECHNICAL(), Language.DEFAULT()));
247
248 if(restoreOriginalReference){
249 taxonName.setNomenclaturalReference(bookVariedadesTradicionales);
250 }
251 if(pupDate != null) {
252 taxonName.getNomenclaturalReference().setDatePublished(TimePeriod.NewInstance(pupDate));
253 }
254 if(nomRefIssue != null) {
255 ((Reference)taxonName.getNomenclaturalReference()).setVolume(nomRefIssue);
256 }
257
258
259 if(!StringUtils.isEmpty(notesTxt)){
260 notesTxt = notesTxt.replace("Notes: ", "").trim();
261 taxonName.addAnnotation(Annotation.NewInstance(notesTxt, AnnotationType.EDITORIAL(), Language.DEFAULT()));
262 nameIsValid = false;
263
264 }
265 if(!StringUtils.isEmpty(caveats)){
266 caveats = caveats.replace("Caveats: ", "").trim();
267 taxonName.addAnnotation(Annotation.NewInstance(caveats, annotationTypeCaveats(), Language.DEFAULT()));
268 nameIsValid = false;
269 }
270
271 if(nameIsValid){
272 // Status is always considered valid if no notes and cavets are set
273 taxonName.addStatus(NomenclaturalStatus.NewInstance(NomenclaturalStatusType.VALID()));
274 }
275
276 getNameService().save(taxonName);
277
278 // Namerelations
279 if(!StringUtils.isEmpty(authorsSpelling)){
280 authorsSpelling = authorsSpelling.replaceFirst("Author's spelling:", "").replaceAll("\"", "").trim();
281
282 String[] authorSpellingTokens = StringUtils.split(authorsSpelling, " ");
283 String[] nameStrTokens = StringUtils.split(nameStr, " ");
284
285 ArrayUtils.reverse(authorSpellingTokens);
286 ArrayUtils.reverse(nameStrTokens);
287
288 for (int i = 0; i < nameStrTokens.length; i++){
289 if(i < authorSpellingTokens.length){
290 nameStrTokens[i] = authorSpellingTokens[i];
291 }
292 }
293 ArrayUtils.reverse(nameStrTokens);
294
295 String misspelledNameStr = StringUtils.join (nameStrTokens, ' ');
296 // build the fullnameString of the misspelled name
297 misspelledNameStr = taxonName.getTitleCache().replace(nameStr, misspelledNameStr);
298
299 TaxonNameBase misspelledName = (BotanicalName) nameParser.parseReferencedName(misspelledNameStr, NomenclaturalCode.ICNAFP, null);
300 misspelledName.addRelationshipToName(taxonName, NameRelationshipType.MISSPELLING(), null);
301 getNameService().save(misspelledName);
302 }
303
304 // Replaced Synonyms
305 if(!StringUtils.isEmpty(fullSynSubstStr)){
306 fullSynSubstStr = fullSynSubstStr.replace("Syn. subst.: ", "");
307 BotanicalName replacedSynonymName = makeBotanicalName(state, regNumber, fullSynSubstStr, synSubstStr, null, null);
308 replacedSynonymName.addReplacedSynonym(taxonName, null, null, null);
309 getNameService().save(replacedSynonymName);
310 }
311
312 Reference sec = state.getConfig().getSecReference();
313 Taxon taxon = Taxon.NewInstance(taxonName, sec);
314
315 // Basionym
316 if(fullBasionymStr != null){
317 fullBasionymStr = fullBasionymStr.replaceAll("^\\w*:\\s", ""); // Strip off the leading 'Basionym: "
318 basionymNameStr = basionymNameStr.replaceAll("^\\w*:\\s", ""); // Strip off the leading 'Basionym: "
319 BotanicalName basionym = makeBotanicalName(state, regNumber, fullBasionymStr, basionymNameStr, null, null);
320 getNameService().save(basionym);
321 taxonName.addBasionym(basionym);
322
323 Synonym syn = Synonym.NewInstance(basionym, sec);
324 taxon.addSynonym(syn, SynonymRelationshipType.HOMOTYPIC_SYNONYM_OF());
325 getTaxonService().save(syn);
326 }
327
328 // Markers
329 if(isFossil){
330 taxon.addMarker(Marker.NewInstance(markerTypeFossil(), true));
331 }
332
333 // Types
334 if(!StringUtils.isEmpty(typeStr)){
335
336 if(taxonName.getRank().isSpecies() || taxonName.getRank().isLower(Rank.SPECIES())) {
337 makeSpecimenTypeData(typeStr, taxonName, regNumber, state);
338 } else {
339 makeNameTypeData(typeStr, taxonName, regNumber, state);
340 }
341 }
342
343 getTaxonService().save(taxon);
344
345 if(taxonName.getRank().equals(Rank.SPECIES()) || taxonName.getRank().isLower(Rank.SPECIES())){
346 // try to find the genus, it should have been imported already, Genera are coming first in the import file
347 Taxon genus = ((IAPTImportState)state).getGenusTaxonMap().get(taxonName.getGenusOrUninomial());
348 if(genus != null){
349 higherTaxonNode = genus.getTaxonNodes().iterator().next();
350 } else {
351 logger.info(csvReportLine(regNumber, "Parent genus not found for", nameStr));
352 }
353 }
354
355 if(higherTaxonNode != null){
356 higherTaxonNode.addChildTaxon(taxon, null, null);
357 getTaxonNodeService().save(higherTaxonNode);
358 }
359
360 if(taxonName.getRank().isGenus()){
361 ((IAPTImportState)state).getGenusTaxonMap().put(taxonName.getGenusOrUninomial(), taxon);
362 }
363
364 return taxon;
365 }
366
367 private void makeSpecimenTypeData(String typeStr, BotanicalName taxonName, String regNumber, SimpleExcelTaxonImportState<CONFIG> state) {
368
369 Matcher m = typeSpecimenSplitPattern.matcher(typeStr);
370
371 if(m.matches()){
372 String fieldUnitStr = m.group(TypesName.fieldUnit.name());
373 // boolean isFieldUnit = typeStr.matches(".*([°']|\\d+\\s?m\\s|\\d+\\s?km\\s).*"); // check for location or unit m, km // makes no sense!!!!
374 FieldUnit fieldUnit = parseFieldUnit(fieldUnitStr, regNumber, state);
375 if(fieldUnit == null) {
376 // create a field unit with only a titleCache using the fieldUnitStr substring
377 logger.warn(csvReportLine(regNumber, "Type: fieldUnitStr can not be parsed", fieldUnitStr));
378 fieldUnit = FieldUnit.NewInstance();
379 fieldUnit.setTitleCache(fieldUnitStr, true);
380 getOccurrenceService().save(fieldUnit);
381 }
382 getOccurrenceService().save(fieldUnit);
383
384 // all others ..
385 addSpecimenTypes(taxonName, fieldUnit, m.group(TypesName.holotype.name()), TypesName.holotype, false, regNumber);
386 addSpecimenTypes(taxonName, fieldUnit, m.group(TypesName.isotype.name()), TypesName.isotype, true, regNumber);
387
388 } else {
389 // create a field unit with only a titleCache using the full typeStr
390 FieldUnit fieldUnit = FieldUnit.NewInstance();
391 fieldUnit.setTitleCache(typeStr, true);
392 getOccurrenceService().save(fieldUnit);
393 logger.warn(csvReportLine(regNumber, "Type: field 'Type' can not be parsed", typeStr));
394 }
395 getNameService().save(taxonName);
396 }
397
398 private void makeNameTypeData(String typeStr, BotanicalName taxonName, String regNumber, SimpleExcelTaxonImportState<CONFIG> state) {
399
400 String nameStr = typeStr.replaceAll("^Type\\s?\\:\\s?", "");
401 if(nameStr.isEmpty()) {
402 return;
403 }
404
405 String basionymNameStr = null;
406 String noteStr = null;
407 String agentStr = null;
408
409 Matcher m;
410
411 if(typeStr.startsWith("not to be indicated")){
412 // Special case:
413 // Type: not to be indicated (Art. H.9.1. Tokyo Code); stated parent genera: Hechtia Klotzsch; Deuterocohnia Mez
414 // FIXME
415 m = typeNameSpecialSplitPattern.matcher(nameStr);
416 if(m.matches()){
417 nameStr = m.group("name");
418 noteStr = m.group("note");
419 agentStr = m.group("agent");
420 // TODO better import of agent?
421 if(agentStr != null){
422 noteStr = noteStr + ": " + agentStr;
423 }
424 }
425 } else {
426 // Generic case
427 m = typeNameBasionymPattern.matcher(nameStr);
428 if (m.find()) {
429 basionymNameStr = m.group("basionymName");
430 if (basionymNameStr != null) {
431 nameStr = nameStr.replace(m.group(0), "");
432 }
433 }
434
435 m = typeNameNotePattern.matcher(nameStr);
436 if (m.find()) {
437 noteStr = m.group(1);
438 if (noteStr != null) {
439 nameStr = nameStr.replace(m.group(0), "");
440 }
441 }
442 }
443
444 BotanicalName typeName = (BotanicalName) nameParser.parseFullName(nameStr, NomenclaturalCode.ICNAFP, null);
445
446 if(typeName.isProtectedTitleCache() || typeName.getNomenclaturalReference() != null && typeName.getNomenclaturalReference().isProtectedTitleCache()) {
447 logger.warn(csvReportLine(regNumber, "NameType not parsable", typeStr, nameStr));
448 }
449
450 if(basionymNameStr != null){
451 BotanicalName basionymName = (BotanicalName) nameParser.parseFullName(nameStr, NomenclaturalCode.ICNAFP, null);
452 getNameService().save(basionymName);
453 typeName.addBasionym(basionymName);
454 }
455
456
457 NameTypeDesignation nameTypeDesignation = NameTypeDesignation.NewInstance();
458 nameTypeDesignation.setTypeName(typeName);
459 getNameService().save(typeName);
460
461 if(noteStr != null){
462 nameTypeDesignation.addAnnotation(Annotation.NewInstance(noteStr, AnnotationType.EDITORIAL(), Language.UNKNOWN_LANGUAGE()));
463 }
464 taxonName.addNameTypeDesignation(typeName, null, null, null, null, false);
465
466 }
467
468 /**
469 * Currently only parses the collector, fieldNumber and the collection date.
470 *
471 * @param fieldUnitStr
472 * @param regNumber
473 * @param state
474 * @return null if the fieldUnitStr could not be parsed
475 */
476 private FieldUnit parseFieldUnit(String fieldUnitStr, String regNumber, SimpleExcelTaxonImportState<CONFIG> state) {
477
478 FieldUnit fieldUnit = null;
479
480 Matcher m1 = collectorPattern.matcher(fieldUnitStr);
481 if(m1.matches()){
482
483 String collectorData = m1.group(2); // like (leg. Metzeltin, 30. 9. 1996)
484 String removal = m1.group(1);
485 if(collectorData == null){
486 collectorData = m1.group(4); // like leg. Metzeltin, 30. 9. 1996
487 removal = m1.group(3);
488 }
489 if(collectorData == null){
490 return null;
491 }
492
493 // the fieldUnitStr is parsable
494 // remove all collectorData from the fieldUnitStr and use the rest as locality
495 String locality = fieldUnitStr.replace(removal, "");
496
497 String collectorStr = null;
498 String detailStr = null;
499 Partial date = null;
500 String fieldNumber = null;
501
502 Matcher m2 = collectionDataPattern.matcher(collectorData);
503 if(m2.matches()){
504 collectorStr = m2.group("collector");
505 detailStr = m2.group("detail");
506
507 // Try to make sense of the detailStr
508 if(detailStr != null){
509 detailStr = detailStr.trim();
510 // 1. try to parse as date
511 date = parseDate(regNumber, detailStr);
512 if(date == null){
513 // 2. try to parse as number
514 if(collectorsNumber.matcher(detailStr).matches()){
515 fieldNumber = detailStr;
516 }
517 }
518 }
519 if(date == null && fieldNumber == null){
520 // detailed parsing not possible, so need fo fallback
521 collectorStr = collectorData;
522 }
523 }
524
525 if(collectorStr == null) {
526 collectorStr = collectorData;
527 }
528
529 fieldUnit = FieldUnit.NewInstance();
530 GatheringEvent ge = GatheringEvent.NewInstance();
531 ge.setLocality(LanguageString.NewInstance(locality, Language.UNKNOWN_LANGUAGE()));
532
533 TeamOrPersonBase agent = state.getAgentBase(collectorStr);
534 if(agent == null) {
535 agent = Person.NewTitledInstance(collectorStr);
536 getAgentService().save(agent);
537 state.putAgentBase(collectorStr, agent);
538 }
539 ge.setCollector(agent);
540
541 if(date != null){
542 ge.setGatheringDate(date);
543 }
544
545 getEventBaseService().save(ge);
546 fieldUnit.setGatheringEvent(ge);
547
548 if(fieldNumber != null) {
549 fieldUnit.setFieldNumber(fieldNumber);
550 }
551 getOccurrenceService().save(fieldUnit);
552
553 }
554
555 return fieldUnit;
556 }
557
558 private Partial parseDate(String regNumber, String dateStr) {
559
560 Partial pupDate = null;
561 boolean parseError = false;
562
563 String day = null;
564 String month = null;
565 String monthName = null;
566 String year = null;
567
568 for(Pattern p : datePatterns){
569 Matcher m2 = p.matcher(dateStr);
570 if(m2.matches()){
571 try {
572 year = m2.group("year");
573 } catch (IllegalArgumentException e){
574 // named capture group not found
575 }
576 try {
577 month = m2.group("month");
578 } catch (IllegalArgumentException e){
579 // named capture group not found
580 }
581
582 try {
583 monthName = m2.group("monthName");
584 month = monthFromName(monthName, regNumber);
585 if(month == null){
586 parseError = true;
587 }
588 } catch (IllegalArgumentException e){
589 // named capture group not found
590 }
591 try {
592 day = m2.group("day");
593 } catch (IllegalArgumentException e){
594 // named capture group not found
595 }
596
597 if(year != null){
598 if (year.length() == 2) {
599 // it is an abbreviated year from the 19** years
600 year = "19" + year;
601 }
602 break;
603 } else {
604 parseError = true;
605 }
606 }
607 }
608 if(year == null){
609 parseError = true;
610 }
611 List<DateTimeFieldType> types = new ArrayList<>();
612 List<Integer> values = new ArrayList<>();
613 if(!parseError) {
614 types.add(DateTimeFieldType.year());
615 values.add(Integer.parseInt(year));
616 if (month != null) {
617 types.add(DateTimeFieldType.monthOfYear());
618 values.add(Integer.parseInt(month));
619 }
620 if (day != null) {
621 types.add(DateTimeFieldType.dayOfMonth());
622 values.add(Integer.parseInt(day));
623 }
624 pupDate = new Partial(types.toArray(new DateTimeFieldType[types.size()]), ArrayUtils.toPrimitive(values.toArray(new Integer[values.size()])));
625 }
626 return pupDate;
627 }
628
629 private String monthFromName(String monthName, String regNumber) {
630
631 Integer month = monthFromNameMap.get(monthName.toLowerCase());
632 if(month == null){
633 logger.warn(csvReportLine(regNumber, "Unknown month name", monthName));
634 return null;
635 } else {
636 return month.toString();
637 }
638 }
639
640
641 private void addSpecimenTypes(BotanicalName taxonName, FieldUnit fieldUnit, String typeStr, TypesName typeName, boolean multiple, String regNumber){
642
643 if(StringUtils.isEmpty(typeStr)){
644 return;
645 }
646 typeStr = typeStr.trim().replaceAll("\\.$", "");
647
648 Collection collection = null;
649 DerivedUnit specimen = null;
650
651 List<DerivedUnit> specimens = new ArrayList<>();
652 if(multiple){
653 String[] tokens = typeStr.split("\\s?,\\s?");
654 for (String t : tokens) {
655 // command to list all complex parsabel types:
656 // csvcut -t -c RegistrationNo_Pk,Type iapt.csv | csvgrep -c Type -m "Holotype" | egrep -o 'Holotype:\s([A-Z]*\s)[^.]*?'
657 // csvcut -t -c RegistrationNo_Pk,Type iapt.csv | csvgrep -c Type -m "Holotype" | egrep -o 'Isotype[^:]*:\s([A-Z]*\s)[^.]*?'
658
659 if(!t.isEmpty()){
660 // trying to parse the string
661 specimen = parseSpecimenType(fieldUnit, typeName, collection, t, regNumber);
662 if(specimen != null){
663 specimens.add(specimen);
664 } else {
665 // parsing was not successful make simple specimen
666 specimens.add(makeSpecimenType(fieldUnit, t));
667 }
668 }
669 }
670 } else {
671 specimen = parseSpecimenType(fieldUnit, typeName, collection, typeStr, regNumber);
672 if(specimen != null) {
673 specimens.add(specimen);
674 // remember current collection
675 collection = specimen.getCollection();
676 } else {
677 // parsing was not successful make simple specimen
678 specimens.add(makeSpecimenType(fieldUnit, typeStr));
679 }
680 }
681
682 for(DerivedUnit s : specimens){
683 taxonName.addSpecimenTypeDesignation(s, typeName.status(), null, null, null, false, true);
684 }
685 }
686
687 private DerivedUnit makeSpecimenType(FieldUnit fieldUnit, String titleCache) {
688 DerivedUnit specimen;DerivedUnitFacade facade = DerivedUnitFacade.NewInstance(SpecimenOrObservationType.PreservedSpecimen, fieldUnit);
689 facade.setTitleCache(titleCache.trim(), true);
690 specimen = facade.innerDerivedUnit();
691 return specimen;
692 }
693
694 /**
695 *
696 * @param fieldUnit
697 * @param typeName
698 * @param collection
699 * @param text
700 * @param regNumber
701 * @return
702 */
703 private DerivedUnit parseSpecimenType(FieldUnit fieldUnit, TypesName typeName, Collection collection, String text, String regNumber) {
704
705 DerivedUnit specimen = null;
706
707 String collectionCode = null;
708 String subCollectionStr = null;
709 String instituteStr = null;
710 String accessionNumber = null;
711
712 boolean unusualAccessionNumber = false;
713
714 text = text.trim();
715
716 // 1. For Isotypes often the accession number is noted alone if the
717 // preceeding entry has a collection code.
718 if(typeName .equals(TypesName.isotype) && collection != null){
719 Matcher m = accessionNumberOnlyPattern.matcher(text);
720 if(m.matches()){
721 try {
722 accessionNumber = m.group("accNumber");
723 specimen = makeSpecimenType(fieldUnit, collection, accessionNumber);
724 } catch (IllegalArgumentException e){
725 // match group acc_number not found
726 }
727 }
728 }
729
730 //2. try it the 'normal' way
731 if(specimen == null) {
732 for (Pattern p : specimenTypePatterns) {
733 Matcher m = p.matcher(text);
734 if (m.matches()) {
735 // collection code is mandatory
736 try {
737 collectionCode = m.group("colCode");
738 } catch (IllegalArgumentException e){
739 // match group colCode not found
740 }
741 try {
742 subCollectionStr = m.group("subCollection");
743 } catch (IllegalArgumentException e){
744 // match group subCollection not found
745 }
746 try {
747 instituteStr = m.group("institute");
748 } catch (IllegalArgumentException e){
749 // match group col_name not found
750 }
751 try {
752 accessionNumber = m.group("accNumber");
753
754 // try to improve the accessionNumber
755 if(accessionNumber!= null) {
756 accessionNumber = accessionNumber.trim();
757 Matcher m2 = accessionNumberOnlyPattern.matcher(accessionNumber);
758 String betterAccessionNumber = null;
759 if (m2.matches()) {
760 try {
761 betterAccessionNumber = m.group("accNumber");
762 } catch (IllegalArgumentException e) {
763 // match group acc_number not found
764 }
765 }
766 if (betterAccessionNumber != null) {
767 accessionNumber = betterAccessionNumber;
768 } else {
769 unusualAccessionNumber = true;
770 }
771 }
772
773 } catch (IllegalArgumentException e){
774 // match group acc_number not found
775 }
776
777 if(collectionCode == null && instituteStr == null){
778 logger.warn(csvReportLine(regNumber, "Type: neither 'collectionCode' nor 'institute' found in ", text));
779 continue;
780 }
781 collection = getCollection(collectionCode, instituteStr, subCollectionStr);
782 specimen = makeSpecimenType(fieldUnit, collection, accessionNumber);
783 break;
784 }
785 }
786 }
787 if(specimen == null) {
788 logger.warn(csvReportLine(regNumber, "Type: Could not parse specimen", typeName.name().toString(), text));
789 }
790 if(unusualAccessionNumber){
791 logger.warn(csvReportLine(regNumber, "Type: Unusual accession number", typeName.name().toString(), text, accessionNumber));
792 }
793 return specimen;
794 }
795
796 private DerivedUnit makeSpecimenType(FieldUnit fieldUnit, Collection collection, String accessionNumber) {
797
798 DerivedUnitFacade facade = DerivedUnitFacade.NewInstance(SpecimenOrObservationType.PreservedSpecimen, fieldUnit);
799 facade.setCollection(collection);
800 if(accessionNumber != null){
801 facade.setAccessionNumber(accessionNumber);
802 }
803 return facade.innerDerivedUnit();
804 }
805
806 private BotanicalName makeBotanicalName(SimpleExcelTaxonImportState<CONFIG> state, String regNumber, String titleCacheStr, String nameStr,
807 String authorStr, String nomRefTitle) {
808
809 BotanicalName taxonName;// cache field for the taxonName.titleCache
810 String taxonNameTitleCache = null;
811 Map<String, AnnotationType> nameAnnotations = new HashMap<>();
812
813 // TitleCache preprocessing
814 if(titleCacheStr.endsWith(ANNOTATION_MARKER_STRING) || (authorStr != null && authorStr.endsWith(ANNOTATION_MARKER_STRING))){
815 nameAnnotations.put("Author abbreviation not checked.", AnnotationType.EDITORIAL());
816 titleCacheStr = titleCacheStr.replace(ANNOTATION_MARKER_STRING, "").trim();
817 if(authorStr != null) {
818 authorStr = authorStr.replace(ANNOTATION_MARKER_STRING, "").trim();
819 }
820 }
821
822 // parse the full taxon name
823 if(!StringUtils.isEmpty(nomRefTitle)){
824 String referenceSeparator = nomRefTitle.startsWith("in ") ? " " : ", ";
825 String taxonFullNameStr = titleCacheStr + referenceSeparator + nomRefTitle;
826 logger.debug(":::::" + taxonFullNameStr);
827 taxonName = (BotanicalName) nameParser.parseReferencedName(taxonFullNameStr, NomenclaturalCode.ICNAFP, null);
828 } else {
829 taxonName = (BotanicalName) nameParser.parseFullName(titleCacheStr, NomenclaturalCode.ICNAFP, null);
830 }
831
832 taxonNameTitleCache = taxonName.getTitleCache().trim();
833 if (taxonName.isProtectedTitleCache()) {
834 logger.warn(csvReportLine(regNumber, "Name could not be parsed", titleCacheStr));
835 } else {
836
837 boolean doRestoreTitleCacheStr = false;
838
839 // Check if titleCache and nameCache are plausible
840 String titleCacheCompareStr = titleCacheStr;
841 String nameCache = taxonName.getNameCache();
842 String nameCompareStr = nameStr;
843 if(taxonName.isBinomHybrid()){
844 titleCacheCompareStr = titleCacheCompareStr.replace(" x ", " ×");
845 nameCompareStr = nameCompareStr.replace(" x ", " ×");
846 }
847 if(taxonName.isMonomHybrid()){
848 titleCacheCompareStr = titleCacheCompareStr.replaceAll("^X ", "× ");
849 nameCompareStr = nameCompareStr.replace("^X ", "× ");
850 }
851 if(authorStr != null && authorStr.contains(" et ")){
852 titleCacheCompareStr = titleCacheCompareStr.replaceAll(" et ", " & ");
853 }
854 if (!taxonNameTitleCache.equals(titleCacheCompareStr)) {
855 logger.warn(csvReportLine(regNumber, "The generated titleCache differs from the imported string", taxonNameTitleCache, " != ", titleCacheStr, " ==> original titleCacheStr has been restored"));
856 doRestoreTitleCacheStr = true;
857 }
858 if (!nameCache.trim().equals(nameCompareStr)) {
859 logger.warn(csvReportLine(regNumber, "The parsed nameCache differs from field '" + NAMESTRING + "'", nameCache, " != ", nameCompareStr));
860 }
861
862 // Author
863 //nameParser.handleAuthors(taxonName, titleCacheStr, authorStr);
864 //if (!titleCacheStr.equals(taxonName.getTitleCache())) {
865 // logger.warn(regNumber + ": titleCache has changed after setting authors, will restore original titleCacheStr");
866 // doRestoreTitleCacheStr = true;
867 //}
868
869 if(doRestoreTitleCacheStr){
870 taxonName.setTitleCache(titleCacheStr, true);
871 }
872
873 // deduplicate
874 replaceAuthorNamesAndNomRef(state, taxonName);
875 }
876
877 // Annotations
878 if(!nameAnnotations.isEmpty()){
879 for(String text : nameAnnotations.keySet()){
880 taxonName.addAnnotation(Annotation.NewInstance(text, nameAnnotations.get(text), Language.DEFAULT()));
881 }
882 getNameService().save(taxonName);
883 }
884 return taxonName;
885 }
886
887 /**
888 * @param state
889 * @return
890 */
891 private TaxonNode getClassificationRootNode(IAPTImportState state) {
892
893 // Classification classification = state.getClassification();
894 // if (classification == null){
895 // IAPTImportConfigurator config = state.getConfig();
896 // classification = Classification.NewInstance(state.getConfig().getClassificationName());
897 // classification.setUuid(config.getClassificationUuid());
898 // classification.setReference(config.getSecReference());
899 // classification = getClassificationService().find(state.getConfig().getClassificationUuid());
900 // }
901 TaxonNode rootNode = state.getRootNode();
902 if (rootNode == null){
903 rootNode = getTaxonNodeService().find(ROOT_UUID);
904 }
905 if (rootNode == null){
906 Classification classification = state.getClassification();
907 if (classification == null){
908 Reference sec = state.getSecReference();
909 String classificationName = state.getConfig().getClassificationName();
910 Language language = Language.DEFAULT();
911 classification = Classification.NewInstance(classificationName, sec, language);
912 state.setClassification(classification);
913 classification.setUuid(state.getConfig().getClassificationUuid());
914 classification.getRootNode().setUuid(ROOT_UUID);
915 getClassificationService().save(classification);
916 }
917 rootNode = classification.getRootNode();
918 state.setRootNode(rootNode);
919 }
920 return rootNode;
921 }
922
923 private Collection getCollection(String collectionCode, String instituteStr, String subCollectionStr){
924
925 Collection superCollection = null;
926 if(subCollectionStr != null){
927 superCollection = getCollection(collectionCode, instituteStr, null);
928 collectionCode = subCollectionStr;
929 instituteStr = null;
930 }
931
932 final String key = collectionCode + "-#i:" + StringUtils.defaultString(instituteStr);
933
934 Collection collection = collectionMap.get(key);
935
936 if(collection == null) {
937 collection = Collection.NewInstance();
938 collection.setCode(collectionCode);
939 if(instituteStr != null){
940 collection.setInstitute(Institution.NewNamedInstance(instituteStr));
941 }
942 if(superCollection != null){
943 collection.setSuperCollection(superCollection);
944 }
945 collectionMap.put(key, collection);
946 getCollectionService().save(collection);
947 }
948
949 return collection;
950 }
951
952
953 /**
954 * @param record
955 * @param originalKey
956 * @param doUnescapeHtmlEntities
957 * @return
958 */
959 private String getValue(HashMap<String, String> record, String originalKey, boolean doUnescapeHtmlEntities) {
960 String value = record.get(originalKey);
961
962 value = fixCharacters(value);
963
964 if (! StringUtils.isBlank(value)) {
965 if (logger.isDebugEnabled()) {
966 logger.debug(originalKey + ": " + value);
967 }
968 value = CdmUtils.removeDuplicateWhitespace(value.trim()).toString();
969 if(doUnescapeHtmlEntities){
970 value = StringEscapeUtils.unescapeHtml(value);
971 }
972 return value.trim();
973 }else{
974 return null;
975 }
976 }
977
978 /**
979 * Fixes broken characters.
980 * For details see
981 * http://dev.e-taxonomy.eu/redmine/issues/6035
982 *
983 * @param value
984 * @return
985 */
986 private String fixCharacters(String value) {
987
988 value = StringUtils.replace(value, "s$K", "š");
989 value = StringUtils.replace(value, "n$K", "ň");
990 value = StringUtils.replace(value, "e$K", "ě");
991 value = StringUtils.replace(value, "r$K", "ř");
992 value = StringUtils.replace(value, "c$K", "č");
993 value = StringUtils.replace(value, "z$K", "ž");
994 value = StringUtils.replace(value, "S>U$K", "Š");
995 value = StringUtils.replace(value, "C>U$K", "Č");
996 value = StringUtils.replace(value, "R>U$K", "Ř");
997 value = StringUtils.replace(value, "Z>U$K", "Ž");
998 value = StringUtils.replace(value, "g$K", "ǧ");
999 value = StringUtils.replace(value, "s$A", "ś");
1000 value = StringUtils.replace(value, "n$A", "ń");
1001 value = StringUtils.replace(value, "c$A", "ć");
1002 value = StringUtils.replace(value, "e$E", "ę");
1003 value = StringUtils.replace(value, "o$H", "õ");
1004 value = StringUtils.replace(value, "s$C", "ş");
1005 value = StringUtils.replace(value, "t$C", "ț");
1006 value = StringUtils.replace(value, "S>U$C", "Ş");
1007 value = StringUtils.replace(value, "a$O", "å");
1008 value = StringUtils.replace(value, "A>U$O", "Å");
1009 value = StringUtils.replace(value, "u$O", "ů");
1010 value = StringUtils.replace(value, "g$B", "ğ");
1011 value = StringUtils.replace(value, "g$B", "ĕ");
1012 value = StringUtils.replace(value, "a$B", "ă");
1013 value = StringUtils.replace(value, "l$/", "ł");
1014 value = StringUtils.replace(value, ">i", "ı");
1015 value = StringUtils.replace(value, "i$U", "ï");
1016 // Special-cases
1017 value = StringUtils.replace(value, "&yacute", "ý");
1018 value = StringUtils.replace(value, ">L", "Ł"); // corrected rule
1019 value = StringUtils.replace(value, "E>U$D", "З");
1020 value = StringUtils.replace(value, "S>U$E", "Ş");
1021 value = StringUtils.replace(value, "s$E", "ş");
1022
1023 value = StringUtils.replace(value, "c$k", "č");
1024 value = StringUtils.replace(value, " U$K", " Š");
1025
1026 value = StringUtils.replace(value, "B.O>U>!", "Ø");
1027 value = StringUtils.replace(value, "S$K", "Ŝ");
1028 value = StringUtils.replace(value, "§B>i", "ğ");
1029
1030
1031 return value;
1032 }
1033
1034
1035 /**
1036 * Stores taxa records in DB
1037 */
1038 @Override
1039 protected void firstPass(SimpleExcelTaxonImportState<CONFIG> state) {
1040
1041 String lineNumber = "L#" + state.getCurrentLine() + ": ";
1042 logger.setLevel(Level.DEBUG);
1043 HashMap<String, String> record = state.getOriginalRecord();
1044 logger.debug(lineNumber + record.toString());
1045
1046 Set<String> keys = record.keySet();
1047 for (String key: keys) {
1048 if (! expectedKeys.contains(key)){
1049 logger.warn(lineNumber + "Unexpected Key: " + key);
1050 }
1051 }
1052
1053 String reg_id = record.get(REGISTRATIONNO_PK);
1054
1055 //higherTaxon
1056 String higherTaxaString = record.get(HIGHERTAXON);
1057 boolean isFossil = false;
1058 if(higherTaxaString.startsWith("FOSSIL ")){
1059 higherTaxaString = higherTaxaString.replace("FOSSIL ", "");
1060 isFossil = true;
1061 }
1062 TaxonNode higherTaxon = getHigherTaxon(higherTaxaString, (IAPTImportState)state);
1063
1064 //Taxon
1065 Taxon taxon = makeTaxon(record, state, higherTaxon, isFossil);
1066 if (taxon == null){
1067 logger.warn(lineNumber + "taxon could not be created and is null");
1068 return;
1069 }
1070 ((IAPTImportState)state).setCurrentTaxon(taxon);
1071
1072
1073 logger.info("#of imported Genera: " + ((IAPTImportState) state).getGenusTaxonMap().size());
1074 return;
1075 }
1076
1077 private TaxonNode getHigherTaxon(String higherTaxaString, IAPTImportState state) {
1078 String[] higherTaxaNames = higherTaxaString.toLowerCase().replaceAll("[\\[\\]]", "").split(":");
1079 TaxonNode higherTaxonNode = null;
1080
1081 ITaxonTreeNode rootNode = getClassificationRootNode(state);
1082 for (String htn : higherTaxaNames) {
1083 htn = StringUtils.capitalize(htn.trim());
1084 Taxon higherTaxon = state.getHigherTaxon(htn);
1085 if (higherTaxon != null){
1086 higherTaxonNode = higherTaxon.getTaxonNodes().iterator().next();
1087 }else{
1088 BotanicalName name = makeHigherTaxonName(state, htn);
1089 Reference sec = state.getSecReference();
1090 higherTaxon = Taxon.NewInstance(name, sec);
1091 getTaxonService().save(higherTaxon);
1092 higherTaxonNode = rootNode.addChildTaxon(higherTaxon, sec, null);
1093 state.putHigherTaxon(htn, higherTaxon);
1094 getClassificationService().saveTreeNode(higherTaxonNode);
1095 }
1096 rootNode = higherTaxonNode;
1097 }
1098 return higherTaxonNode;
1099 }
1100
1101 private BotanicalName makeHigherTaxonName(IAPTImportState state, String name) {
1102
1103 Rank rank = guessRank(name);
1104
1105 BotanicalName taxonName = BotanicalName.NewInstance(rank);
1106 taxonName.addSource(makeOriginalSource(state));
1107 taxonName.setGenusOrUninomial(StringUtils.capitalize(name));
1108 return taxonName;
1109 }
1110
1111 private Rank guessRank(String name) {
1112
1113 // normalize
1114 name = name.replaceAll("\\(.*\\)", "").trim();
1115
1116 if(name.matches("^Plantae$|^Fungi$")){
1117 return Rank.KINGDOM();
1118 } else if(name.matches("^Incertae sedis$|^No group assigned$")){
1119 return rankFamilyIncertisSedis();
1120 } else if(name.matches(".*phyta$|.*mycota$")){
1121 return Rank.PHYLUM();
1122 } else if(name.matches(".*phytina$|.*mycotina$")){
1123 return Rank.SUBPHYLUM();
1124 } else if(name.matches("Gymnospermae$|.*ones$")){ // Monocotyledones, Dicotyledones
1125 return rankUnrankedSupraGeneric();
1126 } else if(name.matches(".*opsida$|.*phyceae$|.*mycetes$|.*ones$|^Musci$|^Hepaticae$")){
1127 return Rank.CLASS();
1128 } else if(name.matches(".*idae$|.*phycidae$|.*mycetidae$")){
1129 return Rank.SUBCLASS();
1130 } else if(name.matches(".*ales$")){
1131 return Rank.ORDER();
1132 } else if(name.matches(".*ineae$")){
1133 return Rank.SUBORDER();
1134 } else if(name.matches(".*aceae$")){
1135 return Rank.FAMILY();
1136 } else if(name.matches(".*oideae$")){
1137 return Rank.SUBFAMILY();
1138 } else
1139 // if(name.matches(".*eae$")){
1140 // return Rank.TRIBE();
1141 // } else
1142 if(name.matches(".*inae$")){
1143 return Rank.SUBTRIBE();
1144 } else if(name.matches(".*ae$")){
1145 return Rank.FAMILY();
1146 }
1147 return Rank.UNKNOWN_RANK();
1148 }
1149
1150 private Rank rankUnrankedSupraGeneric() {
1151
1152 if(rankUnrankedSupraGeneric == null){
1153 rankUnrankedSupraGeneric = Rank.NewInstance(RankClass.Suprageneric, "Unranked supra generic", " ", " ");
1154 getTermService().save(rankUnrankedSupraGeneric);
1155 }
1156 return rankUnrankedSupraGeneric;
1157 }
1158
1159 private Rank rankFamilyIncertisSedis() {
1160
1161 if(familyIncertisSedis == null){
1162 familyIncertisSedis = Rank.NewInstance(RankClass.Suprageneric, "Family incertis sedis", " ", " ");
1163 getTermService().save(familyIncertisSedis);
1164 }
1165 return familyIncertisSedis;
1166 }
1167
1168 private AnnotationType annotationTypeCaveats(){
1169 if(annotationTypeCaveats == null){
1170 annotationTypeCaveats = AnnotationType.NewInstance("Caveats", "Caveats", "");
1171 getTermService().save(annotationTypeCaveats);
1172 }
1173 return annotationTypeCaveats;
1174 }
1175
1176
1177 /**
1178 * @param state
1179 * @return
1180 */
1181 private IdentifiableSource makeOriginalSource(IAPTImportState state) {
1182 return IdentifiableSource.NewDataImportInstance("line: " + state.getCurrentLine(), null, state.getConfig().getSourceReference());
1183 }
1184
1185
1186 private Reference makeReference(IAPTImportState state, UUID uuidRef) {
1187 Reference ref = state.getReference(uuidRef);
1188 if (ref == null){
1189 ref = getReferenceService().find(uuidRef);
1190 state.putReference(uuidRef, ref);
1191 }
1192 return ref;
1193 }
1194
1195 private MarkerType markerTypeFossil(){
1196 if(this.markerTypeFossil == null){
1197 markerTypeFossil = MarkerType.NewInstance("isFossilTaxon", "isFossil", null);
1198 getTermService().save(this.markerTypeFossil);
1199 }
1200 return markerTypeFossil;
1201 }
1202
1203 private String csvReportLine(String regId, String message, String ... fields){
1204 StringBuilder out = new StringBuilder("regID#");
1205 out.append(regId).append(",\"").append(message).append('"');
1206
1207 for(String f : fields){
1208 out.append(",\"").append(f).append('"');
1209 }
1210 return out.toString();
1211 }
1212
1213
1214 }