cleanup
[cdmlib.git] / cdmlib-io / src / main / java / eu / etaxonomy / cdm / io / dwca / in / DwcTaxonStreamItem2CdmTaxonConverter.java
1 /**
2 * Copyright (C) 2009 EDIT
3 * European Distributed Institute of Taxonomy
4 * http://www.e-taxonomy.eu
5 *
6 * The contents of this file are subject to the Mozilla Public License Version 1.1
7 * See LICENSE.TXT at the top of this package for the full license terms.
8 */
9 package eu.etaxonomy.cdm.io.dwca.in;
10
11 import java.net.URI;
12 import java.util.ArrayList;
13 import java.util.HashSet;
14 import java.util.List;
15 import java.util.Map;
16 import java.util.Set;
17 import java.util.UUID;
18
19 import org.apache.commons.lang.StringUtils;
20 import org.apache.log4j.Logger;
21
22 import com.ibm.lsid.MalformedLSIDException;
23
24 import eu.etaxonomy.cdm.common.CdmUtils;
25 import eu.etaxonomy.cdm.io.common.mapping.UndefinedTransformerMethodException;
26 import eu.etaxonomy.cdm.io.stream.IPartitionableConverter;
27 import eu.etaxonomy.cdm.io.stream.IReader;
28 import eu.etaxonomy.cdm.io.stream.ItemFilter;
29 import eu.etaxonomy.cdm.io.stream.ListReader;
30 import eu.etaxonomy.cdm.io.stream.MappedCdmBase;
31 import eu.etaxonomy.cdm.io.stream.PartitionableConverterBase;
32 import eu.etaxonomy.cdm.io.stream.StreamImportBase;
33 import eu.etaxonomy.cdm.io.stream.StreamImportStateBase;
34 import eu.etaxonomy.cdm.io.stream.StreamItem;
35 import eu.etaxonomy.cdm.io.stream.terms.TermUri;
36 import eu.etaxonomy.cdm.model.common.Annotation;
37 import eu.etaxonomy.cdm.model.common.CdmBase;
38 import eu.etaxonomy.cdm.model.common.Extension;
39 import eu.etaxonomy.cdm.model.common.ExtensionType;
40 import eu.etaxonomy.cdm.model.common.IdentifiableSource;
41 import eu.etaxonomy.cdm.model.common.Identifier;
42 import eu.etaxonomy.cdm.model.common.LSID;
43 import eu.etaxonomy.cdm.model.common.Language;
44 import eu.etaxonomy.cdm.model.common.Marker;
45 import eu.etaxonomy.cdm.model.common.MarkerType;
46 import eu.etaxonomy.cdm.model.description.CommonTaxonName;
47 import eu.etaxonomy.cdm.model.description.Distribution;
48 import eu.etaxonomy.cdm.model.description.PresenceAbsenceTerm;
49 import eu.etaxonomy.cdm.model.description.TaxonDescription;
50 import eu.etaxonomy.cdm.model.location.NamedArea;
51 import eu.etaxonomy.cdm.model.name.NomenclaturalCode;
52 import eu.etaxonomy.cdm.model.name.Rank;
53 import eu.etaxonomy.cdm.model.name.TaxonName;
54 import eu.etaxonomy.cdm.model.reference.OriginalSourceType;
55 import eu.etaxonomy.cdm.model.reference.Reference;
56 import eu.etaxonomy.cdm.model.reference.ReferenceFactory;
57 import eu.etaxonomy.cdm.model.taxon.Classification;
58 import eu.etaxonomy.cdm.model.taxon.Synonym;
59 import eu.etaxonomy.cdm.model.taxon.Taxon;
60 import eu.etaxonomy.cdm.model.taxon.TaxonBase;
61 import eu.etaxonomy.cdm.model.term.DefinedTerm;
62 import eu.etaxonomy.cdm.model.term.DefinedTermBase;
63 import eu.etaxonomy.cdm.strategy.exceptions.StringNotParsableException;
64 import eu.etaxonomy.cdm.strategy.exceptions.UnknownCdmTypeException;
65 import eu.etaxonomy.cdm.strategy.parser.NonViralNameParserImpl;
66
67 /**
68 * @author a.mueller
69 * @since 22.11.2011
70 */
71 public class DwcTaxonStreamItem2CdmTaxonConverter<CONFIG extends DwcaDataImportConfiguratorBase, STATE extends StreamImportStateBase<CONFIG, StreamImportBase>>
72 extends PartitionableConverterBase<CONFIG, STATE>
73 implements IPartitionableConverter<StreamItem, IReader<CdmBase>, String>, ItemFilter<StreamItem> {
74
// Logger of this converter.
private static final Logger logger = Logger.getLogger(DwcTaxonStreamItem2CdmTaxonConverter.class);

// If this converter is used as a filter only, item parts must not be removed
// from the item during evaluation (checked in removeItemInfo).
boolean isFilterOnly = false;

// Key under which the record id is read from a stream item.
private static final String ID = "id";
// Temporary key for the case that no dataset information is supplied. TODO use something better
public static final String NO_DATASET = "no_dataset_jli773oebhjklw";

// Parser used for full scientific names and for author strings.
private final NonViralNameParserImpl parser = NonViralNameParserImpl.NewInstance();
85
/**
 * Creates a converter that is not flagged as filter-only.
 *
 * @param state the import state handed to the base converter
 */
public DwcTaxonStreamItem2CdmTaxonConverter(STATE state) {
    this(state, false);
}

/**
 * Creates a converter.
 *
 * @param state the import state handed to the base converter
 * @param isFilter value stored in {@link #isFilterOnly}; if <code>true</code>
 *        item parts are not removed while items are evaluated
 */
public DwcTaxonStreamItem2CdmTaxonConverter(STATE state, boolean isFilter) {
    super(state);
    this.isFilterOnly = isFilter;
}
94
95 @Override
96 public boolean toBeRemovedFromStream(StreamItem item) {
97 if (!config.isDoSplitRelationshipImport()){
98 return false;
99 }else{
100 if (isSynonym(item)){
101 return ! this.config.isDoSynonymRelationships();
102 }else{
103 NomenclaturalCode nomCode = getNomCode(item);
104 Rank rank = getRank(item, nomCode);
105 boolean isHigherRank = rank == null || rank.isHigher(Rank.SPECIES());
106 if (isHigherRank){
107 return ! config.isDoHigherRankRelationships();
108 }else{
109 return ! config.isDoLowerRankRelationships();
110 }
111 }
112 }
113 }
114
115 private boolean isSynonym(StreamItem item) {
116 TaxonBase<?> taxonBase = getTaxonBase(item);
117 return taxonBase instanceof Synonym;
118 }
119
120 @Override
121 public IReader<MappedCdmBase<? extends CdmBase>> map(StreamItem csvTaxonRecord){
122 List<MappedCdmBase<? extends CdmBase>> resultList = new ArrayList<>();
123
124 //TODO what if not transactional?
125 Reference sourceReference = state.getTransactionalSourceReference();
126 String sourceReferenceDetail = null;
127
128 //taxon
129 TaxonBase<?> taxonBase = getTaxonBase(csvTaxonRecord);
130 MappedCdmBase<TaxonBase<?>> mcb = new MappedCdmBase<>(csvTaxonRecord.term, csvTaxonRecord.get(ID), taxonBase);
131 resultList.add(mcb);
132
133 //original source
134 String id = csvTaxonRecord.get(ID);
135 IdentifiableSource source = taxonBase.addSource(OriginalSourceType.Import, id, "Taxon", sourceReference, sourceReferenceDetail);
136 MappedCdmBase<IdentifiableSource> mappedSource = new MappedCdmBase<>(csvTaxonRecord.get(ID), source);
137 resultList.add(mappedSource);
138 csvTaxonRecord.remove(ID);
139
140 //rank
141 NomenclaturalCode nomCode = getNomCode(csvTaxonRecord);
142 Rank rank = getRank(csvTaxonRecord, nomCode);
143
144 //name && name published in
145 TaxonName name = getScientificName(csvTaxonRecord, nomCode, rank, resultList, sourceReference);
146 taxonBase.setName(name);
147
148 //nameAccordingTo
149 MappedCdmBase<Reference> sec = getNameAccordingTo(csvTaxonRecord, resultList);
150
151 if (sec == null && state.getConfig().isUseSourceReferenceAsSec()){
152 sec = new MappedCdmBase<>(state.getTransactionalSourceReference());
153 }
154 if (sec != null){
155 taxonBase.setSec(sec.getCdmBase());
156 }
157
158 //classification
159 handleDataset(csvTaxonRecord, taxonBase, resultList, sourceReference, sourceReferenceDetail);
160
161 //NON core
162 //term="http://purl.org/dc/terms/identifier"
163 //currently only LSIDs or generic
164 handleIdentifier(csvTaxonRecord, taxonBase);
165
166 //TaxonRemarks
167 handleTaxonRemarks(csvTaxonRecord, taxonBase);
168
169 //TDWG_1
170 handleTdwgArea(csvTaxonRecord, taxonBase);
171
172 //VernecularName
173 handleCommonNames(csvTaxonRecord, taxonBase);
174
175 //External Sources, ID's and References
176 handleIdentifiableObjects(csvTaxonRecord, taxonBase);
177
178
179 // <!-- Top level group; listed as kingdom but may be interpreted as domain or superkingdom
180 // The following eight groups are recognized: Animalia, Archaea, Bacteria, Chromista,
181 // Fungi, Plantae, Protozoa, Viruses -->
182 // <field index='10' term='http://rs.tdwg.org/dwc/terms/kingdom'/>
183
184 // <!-- Phylum in which the taxon has been classified -->
185 // <field index='11' term='http://rs.tdwg.org/dwc/terms/phylum'/>
186
187 // <!-- Class in which the taxon has been classified -->
188 // <field index='12' term='http://rs.tdwg.org/dwc/terms/class'/>
189
190 // <!-- Order in which the taxon has been classified -->
191 // <field index='13' term='http://rs.tdwg.org/dwc/terms/order'/>
192
193 // <!-- Family in which the taxon has been classified -->
194 // <field index='14' term='http://rs.tdwg.org/dwc/terms/family'/>
195
196 // <!-- Genus in which the taxon has been classified -->
197 // <field index='15' term='http://rs.tdwg.org/dwc/terms/genus'/>
198
199 // <!-- Subgenus in which the taxon has been classified -->
200 // <field index='16' term='http://rs.tdwg.org/dwc/terms/subgenus'/>
201 // <!-- Specific epithet; for hybrids, the multiplication symbol is included in the epithet -->
202
203 // <field index='17' term='http://rs.tdwg.org/dwc/terms/specificEpithet'/>
204 // <!-- Infraspecific epithet -->
205
206 // <field index='18' term='http://rs.tdwg.org/dwc/terms/infraspecificEpithet'/>
207 // <!-- Authorship -->
208
209 // <field index='19' term='http://rs.tdwg.org/dwc/terms/scientificNameAuthorship'/>
210 // ==> see scientific name
211 //
212 // <!-- Acceptance status published in -->
213 // <field index='20' term='http://purl.org/dc/terms/source'/>
214 // <!-- Reference in which the scientific name was first published -->
215 // <field index='21' term='http://rs.tdwg.org/dwc/terms/namePublishedIn'/>
216 // <!-- Taxon scrutinized by -->
217 // <field index='22' term='http://rs.tdwg.org/dwc/terms/nameAccordingTo'/> 
218 // <!-- Scrutiny date -->
219 // <field index='23' term='http://purl.org/dc/terms/modified'/>
220 // <!-- Additional data for the taxon -->
221 // <field index='24' term='http://purl.org/dc/terms/description'/>
222 // </core>
223
224 handleModified(csvTaxonRecord, taxonBase);
225
226 handleIsExtinct(csvTaxonRecord, taxonBase);
227
228
229
230 return new ListReader<>(resultList);
231 }
232
233
234
235 /**
236 * @param csvTaxonRecord
237 * @param taxonBase
238 */
239 private void handleIsExtinct(StreamItem item, TaxonBase<?> taxonBase) {
240 String isExtinctStr = item.get(TermUri.GBIF_IS_EXTINCT);
241 if (isBlank(isExtinctStr)){
242 return;
243 }
244 Boolean isExtinct = getBoolean(isExtinctStr, item);
245 if (isExtinct != null){
246 try {
247 UUID isExtinctUuid = state.getTransformer().getMarkerTypeUuid("isExtinct");
248 MarkerType markerType = state.getCurrentIO().getMarkerType(state, isExtinctUuid, "extinct", "extinct", "extinct");
249 Marker.NewInstance(taxonBase, isExtinct, markerType);
250
251 } catch (UndefinedTransformerMethodException e) {
252 String message = "GetMarkerType not available for import. This should not happen. Please conntact developer";
253 fireWarningEvent(message, item.getLocation(), 8);
254 }
255 }
256
257 }
258
259 /**
260 * @param item
261 * @param isExtinctStr
262 * @return
263 */
264 private Boolean getBoolean(String booleanStr, StreamItem item) {
265 try {
266 return Boolean.valueOf(booleanStr);
267 } catch (Exception e) {
268 String message = "Boolean value could not be parsed";
269 fireWarningEvent(message, item, 4);
270 return null;
271 }
272 }
273
274
275
276 /**
277 * @param csvTaxonRecord
278 * @param taxonBase
279 */
280 private void handleModified(StreamItem item, TaxonBase<?> taxonBase) {
281 String modifiedStr = item.get(TermUri.DC_MODIFIED);
282 if (isBlank(modifiedStr)){
283 return;
284 }
285
286 try {
287 UUID modifiedUuid = state.getTransformer().getExtensionTypeUuid("modified");
288 ExtensionType extensionType = state.getCurrentIO().getExtensionType(state, modifiedUuid, "modified", "modified", "modified");
289 Extension.NewInstance(taxonBase, modifiedStr, extensionType);
290
291 } catch (UndefinedTransformerMethodException e) {
292 String message = "GetMarkerType not available for import. This should not happen. Please conntact developer";
293 fireWarningEvent(message, item.getLocation(), 8);
294 }
295
296
297 }
298
299 /**
300 * @param item
301 * @param taxonBase
302 */
303 private void handleIdentifiableObjects(StreamItem item,TaxonBase<?> taxonBase) {
304
305 String references = item.get(TermUri.DC_REFERENCES);
306
307 if (references == null || references == "") {
308 references = item.get(TermUri.DWC_NAME_PUBLISHED_IN_ID);//lorna temporary until Scratchpads move the reference to the correct place.
309 }
310
311 if (StringUtils.isNotBlank(references)){
312 URI uri = makeUriIfIs(references);
313 if (uri != null){
314 Extension.NewInstance(taxonBase, references, ExtensionType.URL());
315 }else{
316 String message = "Non-URI Dublin Core References not yet handled for taxa. References is: %s";
317 fireWarningEvent(String.format(message, references), item, 6);
318 }
319 }
320
321
322 //TODO: Finish properly
323 String id = item.get(TermUri.CDM_SOURCE_IDINSOURCE);
324 String idNamespace = item.get(TermUri.CDM_SOURCE_IDNAMESPACE);
325 String reference = item.get(TermUri.CDM_SOURCE_REFERENCE);
326 if(StringUtils.isNotBlank(id) && StringUtils.isNotBlank(idNamespace) && StringUtils.isNotBlank(reference)){
327 Reference ref = ReferenceFactory.newGeneric();
328 ref.setTitle(reference);
329 Taxon taxon = (Taxon) taxonBase;
330 taxon.addSource(OriginalSourceType.Import, id, idNamespace, ref, null);
331 }
332
333 }
334
335
336 /**
337 * If str is an uri it returns is as an {@link URI}. If not it returns <code>null</code>.
338 * @param str
339 * @return the URI.
340 */
341 private URI makeUriIfIs(String str) {
342 if (! str.startsWith("http:")){
343 return null;
344 }else{
345 try {
346 URI uri = URI.create(str);
347 return uri;
348 } catch (Exception e) {
349 return null;
350 }
351 }
352
353 }
354
355
356 /**
357 * @param item
358 * @param taxonBase
359 */
360 private void handleCommonNames(StreamItem item,TaxonBase<?> taxonBase) {
361 //TODO: handle comma separated values
362 String commonName = item.get(TermUri.DWC_VERNACULAR_NAME);
363 if (StringUtils.isNotBlank(commonName)){
364
365 Language language = getLanguage(item);
366 CommonTaxonName commonTaxonName = CommonTaxonName.NewInstance(commonName, language);
367 if(taxonBase instanceof Taxon){
368 Taxon taxon = (Taxon) taxonBase;
369 TaxonDescription taxonDescription = getTaxonDescription(taxon, false);
370 taxonDescription.addElement(commonTaxonName);
371 logger.info("Common name " + commonName + " added to " + taxon.getTitleCache());
372 }
373 }
374 }
375
376
377
378 /**
379 * @param csvTaxonRecord
380 * @param taxonBase
381 */
382 private void handleTdwgArea(StreamItem item, TaxonBase<?> taxonBase) {
383 String tdwg_area = item.get(TermUri.DWC_COUNTRY_CODE);
384 if (tdwg_area != null){
385 if(taxonBase instanceof Synonym){
386 Synonym synonym = CdmBase.deproxy(taxonBase, Synonym.class);
387 Taxon acceptedTaxon = synonym.getAcceptedTaxon();
388 if (acceptedTaxon != null){
389 TaxonDescription td = getTaxonDescription(acceptedTaxon, false);
390 NamedArea area = NamedArea.getAreaByTdwgAbbreviation(tdwg_area);
391
392 if (area == null){
393 area = NamedArea.getAreaByTdwgLabel(tdwg_area);
394 }
395 if (area != null){
396 Distribution distribution = Distribution.NewInstance(area, PresenceAbsenceTerm.PRESENT());
397 td.addElement(distribution);
398 }
399 }
400 }
401 if(!(taxonBase instanceof Synonym)){
402 Taxon taxon = CdmBase.deproxy(taxonBase, Taxon.class);
403 TaxonDescription td = getTaxonDescription(taxon, false);
404 NamedArea area = NamedArea.getAreaByTdwgAbbreviation(tdwg_area);
405
406 if (area == null){
407 area = NamedArea.getAreaByTdwgLabel(tdwg_area);
408 }
409 if (area != null){
410 Distribution distribution = Distribution.NewInstance(area, PresenceAbsenceTerm.PRESENT());
411 td.addElement(distribution);
412 }
413 }
414 }
415 }
416
417
418 /**
419 * @param item
420 * @param taxonBase
421 */
422 private void handleTaxonRemarks(StreamItem item,TaxonBase<?> taxonBase) {
423 String comment = item.get(TermUri.DWC_TAXON_REMARKS);
424 Language language = getLanguage(item);
425 if(StringUtils.isNotBlank(comment)){
426 Annotation annotation = Annotation.NewInstance(comment, language);
427 taxonBase.addAnnotation(annotation);
428 }else{
429 // String message = "Comment is empty or some error appeared while saving: %s";
430 //// message = String.format(message);
431 // fireWarningEvent(message, item, 1);
432 }
433 }
434
435
436 //TODO handle non LSIDs
437 //TODO handle LSIDs for names
438 private void handleIdentifier(StreamItem csvTaxonRecord, TaxonBase<?> taxonBase) {
439 String identifier = csvTaxonRecord.get(TermUri.DC_IDENTIFIER);
440 if (StringUtils.isNotBlank(identifier)){
441 if (identifier.trim().startsWith("urn:lsid")){
442 try {
443 LSID lsid = new LSID(identifier);
444 taxonBase.setLsid(lsid);
445 } catch (MalformedLSIDException e) {
446 String message = "LSID is malformed and can't be handled as LSID: %s";
447 message = String.format(message, identifier);
448 fireWarningEvent(message, csvTaxonRecord, 4);
449 Identifier.NewInstance(taxonBase, identifier, DefinedTermBase.getTermByClassAndUUID(DefinedTerm.class, DefinedTerm.uuidLsid));
450 }
451 }else{
452 Identifier.NewInstance(taxonBase, identifier, null);
453 String message = "Identifier type not recognized. Create generic identifier: %s";
454 message = String.format(message, identifier);
455 fireWarningEvent(message, csvTaxonRecord, 1);
456 }
457 }
458
459 }
460
461
462 private void handleDataset(StreamItem item, TaxonBase<?> taxonBase,
463 List<MappedCdmBase<? extends CdmBase>> resultList,
464 Reference sourceReference,
465 String sourceReferecenDetail) {
466
467 TermUri idTerm = TermUri.DWC_DATASET_ID;
468 TermUri strTerm = TermUri.DWC_DATASET_NAME;
469
470 if (config.isDatasetsAsClassifications()){
471 String datasetId = CdmUtils.Nz(item.get(idTerm)).trim();
472 String datasetName = CdmUtils.Nz(item.get(strTerm)).trim();
473 if (CdmUtils.areBlank(datasetId, datasetName) ){
474 datasetId = NO_DATASET;
475 }
476
477 //check id
478 boolean classificationExists = state.exists(idTerm.toString() , datasetId, Classification.class);
479
480 //check name
481 if (!classificationExists){
482 classificationExists = state.exists(strTerm.toString() , datasetName, Classification.class);
483 }
484
485 //if not exists, create new
486 if (! classificationExists){
487 String classificationName = StringUtils.isBlank(datasetName)? datasetId : datasetName;
488 if (classificationName.equals(NO_DATASET)){
489 classificationName = config.getClassificationName();
490 //classificationName = "Classification (no name)"; //TODO define by config or zipfile or metadata
491 }
492
493 String classificationId = StringUtils.isBlank(datasetId)? datasetName : datasetId;
494 Classification classification = Classification.NewInstance(classificationName);
495 //source
496 IdentifiableSource source = classification.addSource(OriginalSourceType.Import, classificationId, "Dataset", sourceReference, sourceReferecenDetail);
497 //add to result
498 resultList.add(new MappedCdmBase<>(idTerm, datasetId, classification));
499 resultList.add(new MappedCdmBase<>(strTerm, datasetName, classification));
500 resultList.add(new MappedCdmBase<>(source));
501 //TODO this is not so nice but currently necessary as classifications are requested in the same partition
502 state.putMapping(idTerm.toString(), classificationId, classification);
503 state.putMapping(strTerm.toString(), classificationName, classification);
504 }
505 }else if (config.isDatasetsAsSecundumReference() || config.isDatasetsAsOriginalSource()){
506 MappedCdmBase<Reference> mappedCitation = getReference(item, resultList, idTerm, strTerm, true);
507 if (mappedCitation != null){
508 Reference ref = mappedCitation.getCdmBase();
509 if (config.isDatasetsAsSecundumReference()){
510 //dataset as secundum reference
511 taxonBase.setSec(ref);
512 }else{
513 //dataset as original source
514 taxonBase.addSource(OriginalSourceType.Import, null, null, ref, null);
515 }
516 }
517 }else{
518 String message = "DatasetUse type not yet implemented. Can't import dataset information.";
519 fireWarningEvent(message, item, 4);
520 }
521
522 //remove to later check if all attributes were used
523 removeItemInfo(item, idTerm);
524 removeItemInfo(item, strTerm);
525 }
526
527
528 @Override
529 public String getSourceId(StreamItem item) {
530 String id = item.get(ID);
531 return id;
532 }
533
534 private MappedCdmBase<Reference> getNameAccordingTo(StreamItem item, List<MappedCdmBase<? extends CdmBase>> resultList) {
535 if (config.isDatasetsAsSecundumReference()){
536 //TODO store nameAccordingTo info some where else or let the user define where to store it.
537 return null;
538 }else{
539 TermUri idTerm = TermUri.DWC_NAME_ACCORDING_TO_ID;
540 TermUri strTerm = TermUri.DWC_NAME_ACCORDING_TO;
541 MappedCdmBase<Reference> secRef = getReference(item, resultList, idTerm, strTerm, false);
542 return secRef;
543 }
544 }
545
546 private NomenclaturalCode getNomCode(StreamItem item) {
547 String strNomCode = getValue(item, TermUri.DWC_NOMENCLATURAL_CODE);
548 NomenclaturalCode nomCode = null;
549 // by Nomcenclatural Code
550 if (strNomCode != null){
551 nomCode = NomenclaturalCode.fromString(strNomCode);
552 if (nomCode == null){
553 String message = "NomCode '%s' not recognized";
554 message = String.format(message, strNomCode);
555 fireWarningEvent(message, item, 4);
556 }else{
557 return nomCode;
558 }
559 }
560 // by Kingdom
561 String strKingdom = getValue(item, TermUri.DWC_KINGDOM);
562 if (strKingdom != null){
563 if (strKingdom.equalsIgnoreCase("Plantae")){
564 nomCode = NomenclaturalCode.ICNAFP;
565 }else if (strKingdom.equalsIgnoreCase("Fungi")){
566 nomCode = NomenclaturalCode.ICNAFP;
567 }else if (strKingdom.equalsIgnoreCase("Animalia")){
568 nomCode = NomenclaturalCode.ICZN;
569 }else if (strKingdom.equalsIgnoreCase("Protozoa")){
570 nomCode = NomenclaturalCode.ICZN;
571 }
572 }
573
574 //TODO further kingdoms
575 if (nomCode == null){
576 //TODO warning
577 if (config.getNomenclaturalCode() != null){
578 nomCode = config.getNomenclaturalCode();
579 }
580 }
581 return nomCode;
582 }
583
584
585 private TaxonName getScientificName(StreamItem item, NomenclaturalCode nomCode, Rank rank, List<MappedCdmBase<? extends CdmBase>> resultList, Reference sourceReference) {
586 TaxonName name = null;
587 String strScientificName = getValue(item, TermUri.DWC_SCIENTIFIC_NAME);
588 //Name
589 if (strScientificName != null){
590 name = (TaxonName)parser.parseFullName(strScientificName, nomCode, rank);
591 if ( rank != null && name != null && name.getRank() != null && ! rank.equals(name.getRank())){
592 if (config.isValidateRankConsistency()){
593 String message = "Parsed rank %s (%s) differs from rank %s given by fields 'taxonRank' or 'verbatimTaxonRank'";
594 message = String.format(message, name.getRank().getTitleCache(), strScientificName, rank.getTitleCache());
595 fireWarningEvent(message, item, 4);
596 }
597 }
598 checkAuthorship(name, item);
599 resultList.add(new MappedCdmBase(TermUri.DWC_SCIENTIFIC_NAME, strScientificName, name));
600 }
601 //By ID
602 String strScientificNameId = getValue(item, TermUri.DWC_SCIENTIFIC_NAME_ID);
603 if (strScientificNameId != null){
604 if (config.isScientificNameIdAsOriginalSourceId()){
605 if (name != null){
606 IdentifiableSource source = IdentifiableSource.NewInstance(OriginalSourceType.Import, strScientificNameId, TermUri.DWC_SCIENTIFIC_NAME_ID.toString(), sourceReference, null);
607 name.addSource(source);
608 }
609 }else{
610 String message = "ScientificNameId not yet implemented: '%s'";
611 message = String.format(message, strScientificNameId);
612 fireWarningEvent(message, item, 4);
613 }
614 }
615
616 //namePublishedIn
617 TermUri idTerm = TermUri.DWC_NAME_PUBLISHED_IN_ID;
618 TermUri strTerm = TermUri.DWC_NAME_PUBLISHED_IN;
619 MappedCdmBase<Reference> nomRef = getReference(item, resultList, idTerm, strTerm, false);
620
621 if (name != null){
622 if (nomRef != null){
623 name.setNomenclaturalReference(nomRef.getCdmBase()); //check if name already has a nomRef, shouldn't be the case usually
624 }
625 }else{
626 if (nomRef != null){
627 String message = "NamePublishedIn information available but no name exists";
628 fireWarningEvent(message, item, 4);
629 }
630 }
631 return name;
632 }
633
634
635 /**
636 * General method to handle references used for multiple attributes.
637 * @param item
638 * @param resultList
639 * @param idTerm
640 * @param strTerm
641 * @param idIsInternal
642 * @return
643 */
644 private MappedCdmBase<Reference> getReference(StreamItem item,
645 List<MappedCdmBase<? extends CdmBase>> resultList, TermUri idTerm,
646 TermUri strTerm, boolean idIsInternal) {
647 Reference newRef = null;
648 Reference sourceCitation = null;
649
650 MappedCdmBase<Reference> result = null;
651 if (exists(idTerm, item) || exists(strTerm, item)){
652 String refId = CdmUtils.Nz(item.get(idTerm)).trim();
653 String refStr = CdmUtils.Nz(item.get(strTerm)).trim();
654 if (StringUtils.isNotBlank(refId)){
655 List<Reference> references = state.get(idTerm.toString(), refId, Reference.class);
656 if (references.size() == 0){
657 if (! idIsInternal){
658 //references should already exist in store if not linking to external links like URLs
659 String message = "External namePublishedInIDs are not yet supported";
660 fireWarningEvent(message, item, 4);//set to DEBUG
661 }else{
662 newRef = ReferenceFactory.newGeneric(); //TODO handle other types if possible
663 newRef.addSource(OriginalSourceType.Import, refId, idTerm.toString(), sourceCitation, null);
664 MappedCdmBase<Reference> idResult = new MappedCdmBase<>(idTerm, refId, newRef);
665 resultList.add(idResult);
666 }
667 }else{
668 //TODO handle list.size > 1 , do we need a list here ?
669 result = new MappedCdmBase<Reference>(idTerm, refId , references.get(0));
670 }
671 }
672 if (result == null){
673 List<Reference> nomRefs = state.get(strTerm.toString(), refStr, Reference.class);
674 if (nomRefs.size() > 0){
675 //TODO handle list.size > 1 , do we need a list here ?
676 result = new MappedCdmBase<>(strTerm, refStr , nomRefs.get(0));
677 }else{
678 // new Reference
679 if (newRef == null){
680 newRef = ReferenceFactory.newGeneric(); //TODO handle other types if possible
681 }
682 newRef.setTitleCache(refStr, true);
683 //TODO distinguish available year, authorship, etc. if
684 result = new MappedCdmBase<>(strTerm, refStr, newRef);
685 resultList.add(result);
686 }
687 }
688 }
689 return result;
690 }
691
692
693 //TODO we may configure in configuration that scientific name never includes Authorship
694 private void checkAuthorship(TaxonName nameBase, StreamItem item) {
695 if (nameBase.isViral()){
696 return;
697 }
698 String strAuthors = getValue(item, TermUri.DWC_SCIENTIFIC_NAME_AUTHORS);
699
700 if (! nameBase.isProtectedTitleCache()){
701 if (isBlank(nameBase.getAuthorshipCache())){
702 if (nameBase.isBotanical() || nameBase.isZoological()){
703 //TODO can't we also parse NonViralNames correctly ?
704 try {
705 parser.parseAuthors(nameBase, strAuthors);
706 } catch (StringNotParsableException e) {
707 nameBase.setAuthorshipCache(strAuthors);
708 }
709 }else{
710 nameBase.setAuthorshipCache(strAuthors);
711 }
712 //TODO throw warning (scientific name should always include authorship) by DwC definition
713 }
714 }
715
716 }
717
718
719 private Rank getRank(StreamItem csvTaxonRecord, NomenclaturalCode nomCode) {
720 boolean USE_UNKNOWN = true;
721 Rank rank = null;
722 String strRank = getValue(csvTaxonRecord,TermUri.DWC_TAXON_RANK);
723 String strVerbatimRank = getValue(csvTaxonRecord,TermUri.DWC_VERBATIM_TAXON_RANK);
724 if (strRank != null){
725 try {
726 rank = Rank.getRankByEnglishName(strRank, nomCode, USE_UNKNOWN);
727 if (rank.equals(Rank.UNKNOWN_RANK())){
728 rank = Rank.getRankByNameOrIdInVoc(strRank, USE_UNKNOWN);
729 if (rank.equals(Rank.UNKNOWN_RANK())){
730 String message = "Rank can not be defined for '%s'";
731 message = String.format(message, strRank);
732 fireWarningEvent(message, csvTaxonRecord, 4);
733 }
734 }
735 } catch (UnknownCdmTypeException e) {
736 //should not happen as USE_UNKNOWN is used
737 rank = Rank.UNKNOWN_RANK();
738 }
739 }
740 if ( (rank == null || rank.equals(Rank.UNKNOWN_RANK())) && strVerbatimRank != null){
741 try {
742 rank = Rank.getRankByNameOrIdInVoc(strVerbatimRank, USE_UNKNOWN);
743 if (rank.equals(Rank.UNKNOWN_RANK())){
744 String message = "Rank can not be defined for '%s'";
745 message = String.format(message, strVerbatimRank);
746 fireWarningEvent(message, csvTaxonRecord, 4);
747 }
748 } catch (UnknownCdmTypeException e) {
749 //should not happen as USE_UNKNOWN is used
750 rank = Rank.UNKNOWN_RANK();
751 }
752 }
753 return rank;
754 }
755
756
757 /**
758 * Creates an empty taxon object with a given status.
759 * <i>Empty</i> taxon means, without a defined name or sec.
760 * @param item
761 * @return
762 */
763 private TaxonBase<?> getTaxonBase(StreamItem item) {
764 TaxonName name = null;
765 Reference sec = null;
766 TaxonBase<?> result;
767 String taxStatus = item.get(TermUri.DWC_TAXONOMIC_STATUS);
768 String status = "";
769
770 if (taxStatus != null){
771 if (taxStatus.matches("accepted.*|valid")){
772 status += "A";
773 } else if (taxStatus.matches(".*synonym|invalid|not accepted")){ //not accepted comes from scratchpads
774 status += "S";
775 } else if (taxStatus.matches("misapplied.*")){
776 status += "M";
777 } else{
778 status += "?";
779 }
780 removeItemInfo(item, TermUri.DWC_TAXONOMIC_STATUS);
781 }
782 if (! CdmUtils.isBlank(item.get(TermUri.DWC_ACCEPTED_NAME_USAGE_ID))){
783 // acceptedNameUsageId = id
784 if (getSourceId(item).equals(item.get(TermUri.DWC_ACCEPTED_NAME_USAGE_ID))){
785 status += "A";
786 }else{
787 status += "S";
788 }
789 }
790 if (status.contains("A") || status.contains("M")){
791 result = Taxon.NewInstance(name, sec);
792 if (status.contains("S") && ! status.contains("M") ){
793 String message = "Ambigous taxon status (%s)";
794 message = String.format(message, status);
795 fireWarningEvent(message, item, 6);
796 }
797 } else if (status.contains("S")){
798 result = Synonym.NewInstance(name, sec);
799 } else{
800 result = Taxon.NewUnknownStatusInstance(name, sec);
801 }
802
803 return result;
804
805 }
806
807
808
809 /**
810 * @param item
811 * @return
812 */
813 private Language getLanguage(StreamItem item) {
814 String langItem = item.get(TermUri.DC_LANGUAGE);
815 Language language = null;
816
817 if(StringUtils.equalsIgnoreCase(langItem, "de")){
818 language = Language.GERMAN();
819 }else if(StringUtils.equalsIgnoreCase(langItem, "en")){
820 language = Language.ENGLISH();
821 }else{
822 language = Language.DEFAULT();
823 }
824 return language;
825 }
826
827 // ********************** PARTITIONABLE ****************************************/
828
829
830 @Override
831 protected void makeForeignKeysForItem(StreamItem item, Map<String, Set<String>> fkMap) {
832 String value;
833 String key;
834
835 //namePublishedIn
836 if ( hasValue(value = item.get(key = TermUri.DWC_NAME_PUBLISHED_IN_ID.toString()))){
837 Set<String> keySet = getKeySet(key, fkMap);
838 keySet.add(value);
839 }
840 if (config.isDeduplicateNamePublishedIn()){
841 if ( hasValue(value = item.get(key = TermUri.DWC_NAME_PUBLISHED_IN.toString()))){
842 Set<String> keySet = getKeySet(key, fkMap);
843 keySet.add(value);
844 }
845 }
846
847 //nameAccordingTo
848 if (! config.isDatasetsAsSecundumReference()){
849 if ( hasValue(value = item.get(key = TermUri.DWC_NAME_ACCORDING_TO_ID.toString()))){
850 Set<String> keySet = getKeySet(key, fkMap);
851 keySet.add(value);
852 }
853 if ( hasValue(value = item.get(key = TermUri.DWC_NAME_ACCORDING_TO.toString()))){
854 Set<String> keySet = getKeySet(key, fkMap);
855 keySet.add(value);
856 }
857 }
858
859 //dataset
860 if ( hasValue(value = item.get(key = TermUri.DWC_DATASET_ID.toString()))){
861 Set<String> keySet = getKeySet(key, fkMap);
862 keySet.add(value);
863 }
864 if ( hasValue(value = item.get(key = TermUri.DWC_DATASET_NAME.toString()))){
865 Set<String> keySet = getKeySet(key, fkMap);
866 keySet.add(value);
867 }
868
869 }
870
871
872 @Override
873 public Set<String> requiredSourceNamespaces() {
874 Set<String> result = new HashSet<>();
875 result.add(TermUri.DWC_NAME_PUBLISHED_IN_ID.toString());
876 result.add(TermUri.DWC_NAME_PUBLISHED_IN.toString());
877 if (!config.isDatasetsAsSecundumReference()){
878 result.add(TermUri.DWC_NAME_ACCORDING_TO_ID.toString());
879 result.add(TermUri.DWC_NAME_ACCORDING_TO.toString());
880 }
881 result.add(TermUri.DWC_DATASET_ID.toString());
882 result.add(TermUri.DWC_DATASET_NAME.toString());
883 return result;
884 }
885
886
887 /**
888 * @param item
889 * @param dwcTaxonomicStatus
890 */
891 private void removeItemInfo(StreamItem item, TermUri dwcTaxonomicStatus) {
892 if (!isFilterOnly){
893 item.remove(dwcTaxonomicStatus);
894 }
895 }
896
897
898 //** ***************************** TO STRING *********************************************/
899
900 @Override
901 public String toString(){
902 return this.getClass().getName();
903 }
904 }