/**
 * Copyright (C) 2009 EDIT
 * European Distributed Institute of Taxonomy
 * http://www.e-taxonomy.eu
 *
 * The contents of this file are subject to the Mozilla Public License Version 1.1
 * See LICENSE.TXT at the top of this package for the full license terms.
 */
package eu.etaxonomy.cdm.io.dwca.in;

import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.apache.commons.lang.StringUtils;
import org.apache.log4j.Logger;

import com.ibm.lsid.MalformedLSIDException;

import eu.etaxonomy.cdm.common.CdmUtils;
import eu.etaxonomy.cdm.io.dwca.TermUri;
import eu.etaxonomy.cdm.model.common.CdmBase;
import eu.etaxonomy.cdm.model.common.IdentifiableSource;
import eu.etaxonomy.cdm.model.common.LSID;
import eu.etaxonomy.cdm.model.common.OriginalSourceType;
import eu.etaxonomy.cdm.model.name.BotanicalName;
import eu.etaxonomy.cdm.model.name.NomenclaturalCode;
import eu.etaxonomy.cdm.model.name.NonViralName;
import eu.etaxonomy.cdm.model.name.Rank;
import eu.etaxonomy.cdm.model.name.TaxonNameBase;
import eu.etaxonomy.cdm.model.name.ZoologicalName;
import eu.etaxonomy.cdm.model.reference.Reference;
import eu.etaxonomy.cdm.model.reference.ReferenceFactory;
import eu.etaxonomy.cdm.model.taxon.Classification;
import eu.etaxonomy.cdm.model.taxon.Synonym;
import eu.etaxonomy.cdm.model.taxon.Taxon;
import eu.etaxonomy.cdm.model.taxon.TaxonBase;
import eu.etaxonomy.cdm.strategy.exceptions.StringNotParsableException;
import eu.etaxonomy.cdm.strategy.exceptions.UnknownCdmTypeException;
import eu.etaxonomy.cdm.strategy.parser.NonViralNameParserImpl;

50 public class DwcTaxonCsv2CdmTaxonConverter
extends PartitionableConverterBase
<DwcaImportState
> implements IPartitionableConverter
<CsvStreamItem
, IReader
<CdmBase
>, String
>{
51 @SuppressWarnings("unused")
52 private static Logger logger
= Logger
.getLogger(DwcTaxonCsv2CdmTaxonConverter
.class);
54 private static final String ID
= "id";
55 // temporary key for the case that no dataset information is supplied, TODO use something better
56 public static final String NO_DATASET
= "no_dataset_jli773oebhjklw";
58 private NonViralNameParserImpl parser
= NonViralNameParserImpl
.NewInstance();
63 public DwcTaxonCsv2CdmTaxonConverter(DwcaImportState state
) {
68 public IReader
<MappedCdmBase
> map(CsvStreamItem csvTaxonRecord
){
69 List
<MappedCdmBase
> resultList
= new ArrayList
<MappedCdmBase
>();
71 //TODO what if not transactional?
72 Reference
<?
> sourceReference
= state
.getTransactionalSourceReference();
73 String sourceReferenceDetail
= null;
76 TaxonBase
<?
> taxonBase
= getTaxonBase(csvTaxonRecord
);
77 MappedCdmBase mcb
= new MappedCdmBase(csvTaxonRecord
.term
, csvTaxonRecord
.get(ID
), taxonBase
);
81 String id
= csvTaxonRecord
.get(ID
);
82 IdentifiableSource source
= taxonBase
.addImportSource(id
, "Taxon", sourceReference
, sourceReferenceDetail
);
83 MappedCdmBase mappedSource
= new MappedCdmBase(csvTaxonRecord
.get(ID
), source
);
84 resultList
.add(mappedSource
);
85 csvTaxonRecord
.remove(ID
);
88 NomenclaturalCode nomCode
= getNomCode(csvTaxonRecord
);
89 Rank rank
= getRank(csvTaxonRecord
, nomCode
);
91 //name && name published in
92 TaxonNameBase
<?
,?
> name
= getScientificName(csvTaxonRecord
, nomCode
, rank
, resultList
, sourceReference
);
93 taxonBase
.setName(name
);
96 MappedCdmBase
<Reference
> sec
= getNameAccordingTo(csvTaxonRecord
, resultList
);
97 if (sec
== null && state
.getConfig().isUseSourceReferenceAsSec()){
98 sec
= new MappedCdmBase
<Reference
>(state
.getTransactionalSourceReference());
101 taxonBase
.setSec(sec
.getCdmBase());
105 handleDataset(csvTaxonRecord
, taxonBase
, resultList
, sourceReference
, sourceReferenceDetail
);
108 //term="http://purl.org/dc/terms/identifier"
109 //currently only LSIDs
110 handleIdentifier(csvTaxonRecord
, taxonBase
);
114 // <!-- Top level group; listed as kingdom but may be interpreted as domain or superkingdom
115 // The following eight groups are recognized: Animalia, Archaea, Bacteria, Chromista,
116 // Fungi, Plantae, Protozoa, Viruses -->
117 // <field index='10' term='http://rs.tdwg.org/dwc/terms/kingdom'/>
119 // <!-- Phylum in which the taxon has been classified -->
120 // <field index='11' term='http://rs.tdwg.org/dwc/terms/phylum'/>
122 // <!-- Class in which the taxon has been classified -->
123 // <field index='12' term='http://rs.tdwg.org/dwc/terms/class'/>
125 // <!-- Order in which the taxon has been classified -->
126 // <field index='13' term='http://rs.tdwg.org/dwc/terms/order'/>
128 // <!-- Family in which the taxon has been classified -->
129 // <field index='14' term='http://rs.tdwg.org/dwc/terms/family'/>
131 // <!-- Genus in which the taxon has been classified -->
132 // <field index='15' term='http://rs.tdwg.org/dwc/terms/genus'/>
134 // <!-- Subgenus in which the taxon has been classified -->
135 // <field index='16' term='http://rs.tdwg.org/dwc/terms/subgenus'/>
136 // <!-- Specific epithet; for hybrids, the multiplication symbol is included in the epithet -->
138 // <field index='17' term='http://rs.tdwg.org/dwc/terms/specificEpithet'/>
139 // <!-- Infraspecific epithet -->
141 // <field index='18' term='http://rs.tdwg.org/dwc/terms/infraspecificEpithet'/>
142 // <!-- Authorship -->
144 // <field index='19' term='http://rs.tdwg.org/dwc/terms/scientificNameAuthorship'/>
145 // ==> see scientific name
147 // <!-- Acceptance status published in -->
148 // <field index='20' term='http://purl.org/dc/terms/source'/>
149 // <!-- Reference in which the scientific name was first published -->
150 // <field index='21' term='http://rs.tdwg.org/dwc/terms/namePublishedIn'/>
151 // <!-- Taxon scrutinized by -->
152 // <field index='22' term='http://rs.tdwg.org/dwc/terms/nameAccordingTo'/>
153 // <!-- Scrutiny date -->
154 // <field index='23' term='http://purl.org/dc/terms/modified'/>
155 // <!-- Additional data for the taxon -->
156 // <field index='24' term='http://purl.org/dc/terms/description'/>
159 return new ListReader
<MappedCdmBase
>(resultList
);
164 //TODO handle non LSIDs
165 //TODO handle LSIDs for names
166 private void handleIdentifier(CsvStreamItem csvTaxonRecord
, TaxonBase
<?
> taxonBase
) {
167 String identifier
= csvTaxonRecord
.get(TermUri
.DC_IDENTIFIER
);
168 if (StringUtils
.isNotBlank(identifier
)){
169 if (identifier
.trim().startsWith("urn:lsid")){
171 LSID lsid
= new LSID(identifier
);
172 taxonBase
.setLsid(lsid
);
173 } catch (MalformedLSIDException e
) {
174 String message
= "LSID is malformed and can't be handled as LSID: %s";
175 message
= String
.format(message
, identifier
);
176 fireWarningEvent(message
, csvTaxonRecord
, 4);
179 String message
= "Identifier type not supported: %s";
180 message
= String
.format(message
, identifier
);
181 fireWarningEvent(message
, csvTaxonRecord
, 4);
188 private void handleDataset(CsvStreamItem item
, TaxonBase
<?
> taxonBase
, List
<MappedCdmBase
> resultList
, Reference
<?
> sourceReference
, String sourceReferecenDetail
) {
189 TermUri idTerm
= TermUri
.DWC_DATASET_ID
;
190 TermUri strTerm
= TermUri
.DWC_DATASET_NAME
;
192 if (config
.isDatasetsAsClassifications()){
193 String datasetId
= CdmUtils
.Nz(item
.get(idTerm
)).trim();
194 String datasetName
= CdmUtils
.Nz(item
.get(strTerm
)).trim();
195 if (CdmUtils
.areBlank(datasetId
, datasetName
) ){
196 datasetId
= NO_DATASET
;
200 boolean classificationExists
= state
.exists(idTerm
.toString() , datasetId
, Classification
.class);
203 if (!classificationExists
){
204 classificationExists
= state
.exists(strTerm
.toString() , datasetName
, Classification
.class);
207 //if not exists, create new
208 if (! classificationExists
){
209 String classificationName
= StringUtils
.isBlank(datasetName
)? datasetId
: datasetName
;
210 if (classificationName
.equals(NO_DATASET
)){
211 classificationName
= "Classification (no name)"; //TODO define by config or zipfile or metadata
214 String classificationId
= StringUtils
.isBlank(datasetId
)? datasetName
: datasetId
;
215 Classification classification
= Classification
.NewInstance(classificationName
);
217 IdentifiableSource source
= classification
.addSource(OriginalSourceType
.Lineage
, classificationId
, "Dataset", sourceReference
, sourceReferecenDetail
);
219 resultList
.add(new MappedCdmBase(idTerm
, datasetId
, classification
));
220 resultList
.add(new MappedCdmBase(strTerm
, datasetName
, classification
));
221 resultList
.add(new MappedCdmBase(source
));
222 //TODO this is not so nice but currently necessary as classifications are requested in the same partition
223 state
.putMapping(idTerm
.toString(), classificationId
, classification
);
224 state
.putMapping(strTerm
.toString(), classificationName
, classification
);
226 }else if (config
.isDatasetsAsSecundumReference() || config
.isDatasetsAsOriginalSource()){
227 MappedCdmBase
<Reference
> mappedCitation
= getReference(item
, resultList
, idTerm
, strTerm
, true);
228 if (mappedCitation
!= null){
229 Reference
<?
> ref
= mappedCitation
.getCdmBase();
230 if (config
.isDatasetsAsSecundumReference()){
231 //dataset as secundum reference
232 taxonBase
.setSec(ref
);
234 //dataset as original source
235 taxonBase
.addSource(OriginalSourceType
.Lineage
, null, null, ref
, null);
239 String message
= "DatasetUse type not yet implemented. Can't import dataset information.";
240 fireWarningEvent(message
, item
, 4);
243 //remove to later check if all attributes were used
245 item
.remove(strTerm
);
251 public String
getSourceId(CsvStreamItem item
) {
252 String id
= item
.get(ID
);
256 private MappedCdmBase
<Reference
> getNameAccordingTo(CsvStreamItem item
, List
<MappedCdmBase
> resultList
) {
257 if (config
.isDatasetsAsSecundumReference()){
258 //TODO store nameAccordingTo info some where else or let the user define where to store it.
261 TermUri idTerm
= TermUri
.DWC_NAME_ACCORDING_TO_ID
;
262 TermUri strTerm
= TermUri
.DWC_NAME_ACCORDING_TO
;
263 MappedCdmBase
<Reference
> secRef
= getReference(item
, resultList
, idTerm
, strTerm
, false);
268 private NomenclaturalCode
getNomCode(CsvStreamItem item
) {
269 String strNomCode
= getValue(item
, TermUri
.DWC_NOMENCLATURAL_CODE
);
270 NomenclaturalCode nomCode
= null;
271 // by Nomcenclatural Code
272 if (strNomCode
!= null){
273 nomCode
= NomenclaturalCode
.fromString(strNomCode
);
274 if (nomCode
== null){
275 String message
= "NomCode '%s' not recognized";
276 message
= String
.format(message
, strNomCode
);
277 fireWarningEvent(message
, item
, 4);
283 String strKingdom
= getValue(item
, TermUri
.DWC_KINGDOM
);
284 if (strKingdom
!= null){
285 if (strKingdom
.equalsIgnoreCase("Plantae")){
286 nomCode
= NomenclaturalCode
.ICNAFP
;
287 }else if (strKingdom
.equalsIgnoreCase("Fungi")){
288 nomCode
= NomenclaturalCode
.ICNAFP
;
289 }else if (strKingdom
.equalsIgnoreCase("Animalia")){
290 nomCode
= NomenclaturalCode
.ICZN
;
291 }else if (strKingdom
.equalsIgnoreCase("Protozoa")){
292 nomCode
= NomenclaturalCode
.ICZN
;
296 //TODO further kingdoms
297 if (nomCode
== null){
299 if (config
.getNomenclaturalCode() != null){
300 nomCode
= config
.getNomenclaturalCode();
307 private TaxonNameBase
<?
,?
> getScientificName(CsvStreamItem item
, NomenclaturalCode nomCode
, Rank rank
, List
<MappedCdmBase
> resultList
, Reference sourceReference
) {
308 TaxonNameBase
<?
,?
> name
= null;
309 String strScientificName
= getValue(item
, TermUri
.DWC_SCIENTIFIC_NAME
);
311 if (strScientificName
!= null){
312 name
= parser
.parseFullName(strScientificName
, nomCode
, rank
);
313 if ( rank
!= null && name
!= null && name
.getRank() != null && ! rank
.equals(name
.getRank())){
314 if (config
.isValidateRankConsistency()){
315 String message
= "Parsed rank %s (%s) differs from rank %s given by fields 'taxonRank' or 'verbatimTaxonRank'";
316 message
= String
.format(message
, name
.getRank().getTitleCache(), strScientificName
, rank
.getTitleCache());
317 fireWarningEvent(message
, item
, 4);
320 checkAuthorship(name
, item
);
321 resultList
.add(new MappedCdmBase(TermUri
.DWC_SCIENTIFIC_NAME
, strScientificName
, name
));
324 String strScientificNameId
= getValue(item
, TermUri
.DWC_SCIENTIFIC_NAME_ID
);
325 if (strScientificNameId
!= null){
326 if (config
.isScientificNameIdAsOriginalSourceId()){
328 IdentifiableSource source
= IdentifiableSource
.NewDataImportInstance(strScientificNameId
,
329 TermUri
.DWC_SCIENTIFIC_NAME_ID
.toString(), sourceReference
);
330 name
.addSource(source
);
333 String message
= "ScientificNameId not yet implemented: '%s'";
334 message
= String
.format(message
, strScientificNameId
);
335 fireWarningEvent(message
, item
, 4);
340 TermUri idTerm
= TermUri
.DWC_NAME_PUBLISHED_IN_ID
;
341 TermUri strTerm
= TermUri
.DWC_NAME_PUBLISHED_IN
;
342 MappedCdmBase
<Reference
> nomRef
= getReference(item
, resultList
, idTerm
, strTerm
, false);
346 name
.setNomenclaturalReference(nomRef
.getCdmBase()); //check if name already has a nomRef, shouldn't be the case usually
350 String message
= "NamePublishedIn information available but no name exists";
351 fireWarningEvent(message
, item
, 4);
359 * General method to handle references used for multiple attributes.
364 * @param idIsInternal
367 private MappedCdmBase
<Reference
> getReference(CsvStreamItem item
, List
<MappedCdmBase
> resultList
, TermUri idTerm
, TermUri strTerm
, boolean idIsInternal
) {
368 Reference
<?
> newRef
= null;
369 Reference
<?
> sourceCitation
= null;
371 MappedCdmBase
<Reference
> result
= null;
372 if (exists(idTerm
, item
) || exists(strTerm
, item
)){
373 String refId
= CdmUtils
.Nz(item
.get(idTerm
)).trim();
374 String refStr
= CdmUtils
.Nz(item
.get(strTerm
)).trim();
375 if (StringUtils
.isNotBlank(refId
)){
376 List
<Reference
> references
= state
.get(idTerm
.toString(), refId
, Reference
.class);
377 if (references
.size() == 0){
379 //references should already exist in store if not linking to external links like URLs
380 String message
= "External namePublishedInIDs are not yet supported";
381 fireWarningEvent(message
, item
, 4);
383 newRef
= ReferenceFactory
.newGeneric(); //TODO handle other types if possible
384 newRef
.addImportSource(refId
, idTerm
.toString(), sourceCitation
, null);
385 MappedCdmBase
<Reference
> idResult
= new MappedCdmBase
<Reference
>(idTerm
, refId
, newRef
);
386 resultList
.add(idResult
);
389 //TODO handle list.size > 1 , do we need a list here ?
390 result
= new MappedCdmBase
<Reference
>(idTerm
, refId
, references
.get(0));
394 List
<Reference
> nomRefs
= state
.get(strTerm
.toString(), refStr
, Reference
.class);
395 if (nomRefs
.size() > 0){
396 //TODO handle list.size > 1 , do we need a list here ?
397 result
= new MappedCdmBase
<Reference
>(strTerm
, refStr
, nomRefs
.get(0));
401 newRef
= ReferenceFactory
.newGeneric(); //TODO handle other types if possible
403 newRef
.setTitleCache(refStr
, true);
404 //TODO distinguish available year, authorship, etc. if
405 result
= new MappedCdmBase
<Reference
>(strTerm
, refStr
, newRef
);
406 resultList
.add(result
);
414 //TODO we may configure in configuration that scientific name never includes Authorship
415 private void checkAuthorship(TaxonNameBase nameBase
, CsvStreamItem item
) {
416 if (!nameBase
.isInstanceOf(NonViralName
.class)){
419 NonViralName
<?
> nvName
= CdmBase
.deproxy(nameBase
, NonViralName
.class);
420 String strAuthors
= getValue(item
, TermUri
.DWC_SCIENTIFIC_NAME_AUTHORS
);
422 if (! nvName
.isProtectedTitleCache()){
423 if (StringUtils
.isBlank(nvName
.getAuthorshipCache())){
424 if (nvName
.isInstanceOf(BotanicalName
.class) || nvName
.isInstanceOf(ZoologicalName
.class)){
425 //TODO can't we also parse NonViralNames correctly ?
427 parser
.parseAuthors(nvName
, strAuthors
);
428 } catch (StringNotParsableException e
) {
429 nvName
.setAuthorshipCache(strAuthors
);
432 nvName
.setAuthorshipCache(strAuthors
);
434 //TODO throw warning (scientific name should always include authorship) by DwC definition
441 private Rank
getRank(CsvStreamItem csvTaxonRecord
, NomenclaturalCode nomCode
) {
442 boolean USE_UNKNOWN
= true;
444 String strRank
= getValue(csvTaxonRecord
,TermUri
.DWC_TAXON_RANK
);
445 String strVerbatimRank
= getValue(csvTaxonRecord
,TermUri
.DWC_VERBATIM_TAXON_RANK
);
446 if (strRank
!= null){
448 rank
= Rank
.getRankByEnglishName(strRank
, nomCode
, USE_UNKNOWN
);
449 if (rank
.equals(Rank
.UNKNOWN_RANK())){
450 rank
= Rank
.getRankByNameOrAbbreviation(strRank
, USE_UNKNOWN
);
451 if (rank
.equals(Rank
.UNKNOWN_RANK())){
452 String message
= "Rank can not be defined for '%s'";
453 message
= String
.format(message
, strRank
);
454 fireWarningEvent(message
, csvTaxonRecord
, 4);
457 } catch (UnknownCdmTypeException e
) {
458 //should not happen as USE_UNKNOWN is used
459 rank
= Rank
.UNKNOWN_RANK();
462 if ( (rank
== null || rank
.equals(Rank
.UNKNOWN_RANK())) && strVerbatimRank
!= null){
464 rank
= Rank
.getRankByNameOrAbbreviation(strVerbatimRank
, USE_UNKNOWN
);
465 if (rank
.equals(Rank
.UNKNOWN_RANK())){
466 String message
= "Rank can not be defined for '%s'";
467 message
= String
.format(message
, strVerbatimRank
);
468 fireWarningEvent(message
, csvTaxonRecord
, 4);
470 } catch (UnknownCdmTypeException e
) {
471 //should not happen as USE_UNKNOWN is used
472 rank
= Rank
.UNKNOWN_RANK();
480 * Creates an empty taxon object with a given status.
484 private TaxonBase
<?
> getTaxonBase(CsvStreamItem item
) {
485 TaxonNameBase
<?
,?
> name
= null;
486 Reference
<?
> sec
= null;
488 String taxStatus
= item
.get(TermUri
.DWC_TAXONOMIC_STATUS
);
491 if (taxStatus
!= null){
492 if (taxStatus
.matches("accepted.*|valid")){
494 } else if (taxStatus
.matches(".*synonym|invalid|not accepted")){ //not accepted comes from scratchpads
496 } else if (taxStatus
.matches("misapplied.*")){
501 item
.remove(TermUri
.DWC_TAXONOMIC_STATUS
);
503 if (! CdmUtils
.isBlank(item
.get(TermUri
.DWC_ACCEPTED_NAME_USAGE_ID
))){
504 // acceptedNameUsageId = id
505 if (getSourceId(item
).equals(item
.get(TermUri
.DWC_ACCEPTED_NAME_USAGE_ID
))){
511 if (status
.contains("A") || status
.contains("M")){
512 result
= Taxon
.NewInstance(name
, sec
);
513 if (status
.contains("S") && ! status
.contains("M") ){
514 String message
= "Ambigous taxon status (%s)";
515 message
= String
.format(message
, status
);
516 fireWarningEvent(message
, item
, 6);
518 } else if (status
.contains("S")){
519 result
= Synonym
.NewInstance(name
, sec
);
521 result
= Taxon
.NewUnknownStatusInstance(name
, sec
);
528 // ********************** PARTITIONABLE ****************************************/
532 protected void makeForeignKeysForItem(CsvStreamItem item
, Map
<String
, Set
<String
>> fkMap
) {
537 if ( hasValue(value
= item
.get(key
= TermUri
.DWC_NAME_PUBLISHED_IN_ID
.toString()))){
538 Set
<String
> keySet
= getKeySet(key
, fkMap
);
541 if (config
.isDeduplicateNamePublishedIn()){
542 if ( hasValue(value
= item
.get(key
= TermUri
.DWC_NAME_PUBLISHED_IN
.toString()))){
543 Set
<String
> keySet
= getKeySet(key
, fkMap
);
549 if (! config
.isDatasetsAsSecundumReference()){
550 if ( hasValue(value
= item
.get(key
= TermUri
.DWC_NAME_ACCORDING_TO_ID
.toString()))){
551 Set
<String
> keySet
= getKeySet(key
, fkMap
);
554 if ( hasValue(value
= item
.get(key
= TermUri
.DWC_NAME_ACCORDING_TO
.toString()))){
555 Set
<String
> keySet
= getKeySet(key
, fkMap
);
561 if ( hasValue(value
= item
.get(key
= TermUri
.DWC_DATASET_ID
.toString()))){
562 Set
<String
> keySet
= getKeySet(key
, fkMap
);
565 if ( hasValue(value
= item
.get(key
= TermUri
.DWC_DATASET_NAME
.toString()))){
566 Set
<String
> keySet
= getKeySet(key
, fkMap
);
574 public Set
<String
> requiredSourceNamespaces() {
575 Set
<String
> result
= new HashSet
<String
>();
576 result
.add(TermUri
.DWC_NAME_PUBLISHED_IN_ID
.toString());
577 result
.add(TermUri
.DWC_NAME_PUBLISHED_IN
.toString());
578 if (!config
.isDatasetsAsSecundumReference()){
579 result
.add(TermUri
.DWC_NAME_ACCORDING_TO_ID
.toString());
580 result
.add(TermUri
.DWC_NAME_ACCORDING_TO
.toString());
582 result
.add(TermUri
.DWC_DATASET_ID
.toString());
583 result
.add(TermUri
.DWC_DATASET_NAME
.toString());
587 //** ***************************** TO STRING *********************************************/
590 public String
toString(){
591 return this.getClass().getName();