/**
 * Copyright (C) 2009 EDIT
 * European Distributed Institute of Taxonomy
 * http://www.e-taxonomy.eu
 *
 * The contents of this file are subject to the Mozilla Public License Version 1.1
 * See LICENSE.TXT at the top of this package for the full license terms.
 */
10 package eu
.etaxonomy
.cdm
.io
.dwca
.in
;
12 import java
.util
.ArrayList
;
13 import java
.util
.HashSet
;
14 import java
.util
.List
;
18 import org
.apache
.commons
.lang
.StringUtils
;
19 import org
.apache
.log4j
.Logger
;
21 import com
.ibm
.lsid
.MalformedLSIDException
;
23 import eu
.etaxonomy
.cdm
.common
.CdmUtils
;
24 import eu
.etaxonomy
.cdm
.io
.dwca
.TermUri
;
25 import eu
.etaxonomy
.cdm
.model
.common
.CdmBase
;
26 import eu
.etaxonomy
.cdm
.model
.common
.IdentifiableSource
;
27 import eu
.etaxonomy
.cdm
.model
.common
.LSID
;
28 import eu
.etaxonomy
.cdm
.model
.name
.BotanicalName
;
29 import eu
.etaxonomy
.cdm
.model
.name
.NomenclaturalCode
;
30 import eu
.etaxonomy
.cdm
.model
.name
.NonViralName
;
31 import eu
.etaxonomy
.cdm
.model
.name
.Rank
;
32 import eu
.etaxonomy
.cdm
.model
.name
.TaxonNameBase
;
33 import eu
.etaxonomy
.cdm
.model
.name
.ZoologicalName
;
34 import eu
.etaxonomy
.cdm
.model
.reference
.Reference
;
35 import eu
.etaxonomy
.cdm
.model
.reference
.ReferenceFactory
;
36 import eu
.etaxonomy
.cdm
.model
.taxon
.Classification
;
37 import eu
.etaxonomy
.cdm
.model
.taxon
.Synonym
;
38 import eu
.etaxonomy
.cdm
.model
.taxon
.Taxon
;
39 import eu
.etaxonomy
.cdm
.model
.taxon
.TaxonBase
;
40 import eu
.etaxonomy
.cdm
.strategy
.exceptions
.StringNotParsableException
;
41 import eu
.etaxonomy
.cdm
.strategy
.exceptions
.UnknownCdmTypeException
;
42 import eu
.etaxonomy
.cdm
.strategy
.parser
.NonViralNameParserImpl
;
49 public class DwcTaxonCsv2CdmTaxonConverter
extends PartitionableConverterBase
<DwcaImportState
> implements IPartitionableConverter
<CsvStreamItem
, IReader
<CdmBase
>, String
>{
50 @SuppressWarnings("unused")
51 private static Logger logger
= Logger
.getLogger(DwcTaxonCsv2CdmTaxonConverter
.class);
53 private static final String ID
= "id";
54 // temporary key for the case that no dataset information is supplied, TODO use something better
55 public static final String NO_DATASET
= "no_dataset_jli773oebhjklw";
57 private NonViralNameParserImpl parser
= NonViralNameParserImpl
.NewInstance();
/**
 * Creates a converter bound to the given import state.
 *
 * @param state the import state; handed on to the base class constructor
 */
public DwcTaxonCsv2CdmTaxonConverter(DwcaImportState state) {
    super(state);
}
67 public IReader
<MappedCdmBase
> map(CsvStreamItem csvTaxonRecord
){
68 List
<MappedCdmBase
> resultList
= new ArrayList
<MappedCdmBase
>();
70 //TODO what if not transactional?
71 Reference
<?
> sourceReference
= state
.getTransactionalSourceReference();
72 String sourceReferenceDetail
= null;
75 TaxonBase
<?
> taxonBase
= getTaxonBase(csvTaxonRecord
);
76 MappedCdmBase mcb
= new MappedCdmBase(csvTaxonRecord
.term
, csvTaxonRecord
.get(ID
), taxonBase
);
80 String id
= csvTaxonRecord
.get(ID
);
81 IdentifiableSource source
= taxonBase
.addSource(id
, "Taxon", sourceReference
, sourceReferenceDetail
);
82 MappedCdmBase mappedSource
= new MappedCdmBase(csvTaxonRecord
.get(ID
), source
);
83 resultList
.add(mappedSource
);
84 csvTaxonRecord
.remove(ID
);
87 NomenclaturalCode nomCode
= getNomCode(csvTaxonRecord
);
88 Rank rank
= getRank(csvTaxonRecord
, nomCode
);
90 //name && name published in
91 TaxonNameBase
<?
,?
> name
= getScientificName(csvTaxonRecord
, nomCode
, rank
, resultList
, sourceReference
);
92 taxonBase
.setName(name
);
95 MappedCdmBase
<Reference
> sec
= getNameAccordingTo(csvTaxonRecord
, resultList
);
96 if (sec
== null && state
.getConfig().isUseSourceReferenceAsSec()){
97 sec
= new MappedCdmBase
<Reference
>(state
.getTransactionalSourceReference());
100 taxonBase
.setSec(sec
.getCdmBase());
104 handleDataset(csvTaxonRecord
, taxonBase
, resultList
, sourceReference
, sourceReferenceDetail
);
107 //term="http://purl.org/dc/terms/identifier"
108 //currently only LSIDs
109 handleIdentifier(csvTaxonRecord
, taxonBase
);
113 // <!-- Top level group; listed as kingdom but may be interpreted as domain or superkingdom
114 // The following eight groups are recognized: Animalia, Archaea, Bacteria, Chromista,
115 // Fungi, Plantae, Protozoa, Viruses -->
116 // <field index='10' term='http://rs.tdwg.org/dwc/terms/kingdom'/>
118 // <!-- Phylum in which the taxon has been classified -->
119 // <field index='11' term='http://rs.tdwg.org/dwc/terms/phylum'/>
121 // <!-- Class in which the taxon has been classified -->
122 // <field index='12' term='http://rs.tdwg.org/dwc/terms/class'/>
124 // <!-- Order in which the taxon has been classified -->
125 // <field index='13' term='http://rs.tdwg.org/dwc/terms/order'/>
127 // <!-- Family in which the taxon has been classified -->
128 // <field index='14' term='http://rs.tdwg.org/dwc/terms/family'/>
130 // <!-- Genus in which the taxon has been classified -->
131 // <field index='15' term='http://rs.tdwg.org/dwc/terms/genus'/>
133 // <!-- Subgenus in which the taxon has been classified -->
134 // <field index='16' term='http://rs.tdwg.org/dwc/terms/subgenus'/>
135 // <!-- Specific epithet; for hybrids, the multiplication symbol is included in the epithet -->
137 // <field index='17' term='http://rs.tdwg.org/dwc/terms/specificEpithet'/>
138 // <!-- Infraspecific epithet -->
140 // <field index='18' term='http://rs.tdwg.org/dwc/terms/infraspecificEpithet'/>
141 // <!-- Authorship -->
143 // <field index='19' term='http://rs.tdwg.org/dwc/terms/scientificNameAuthorship'/>
144 // ==> see scientific name
146 // <!-- Acceptance status published in -->
147 // <field index='20' term='http://purl.org/dc/terms/source'/>
148 // <!-- Reference in which the scientific name was first published -->
149 // <field index='21' term='http://rs.tdwg.org/dwc/terms/namePublishedIn'/>
150 // <!-- Taxon scrutinized by -->
151 // <field index='22' term='http://rs.tdwg.org/dwc/terms/nameAccordingTo'/>
152 // <!-- Scrutiny date -->
153 // <field index='23' term='http://purl.org/dc/terms/modified'/>
154 // <!-- Additional data for the taxon -->
155 // <field index='24' term='http://purl.org/dc/terms/description'/>
158 return new ListReader
<MappedCdmBase
>(resultList
);
163 //TODO handle non LSIDs
164 //TODO handle LSIDs for names
165 private void handleIdentifier(CsvStreamItem csvTaxonRecord
, TaxonBase
<?
> taxonBase
) {
166 String identifier
= csvTaxonRecord
.get(TermUri
.DC_IDENTIFIER
);
167 if (StringUtils
.isNotBlank(identifier
)){
168 if (identifier
.trim().startsWith("urn:lsid")){
170 LSID lsid
= new LSID(identifier
);
171 taxonBase
.setLsid(lsid
);
172 } catch (MalformedLSIDException e
) {
173 String message
= "LSID is malformed and can't be handled as LSID: %s";
174 message
= String
.format(message
, identifier
);
175 fireWarningEvent(message
, csvTaxonRecord
, 4);
178 String message
= "Identifier type not supported: %s";
179 message
= String
.format(message
, identifier
);
180 fireWarningEvent(message
, csvTaxonRecord
, 4);
187 private void handleDataset(CsvStreamItem item
, TaxonBase
<?
> taxonBase
, List
<MappedCdmBase
> resultList
, Reference
<?
> sourceReference
, String sourceReferecenDetail
) {
188 TermUri idTerm
= TermUri
.DWC_DATASET_ID
;
189 TermUri strTerm
= TermUri
.DWC_DATASET_NAME
;
191 if (config
.isDatasetsAsClassifications()){
192 String datasetId
= CdmUtils
.Nz(item
.get(idTerm
)).trim();
193 String datasetName
= CdmUtils
.Nz(item
.get(strTerm
)).trim();
194 if (CdmUtils
.areBlank(datasetId
, datasetName
) ){
195 datasetId
= NO_DATASET
;
199 boolean classificationExists
= state
.exists(idTerm
.toString() , datasetId
, Classification
.class);
202 if (!classificationExists
){
203 classificationExists
= state
.exists(strTerm
.toString() , datasetName
, Classification
.class);
206 //if not exists, create new
207 if (! classificationExists
){
208 String classificationName
= StringUtils
.isBlank(datasetName
)? datasetId
: datasetName
;
209 if (classificationName
.equals(NO_DATASET
)){
210 classificationName
= "Classification (no name)"; //TODO define by config or zipfile or metadata
213 String classificationId
= StringUtils
.isBlank(datasetId
)? datasetName
: datasetId
;
214 Classification classification
= Classification
.NewInstance(classificationName
);
216 IdentifiableSource source
= classification
.addSource(classificationId
, "Dataset", sourceReference
, sourceReferecenDetail
);
218 resultList
.add(new MappedCdmBase(idTerm
, datasetId
, classification
));
219 resultList
.add(new MappedCdmBase(strTerm
, datasetName
, classification
));
220 resultList
.add(new MappedCdmBase(source
));
221 //TODO this is not so nice but currently necessary as classifications are requested in the same partition
222 state
.putMapping(idTerm
.toString(), classificationId
, classification
);
223 state
.putMapping(strTerm
.toString(), classificationName
, classification
);
225 }else if (config
.isDatasetsAsSecundumReference() || config
.isDatasetsAsOriginalSource()){
226 MappedCdmBase
<Reference
> mappedCitation
= getReference(item
, resultList
, idTerm
, strTerm
, true);
227 if (mappedCitation
!= null){
228 Reference
<?
> ref
= mappedCitation
.getCdmBase();
229 if (config
.isDatasetsAsSecundumReference()){
230 //dataset as secundum reference
231 taxonBase
.setSec(ref
);
233 //dataset as original source
234 taxonBase
.addSource(null, null, ref
, null);
238 String message
= "DatasetUse type not yet implemented. Can't import dataset information.";
239 fireWarningEvent(message
, item
, 4);
242 //remove to later check if all attributes were used
244 item
.remove(strTerm
);
250 public String
getSourceId(CsvStreamItem item
) {
251 String id
= item
.get(ID
);
255 private MappedCdmBase
<Reference
> getNameAccordingTo(CsvStreamItem item
, List
<MappedCdmBase
> resultList
) {
256 if (config
.isDatasetsAsSecundumReference()){
257 //TODO store nameAccordingTo info some where else or let the user define where to store it.
260 TermUri idTerm
= TermUri
.DWC_NAME_ACCORDING_TO_ID
;
261 TermUri strTerm
= TermUri
.DWC_NAME_ACCORDING_TO
;
262 MappedCdmBase
<Reference
> secRef
= getReference(item
, resultList
, idTerm
, strTerm
, false);
267 private NomenclaturalCode
getNomCode(CsvStreamItem item
) {
268 String strNomCode
= getValue(item
, TermUri
.DWC_NOMENCLATURAL_CODE
);
269 NomenclaturalCode nomCode
= null;
270 // by Nomcenclatural Code
271 if (strNomCode
!= null){
272 nomCode
= NomenclaturalCode
.fromString(strNomCode
);
273 if (nomCode
== null){
274 String message
= "NomCode '%s' not recognized";
275 message
= String
.format(message
, strNomCode
);
276 fireWarningEvent(message
, item
, 4);
282 String strKingdom
= getValue(item
, TermUri
.DWC_KINGDOM
);
283 if (strKingdom
!= null){
284 if (strKingdom
.equalsIgnoreCase("Plantae")){
285 nomCode
= NomenclaturalCode
.ICNAFP
;
286 }else if (strKingdom
.equalsIgnoreCase("Fungi")){
287 nomCode
= NomenclaturalCode
.ICNAFP
;
288 }else if (strKingdom
.equalsIgnoreCase("Animalia")){
289 nomCode
= NomenclaturalCode
.ICZN
;
290 }else if (strKingdom
.equalsIgnoreCase("Protozoa")){
291 nomCode
= NomenclaturalCode
.ICZN
;
295 //TODO further kingdoms
296 if (nomCode
== null){
298 if (config
.getNomenclaturalCode() != null){
299 nomCode
= config
.getNomenclaturalCode();
306 private TaxonNameBase
<?
,?
> getScientificName(CsvStreamItem item
, NomenclaturalCode nomCode
, Rank rank
, List
<MappedCdmBase
> resultList
, Reference sourceReference
) {
307 TaxonNameBase
<?
,?
> name
= null;
308 String strScientificName
= getValue(item
, TermUri
.DWC_SCIENTIFIC_NAME
);
310 if (strScientificName
!= null){
311 name
= parser
.parseFullName(strScientificName
, nomCode
, rank
);
312 if ( rank
!= null && name
!= null && name
.getRank() != null && ! rank
.equals(name
.getRank())){
313 if (config
.isValidateRankConsistency()){
314 String message
= "Parsed rank %s (%s) differs from rank %s given by fields 'taxonRank' or 'verbatimTaxonRank'";
315 message
= String
.format(message
, name
.getRank().getTitleCache(), strScientificName
, rank
.getTitleCache());
316 fireWarningEvent(message
, item
, 4);
319 checkAuthorship(name
, item
);
320 resultList
.add(new MappedCdmBase(TermUri
.DWC_SCIENTIFIC_NAME
, strScientificName
, name
));
323 String strScientificNameId
= getValue(item
, TermUri
.DWC_SCIENTIFIC_NAME_ID
);
324 if (strScientificNameId
!= null){
325 if (config
.isScientificNameIdAsOriginalSourceId()){
327 IdentifiableSource source
= IdentifiableSource
.NewInstance(strScientificNameId
, TermUri
.DWC_SCIENTIFIC_NAME_ID
.toString(), sourceReference
, null);
328 name
.addSource(source
);
331 String message
= "ScientificNameId not yet implemented: '%s'";
332 message
= String
.format(message
, strScientificNameId
);
333 fireWarningEvent(message
, item
, 4);
338 TermUri idTerm
= TermUri
.DWC_NAME_PUBLISHED_IN_ID
;
339 TermUri strTerm
= TermUri
.DWC_NAME_PUBLISHED_IN
;
340 MappedCdmBase
<Reference
> nomRef
= getReference(item
, resultList
, idTerm
, strTerm
, false);
344 name
.setNomenclaturalReference(nomRef
.getCdmBase()); //check if name already has a nomRef, shouldn't be the case usually
348 String message
= "NamePublishedIn information available but no name exists";
349 fireWarningEvent(message
, item
, 4);
357 * General method to handle references used for multiple attributes.
362 * @param idIsInternal
365 private MappedCdmBase
<Reference
> getReference(CsvStreamItem item
, List
<MappedCdmBase
> resultList
, TermUri idTerm
, TermUri strTerm
, boolean idIsInternal
) {
366 Reference
<?
> newRef
= null;
367 Reference
<?
> sourceCitation
= null;
369 MappedCdmBase
<Reference
> result
= null;
370 if (exists(idTerm
, item
) || exists(strTerm
, item
)){
371 String refId
= CdmUtils
.Nz(item
.get(idTerm
)).trim();
372 String refStr
= CdmUtils
.Nz(item
.get(strTerm
)).trim();
373 if (StringUtils
.isNotBlank(refId
)){
374 List
<Reference
> references
= state
.get(idTerm
.toString(), refId
, Reference
.class);
375 if (references
.size() == 0){
377 //references should already exist in store if not linking to external links like URLs
378 String message
= "External namePublishedInIDs are not yet supported";
379 fireWarningEvent(message
, item
, 4);
381 newRef
= ReferenceFactory
.newGeneric(); //TODO handle other types if possible
382 newRef
.addSource(refId
, idTerm
.toString(), sourceCitation
, null);
383 MappedCdmBase
<Reference
> idResult
= new MappedCdmBase
<Reference
>(idTerm
, refId
, newRef
);
384 resultList
.add(idResult
);
387 //TODO handle list.size > 1 , do we need a list here ?
388 result
= new MappedCdmBase
<Reference
>(idTerm
, refId
, references
.get(0));
392 List
<Reference
> nomRefs
= state
.get(strTerm
.toString(), refStr
, Reference
.class);
393 if (nomRefs
.size() > 0){
394 //TODO handle list.size > 1 , do we need a list here ?
395 result
= new MappedCdmBase
<Reference
>(strTerm
, refStr
, nomRefs
.get(0));
399 newRef
= ReferenceFactory
.newGeneric(); //TODO handle other types if possible
401 newRef
.setTitleCache(refStr
, true);
402 //TODO distinguish available year, authorship, etc. if
403 result
= new MappedCdmBase
<Reference
>(strTerm
, refStr
, newRef
);
404 resultList
.add(result
);
412 //TODO we may configure in configuration that scientific name never includes Authorship
413 private void checkAuthorship(TaxonNameBase nameBase
, CsvStreamItem item
) {
414 if (!nameBase
.isInstanceOf(NonViralName
.class)){
417 NonViralName
<?
> nvName
= CdmBase
.deproxy(nameBase
, NonViralName
.class);
418 String strAuthors
= getValue(item
, TermUri
.DWC_SCIENTIFIC_NAME_AUTHORS
);
420 if (! nvName
.isProtectedTitleCache()){
421 if (StringUtils
.isBlank(nvName
.getAuthorshipCache())){
422 if (nvName
.isInstanceOf(BotanicalName
.class) || nvName
.isInstanceOf(ZoologicalName
.class)){
423 //TODO can't we also parse NonViralNames correctly ?
425 parser
.parseAuthors(nvName
, strAuthors
);
426 } catch (StringNotParsableException e
) {
427 nvName
.setAuthorshipCache(strAuthors
);
430 nvName
.setAuthorshipCache(strAuthors
);
432 //TODO throw warning (scientific name should always include authorship) by DwC definition
439 private Rank
getRank(CsvStreamItem csvTaxonRecord
, NomenclaturalCode nomCode
) {
440 boolean USE_UNKNOWN
= true;
442 String strRank
= getValue(csvTaxonRecord
,TermUri
.DWC_TAXON_RANK
);
443 String strVerbatimRank
= getValue(csvTaxonRecord
,TermUri
.DWC_VERBATIM_TAXON_RANK
);
444 if (strRank
!= null){
446 rank
= Rank
.getRankByEnglishName(strRank
, nomCode
, USE_UNKNOWN
);
447 if (rank
.equals(Rank
.UNKNOWN_RANK())){
448 rank
= Rank
.getRankByNameOrAbbreviation(strRank
, USE_UNKNOWN
);
449 if (rank
.equals(Rank
.UNKNOWN_RANK())){
450 String message
= "Rank can not be defined for '%s'";
451 message
= String
.format(message
, strRank
);
452 fireWarningEvent(message
, csvTaxonRecord
, 4);
455 } catch (UnknownCdmTypeException e
) {
456 //should not happen as USE_UNKNOWN is used
457 rank
= Rank
.UNKNOWN_RANK();
460 if ( (rank
== null || rank
.equals(Rank
.UNKNOWN_RANK())) && strVerbatimRank
!= null){
462 rank
= Rank
.getRankByNameOrAbbreviation(strVerbatimRank
, USE_UNKNOWN
);
463 if (rank
.equals(Rank
.UNKNOWN_RANK())){
464 String message
= "Rank can not be defined for '%s'";
465 message
= String
.format(message
, strVerbatimRank
);
466 fireWarningEvent(message
, csvTaxonRecord
, 4);
468 } catch (UnknownCdmTypeException e
) {
469 //should not happen as USE_UNKNOWN is used
470 rank
= Rank
.UNKNOWN_RANK();
478 * Creates an empty taxon object with a given status.
482 private TaxonBase
<?
> getTaxonBase(CsvStreamItem item
) {
483 TaxonNameBase
<?
,?
> name
= null;
484 Reference
<?
> sec
= null;
486 String taxStatus
= item
.get(TermUri
.DWC_TAXONOMIC_STATUS
);
489 if (taxStatus
!= null){
490 if (taxStatus
.matches("accepted.*|valid")){
492 } else if (taxStatus
.matches(".*synonym|invalid|not accepted")){ //not accepted comes from scratchpads
494 } else if (taxStatus
.matches("misapplied.*")){
499 item
.remove(TermUri
.DWC_TAXONOMIC_STATUS
);
501 if (! CdmUtils
.isBlank(item
.get(TermUri
.DWC_ACCEPTED_NAME_USAGE_ID
))){
502 // acceptedNameUsageId = id
503 if (getSourceId(item
).equals(item
.get(TermUri
.DWC_ACCEPTED_NAME_USAGE_ID
))){
509 if (status
.contains("A") || status
.contains("M")){
510 result
= Taxon
.NewInstance(name
, sec
);
511 if (status
.contains("S") && ! status
.contains("M") ){
512 String message
= "Ambigous taxon status (%s)";
513 message
= String
.format(message
, status
);
514 fireWarningEvent(message
, item
, 6);
516 } else if (status
.contains("S")){
517 result
= Synonym
.NewInstance(name
, sec
);
519 result
= Taxon
.NewUnknownStatusInstance(name
, sec
);
526 // ********************** PARTITIONABLE ****************************************/
530 protected void makeForeignKeysForItem(CsvStreamItem item
, Map
<String
, Set
<String
>> fkMap
) {
535 if ( hasValue(value
= item
.get(key
= TermUri
.DWC_NAME_PUBLISHED_IN_ID
.toString()))){
536 Set
<String
> keySet
= getKeySet(key
, fkMap
);
539 if (config
.isDeduplicateNamePublishedIn()){
540 if ( hasValue(value
= item
.get(key
= TermUri
.DWC_NAME_PUBLISHED_IN
.toString()))){
541 Set
<String
> keySet
= getKeySet(key
, fkMap
);
547 if (! config
.isDatasetsAsSecundumReference()){
548 if ( hasValue(value
= item
.get(key
= TermUri
.DWC_NAME_ACCORDING_TO_ID
.toString()))){
549 Set
<String
> keySet
= getKeySet(key
, fkMap
);
552 if ( hasValue(value
= item
.get(key
= TermUri
.DWC_NAME_ACCORDING_TO
.toString()))){
553 Set
<String
> keySet
= getKeySet(key
, fkMap
);
559 if ( hasValue(value
= item
.get(key
= TermUri
.DWC_DATASET_ID
.toString()))){
560 Set
<String
> keySet
= getKeySet(key
, fkMap
);
563 if ( hasValue(value
= item
.get(key
= TermUri
.DWC_DATASET_NAME
.toString()))){
564 Set
<String
> keySet
= getKeySet(key
, fkMap
);
572 public Set
<String
> requiredSourceNamespaces() {
573 Set
<String
> result
= new HashSet
<String
>();
574 result
.add(TermUri
.DWC_NAME_PUBLISHED_IN_ID
.toString());
575 result
.add(TermUri
.DWC_NAME_PUBLISHED_IN
.toString());
576 if (!config
.isDatasetsAsSecundumReference()){
577 result
.add(TermUri
.DWC_NAME_ACCORDING_TO_ID
.toString());
578 result
.add(TermUri
.DWC_NAME_ACCORDING_TO
.toString());
580 result
.add(TermUri
.DWC_DATASET_ID
.toString());
581 result
.add(TermUri
.DWC_DATASET_NAME
.toString());
585 //** ***************************** TO STRING *********************************************/
588 public String
toString(){
589 return this.getClass().getName();