3 * Copyright (C) 2009 EDIT
4 * European Distributed Institute of Taxonomy
5 * http://www.e-taxonomy.eu
7 * The contents of this file are subject to the Mozilla Public License Version 1.1
8 * See LICENSE.TXT at the top of this package for the full license terms.
10 package eu
.etaxonomy
.cdm
.io
.dwca
.in
;
12 import java
.util
.ArrayList
;
13 import java
.util
.List
;
17 import org
.apache
.commons
.lang
.StringUtils
;
18 import org
.apache
.log4j
.Logger
;
20 import com
.ibm
.lsid
.MalformedLSIDException
;
22 import eu
.etaxonomy
.cdm
.common
.CdmUtils
;
23 import eu
.etaxonomy
.cdm
.io
.dwca
.TermUri
;
24 import eu
.etaxonomy
.cdm
.model
.common
.CdmBase
;
25 import eu
.etaxonomy
.cdm
.model
.common
.IdentifiableSource
;
26 import eu
.etaxonomy
.cdm
.model
.common
.LSID
;
27 import eu
.etaxonomy
.cdm
.model
.name
.NomenclaturalCode
;
28 import eu
.etaxonomy
.cdm
.model
.name
.NonViralName
;
29 import eu
.etaxonomy
.cdm
.model
.name
.Rank
;
30 import eu
.etaxonomy
.cdm
.model
.name
.TaxonNameBase
;
31 import eu
.etaxonomy
.cdm
.model
.reference
.Reference
;
32 import eu
.etaxonomy
.cdm
.model
.reference
.ReferenceFactory
;
33 import eu
.etaxonomy
.cdm
.model
.taxon
.Classification
;
34 import eu
.etaxonomy
.cdm
.model
.taxon
.Synonym
;
35 import eu
.etaxonomy
.cdm
.model
.taxon
.Taxon
;
36 import eu
.etaxonomy
.cdm
.model
.taxon
.TaxonBase
;
37 import eu
.etaxonomy
.cdm
.strategy
.exceptions
.UnknownCdmTypeException
;
38 import eu
.etaxonomy
.cdm
.strategy
.parser
.INonViralNameParser
;
39 import eu
.etaxonomy
.cdm
.strategy
.parser
.NonViralNameParserImpl
;
46 public class DwcTaxonCsv2CdmTaxonConverter
extends PartitionableConverterBase
<DwcaImportState
> implements IPartitionableConverter
<CsvStreamItem
, IReader
<CdmBase
>, String
>{
47 @SuppressWarnings("unused")
48 private static Logger logger
= Logger
.getLogger(DwcTaxonCsv2CdmTaxonConverter
.class);
50 private static final String ID
= "id";
51 // key for for case that no dataset information is supplied, TODO use something better
52 public static final String NO_DATASET
= "no_dataset_jli773oebhjklw";
/**
 * Creates a converter working on the given import state.
 * NOTE(review): the constructor body is not visible in this excerpt; presumably it only
 * hands {@code state} to the superclass constructor - confirm.
 *
 * @param state the import state of the running DwC-A import
 */
58 public DwcTaxonCsv2CdmTaxonConverter(DwcaImportState state
) {
/**
 * Maps one taxon CSV record onto CDM objects and returns them wrapped in a reader.
 * In the visible lines this: creates the taxon base, registers an identifiable source for
 * the record id (and removes the id entry from the record), determines nomenclatural code
 * and rank, sets the scientific name and the sec reference on the taxon base, handles the
 * dataset and finally evaluates the dc:identifier field.
 *
 * @param csvTaxonRecord the record to convert; it is modified (entries are removed)
 * @return a ListReader over the list of mapped objects collected while converting
 */
64 public IReader
<MappedCdmBase
> map(CsvStreamItem csvTaxonRecord
){
65 List
<MappedCdmBase
> resultList
= new ArrayList
<MappedCdmBase
>();
// both source values stay null in all visible lines of this method
67 Reference
<?
> sourceReference
= null;
68 String sourceReferenceDetail
= null;
// taxon (accepted taxon, synonym or unknown status, decided by getTaxonBase)
71 TaxonBase
<?
> taxonBase
= getTaxonBase(csvTaxonRecord
);
// NOTE(review): no statement adding mcb to resultList is visible in this excerpt -
// confirm that such a statement exists in the full file, otherwise mcb is unused
72 MappedCdmBase mcb
= new MappedCdmBase(csvTaxonRecord
.term
, csvTaxonRecord
.get(ID
), taxonBase
);
// original source: the record id is stored as source id with namespace "Taxon"
76 String id
= csvTaxonRecord
.get(ID
);
77 IdentifiableSource source
= taxonBase
.addSource(id
, "Taxon", sourceReference
, sourceReferenceDetail
);
78 MappedCdmBase mappedSource
= new MappedCdmBase(csvTaxonRecord
.get(ID
), source
);
79 resultList
.add(mappedSource
);
// the id must be read before this point, it is removed from the record here
80 csvTaxonRecord
.remove(ID
);
// nomenclatural code first, the rank lookup takes it as input
83 NomenclaturalCode nomCode
= getNomCode(csvTaxonRecord
);
84 Rank rank
= getRank(csvTaxonRecord
, nomCode
);
// scientific name (getScientificName also handles namePublishedIn)
87 TaxonNameBase
<?
,?
> name
= getScientificName(csvTaxonRecord
, nomCode
, rank
, resultList
);
88 taxonBase
.setName(name
);
// nameAccordingTo becomes the sec reference
91 Reference
<?
> sec
= getNameAccordingTo(csvTaxonRecord
, resultList
);
92 taxonBase
.setSec(sec
);
// dataset id / dataset name are turned into a classification if none is known yet
95 handleDataset(csvTaxonRecord
, resultList
, sourceReference
, sourceReferenceDetail
);
98 //term="http://purl.org/dc/terms/identifier"
99 //currently only LSIDs
100 handleIdentifier(csvTaxonRecord
, taxonBase
);
// the commented field list below enumerates DwC-A columns; no visible code in this
// method evaluates them directly
104 // <!-- Top level group; listed as kingdom but may be interpreted as domain or superkingdom
105 // The following eight groups are recognized: Animalia, Archaea, Bacteria, Chromista,
106 // Fungi, Plantae, Protozoa, Viruses -->
107 // <field index='10' term='http://rs.tdwg.org/dwc/terms/kingdom'/>
109 // <!-- Phylum in which the taxon has been classified -->
110 // <field index='11' term='http://rs.tdwg.org/dwc/terms/phylum'/>
112 // <!-- Class in which the taxon has been classified -->
113 // <field index='12' term='http://rs.tdwg.org/dwc/terms/class'/>
115 // <!-- Order in which the taxon has been classified -->
116 // <field index='13' term='http://rs.tdwg.org/dwc/terms/order'/>
118 // <!-- Family in which the taxon has been classified -->
119 // <field index='14' term='http://rs.tdwg.org/dwc/terms/family'/>
121 // <!-- Genus in which the taxon has been classified -->
122 // <field index='15' term='http://rs.tdwg.org/dwc/terms/genus'/>
124 // <!-- Subgenus in which the taxon has been classified -->
125 // <field index='16' term='http://rs.tdwg.org/dwc/terms/subgenus'/>
126 // <!-- Specific epithet; for hybrids, the multiplication symbol is included in the epithet -->
128 // <field index='17' term='http://rs.tdwg.org/dwc/terms/specificEpithet'/>
129 // <!-- Infraspecific epithet -->
131 // <field index='18' term='http://rs.tdwg.org/dwc/terms/infraspecificEpithet'/>
132 // <!-- Authorship -->
134 // <field index='19' term='http://rs.tdwg.org/dwc/terms/scientificNameAuthorship'/>
135 // ==> see scientific name
137 // <!-- Acceptance status published in -->
138 // <field index='20' term='http://purl.org/dc/terms/source'/>
139 // <!-- Reference in which the scientific name was first published -->
140 // <field index='21' term='http://rs.tdwg.org/dwc/terms/namePublishedIn'/>
141 // <!-- Taxon scrutinized by -->
142 // <field index='22' term='http://rs.tdwg.org/dwc/terms/nameAccordingTo'/>
143 // <!-- Scrutiny date -->
144 // <field index='23' term='http://purl.org/dc/terms/modified'/>
145 // <!-- Additional data for the taxon -->
146 // <field index='24' term='http://purl.org/dc/terms/description'/>
149 return new ListReader
<MappedCdmBase
>(resultList
);
154 //TODO handle non LSIDs
155 //TODO handle LSIDs for names
156 private void handleIdentifier(CsvStreamItem csvTaxonRecord
, TaxonBase
<?
> taxonBase
) {
157 String identifier
= csvTaxonRecord
.get(TermUri
.DC_IDENTIFIER
);
158 if (StringUtils
.isNotBlank(identifier
)){
159 if (identifier
.trim().startsWith("urn:lsid")){
161 LSID lsid
= new LSID(identifier
);
162 taxonBase
.setLsid(lsid
);
163 } catch (MalformedLSIDException e
) {
164 String message
= "LSID is malformed and can't be handled as LSID: %s";
165 message
= String
.format(message
, identifier
);
166 fireWarningEvent(message
, csvTaxonRecord
, 4);
169 String message
= "Identifier type not supported: %s";
170 message
= String
.format(message
, identifier
);
171 fireWarningEvent(message
, csvTaxonRecord
, 4);
178 private void handleDataset(CsvStreamItem csvTaxonRecord
, List
<MappedCdmBase
> resultList
, Reference
<?
> sourceReference
, String sourceReferecenDetail
) {
179 String datasetId
= CdmUtils
.Nz(csvTaxonRecord
.get(TermUri
.DWC_DATASET_ID
)).trim();
180 String datasetName
= CdmUtils
.Nz(csvTaxonRecord
.get(TermUri
.DWC_DATASET_NAME
)).trim();
181 if (CdmUtils
.areBlank(datasetId
, datasetName
) ){
182 datasetId
= NO_DATASET
;
186 boolean classificationExists
= state
.exists(TermUri
.DWC_DATASET_ID
.toString() , datasetId
, Classification
.class);
189 if (!classificationExists
){
190 classificationExists
= state
.exists(TermUri
.DWC_DATASET_NAME
.toString() , datasetName
, Classification
.class);
193 //if not exists, create new
194 if (! classificationExists
){
195 String classificationName
= StringUtils
.isBlank(datasetName
)? datasetId
: datasetName
;
196 if (classificationName
.equals(NO_DATASET
)){
197 classificationName
= "Classification (no name)"; //TODO define by config or zipfile or metadata
200 String classificationId
= StringUtils
.isBlank(datasetId
)? datasetName
: datasetId
;
201 Classification classification
= Classification
.NewInstance(classificationName
);
203 IdentifiableSource source
= classification
.addSource(classificationId
, "Dataset", sourceReference
, sourceReferecenDetail
);
205 resultList
.add(new MappedCdmBase(TermUri
.DWC_DATASET_ID
, datasetId
, classification
));
206 resultList
.add(new MappedCdmBase(TermUri
.DWC_DATASET_NAME
, datasetName
, classification
));
207 resultList
.add(new MappedCdmBase(source
));
208 //TODO this is not so nice but currently necessary as classifications are requested in the same partition
209 state
.putMapping(TermUri
.DWC_DATASET_ID
.toString(), classificationId
, classification
);
210 state
.putMapping(TermUri
.DWC_DATASET_NAME
.toString(), classificationName
, classification
);
213 //remove to later check if all attributes were used
214 csvTaxonRecord
.remove(TermUri
.DWC_DATASET_ID
);
215 csvTaxonRecord
.remove(TermUri
.DWC_DATASET_NAME
);
221 public String
getSourceId(CsvStreamItem item
) {
222 String id
= item
.get(ID
);
226 private Reference
<?
> getNameAccordingTo(CsvStreamItem item
, List
<MappedCdmBase
> resultList
) {
227 TermUri idTerm
= TermUri
.DWC_NAME_ACCORDING_TO_ID
;
228 TermUri strTerm
= TermUri
.DWC_NAME_ACCORDING_TO
;
229 Reference
<?
> secRef
= handleReference(item
, resultList
, idTerm
, strTerm
);
/**
 * Determines the nomenclatural code of a record.
 * The nomenclaturalCode field is evaluated first; an unrecognized value fires a warning
 * event. The visible lines then evaluate the kingdom field: "Plantae" and "Fungi" map to
 * ICBN, "Animalia" maps to ICZN (compared case-insensitively).
 * NOTE(review): the condition guarding the kingdom lookup, the body of the final
 * {@code nomCode == null} check and the return statement are not visible in this excerpt.
 *
 * @param item the record to read from
 */
234 private NomenclaturalCode
getNomCode(CsvStreamItem item
) {
235 String strNomCode
= getValue(item
, TermUri
.DWC_NOMENCLATURAL_CODE
);
236 NomenclaturalCode nomCode
= null;
237 // by Nomcenclatural Code
238 if (strNomCode
!= null){
239 nomCode
= NomenclaturalCode
.fromString(strNomCode
);
240 if (nomCode
== null){
241 String message
= "NomCode '%s' not recognized";
242 message
= String
.format(message
, strNomCode
);
243 fireWarningEvent(message
, item
, 4);
// NOTE(review): strKingdom is dereferenced below without a visible null check, although
// strNomCode (read through the same getValue helper) is null-checked above. If getValue can
// return null for a missing kingdom this throws a NullPointerException - consider
// "Plantae".equalsIgnoreCase(strKingdom) etc.
249 String strKingdom
= getValue(item
, TermUri
.DWC_KINGDOM
);
250 if (strKingdom
.equalsIgnoreCase("Plantae")){
251 nomCode
= NomenclaturalCode
.ICBN
;
252 }else if (strKingdom
.equalsIgnoreCase("Animalia")){
253 nomCode
= NomenclaturalCode
.ICZN
;
254 }else if (strKingdom
.equalsIgnoreCase("Fungi")){
255 nomCode
= NomenclaturalCode
.ICBN
;
257 //TODO further kingdoms
258 if (nomCode
== null){
/**
 * Creates the taxon name of a record from its scientificName field and attaches the
 * namePublishedIn reference to it.
 * The name string is parsed with a NonViralNameParserImpl using the given code and rank;
 * a warning is fired if the parsed rank differs from the given rank. A scientificNameID
 * is not evaluated, it only fires a "not yet implemented" warning.
 * NOTE(review): the conditions around setNomenclaturalReference and the return statement
 * are not visible in this excerpt.
 *
 * @param item the record to read from
 * @param nomCode nomenclatural code handed to the parser, may be null as far as visible
 * @param rank rank handed to the parser and compared with the parsed rank
 * @param resultList list collecting the mapped objects created for this record
 */
265 private TaxonNameBase
<?
,?
> getScientificName(CsvStreamItem item
, NomenclaturalCode nomCode
, Rank rank
, List
<MappedCdmBase
> resultList
) {
266 TaxonNameBase
<?
,?
> name
= null;
267 String strScientificName
= getValue(item
, TermUri
.DWC_SCIENTIFIC_NAME
);
269 if (strScientificName
!= null){
270 INonViralNameParser
<?
> parser
= NonViralNameParserImpl
.NewInstance();
271 name
= parser
.parseFullName(strScientificName
, nomCode
, rank
);
// only a warning: the parsed rank is kept even if it contradicts the rank fields
272 if (rank
!= null && name
!= null && name
.getRank() != null &&
273 ! rank
.equals(name
.getRank())){
274 String message
= "Parsed rank %s (%s) differs from rank %s given by fields 'taxonRank' or 'verbatimTaxonRank'";
275 message
= String
.format(message
, name
.getRank().getTitleCache(), strScientificName
, rank
.getTitleCache());
276 fireWarningEvent(message
, item
, 4);
// NOTE(review): the rank check above treats name as possibly null, but checkAuthorship
// dereferences its first argument without a null check - confirm parseFullName never
// returns null or guard this call
278 checkAuthorship(name
, item
);
279 resultList
.add(new MappedCdmBase(TermUri
.DWC_SCIENTIFIC_NAME
, strScientificName
, name
));
282 String strScientificNameId
= getValue(item
, TermUri
.DWC_SCIENTIFIC_NAME_ID
);
283 if (strScientificNameId
!= null){
284 String message
= "ScientificNameId not yet implemented: '%s'";
285 message
= String
.format(message
, strScientificNameId
);
286 fireWarningEvent(message
, item
, 4);
// name published in: resolved through the shared reference handling
290 TermUri idTerm
= TermUri
.DWC_NAME_PUBLISHED_IN_ID
;
291 TermUri strTerm
= TermUri
.DWC_NAME_PUBLISHED_IN
;
292 Reference
<?
> nomRef
= handleReference(item
, resultList
, idTerm
, strTerm
);
296 name
.setNomenclaturalReference(nomRef
); //check if name already has a nomRef, shouldn't be the case usually
300 String message
= "NamePublishedIn information available but no name exists";
301 fireWarningEvent(message
, item
, 4);
/**
 * Resolves a reference that a record gives either by id ({@code idTerm}) or by string
 * ({@code strTerm}).
 * Visible behavior: nothing is done unless at least one of the two terms exists in the
 * item. A non-blank id is looked up in the state; if nothing is found a warning is fired.
 * The string value is looked up in the state as well; if nothing is found a new generic
 * reference with the string as title cache is created and added to the result list.
 * NOTE(review): the branch conditions between the id lookup and the string lookup and
 * the return statement are not visible in this excerpt.
 *
 * @param item the record to read from
 * @param resultList list collecting the mapped objects created for this record
 * @param idTerm term holding the reference id
 * @param strTerm term holding the reference as plain string
 */
308 private Reference
<?
> handleReference(CsvStreamItem item
, List
<MappedCdmBase
> resultList
, TermUri idTerm
, TermUri strTerm
) {
310 Reference result
= null;
311 if (exists(idTerm
, item
) || exists(strTerm
, item
)){
312 String nomRefId
= CdmUtils
.Nz(item
.get(idTerm
)).trim();
313 String nomRefStr
= CdmUtils
.Nz(item
.get(strTerm
)).trim();
314 if (StringUtils
.isNotBlank(nomRefId
)){
315 List
<Reference
> nomRefs
= state
.get(idTerm
.toString(), nomRefId
, Reference
.class);
316 if (nomRefs
.size() == 0){
317 //references should already exist in store if not linking to external links like URLs
// NOTE(review): this method is also called for nameAccordingTo (see getNameAccordingTo),
// so the text "namePublishedInIDs" is misleading in that case
318 String message
= "External namePublishedInIDs are not yet supported";
319 fireWarningEvent(message
, item
, 4);
321 //TODO handle list.size > 1 , do we need a list here ?
322 result
= nomRefs
.get(0);
// lookup by the reference string
326 List
<Reference
> nomRefs
= state
.get(strTerm
.toString(), nomRefStr
, Reference
.class);
327 if (nomRefs
.size() > 0){
328 //TODO handle list.size > 1 , do we need a list here ?
329 result
= nomRefs
.get(0);
// no stored reference found: create a new generic one titled with the string
332 result
= ReferenceFactory
.newGeneric(); //TODO handle other types if possible
// presumably the second argument protects the title cache from regeneration - confirm
333 result
.setTitleCache(nomRefStr
, true);
334 //TODO distinguish available year, authorship, etc. if
335 resultList
.add(new MappedCdmBase(strTerm
, nomRefStr
, result
));
343 //TODO we may configure in configuration that scientific name never includes Authorship
344 private void checkAuthorship(TaxonNameBase nameBase
, CsvStreamItem item
) {
345 if (!nameBase
.isInstanceOf(NonViralName
.class)){
348 NonViralName
<?
> nvName
= CdmBase
.deproxy(nameBase
, NonViralName
.class);
349 String strAuthors
= getValue(item
, TermUri
.DWC_SCIENTIFIC_NAME_AUTHORS
);
351 if (! nvName
.isProtectedTitleCache()){
352 if (StringUtils
.isBlank(nvName
.getAuthorshipCache())){
353 //TODO some more sophisticated stuff can be done here like parsing etc.
354 nvName
.setAuthorshipCache(strAuthors
);
355 //TODO warning (scientific name should always include authorship)
362 private Rank
getRank(CsvStreamItem csvTaxonRecord
, NomenclaturalCode nomCode
) {
363 boolean USE_UNKNOWN
= true;
365 String strRank
= getValue(csvTaxonRecord
,TermUri
.DWC_TAXON_RANK
);
366 String strVerbatimRank
= getValue(csvTaxonRecord
,TermUri
.DWC_VERBATIM_TAXON_RANK
);
367 if (strRank
!= null){
369 rank
= Rank
.getRankByEnglishName(strRank
, nomCode
, USE_UNKNOWN
);
370 if (rank
.equals(Rank
.UNKNOWN_RANK())){
371 rank
= Rank
.getRankByNameOrAbbreviation(strRank
, USE_UNKNOWN
);
372 if (rank
.equals(Rank
.UNKNOWN_RANK())){
373 String message
= "Rank can not be defined for '%s'";
374 message
= String
.format(message
, strRank
);
375 fireWarningEvent(message
, csvTaxonRecord
, 4);
378 } catch (UnknownCdmTypeException e
) {
379 //should not happen as USE_UNKNOWN is used
380 rank
= Rank
.UNKNOWN_RANK();
383 if ( (rank
== null || rank
.equals(Rank
.UNKNOWN_RANK())) && strVerbatimRank
!= null){
385 rank
= Rank
.getRankByNameOrAbbreviation(strVerbatimRank
, USE_UNKNOWN
);
386 if (rank
.equals(Rank
.UNKNOWN_RANK())){
387 String message
= "Rank can not be defined for '%s'";
388 message
= String
.format(message
, strVerbatimRank
);
389 fireWarningEvent(message
, csvTaxonRecord
, 4);
391 } catch (UnknownCdmTypeException e
) {
392 //should not happen as USE_UNKNOWN is used
393 rank
= Rank
.UNKNOWN_RANK();
/**
 * Creates the (still nameless) taxon base for a record, depending on its taxonomic status.
 * Visible behavior: the taxonomicStatus value is matched against fixed patterns and then
 * removed from the item; acceptedNameUsageID is compared with the record's own id. A
 * string "status" then decides the type: containing "A" or "M" gives a Taxon (with an
 * "ambiguous" warning if it also contains "S" but not "M"), containing only "S" gives a
 * Synonym, anything else a Taxon of unknown status.
 * NOTE(review): the declarations of "status" and "result", the statements executed in
 * the matching branches and the return statement are not visible in this excerpt;
 * presumably the branches append the marker letters to "status" - confirm.
 *
 * @param item the record to read from; its taxonomicStatus entry is removed
 */
400 private TaxonBase
<?
> getTaxonBase(CsvStreamItem item
) {
// name and sec stay null in the visible lines; map() sets both on the result afterwards
401 TaxonNameBase
<?
,?
> name
= null;
402 Reference
<?
> sec
= null;
404 String taxStatus
= item
.get(TermUri
.DWC_TAXONOMIC_STATUS
);
406 boolean isMissaplied
= false;
// String.matches needs a full, case-sensitive match of the whole status value
407 if (taxStatus
!= null){
408 if (taxStatus
.matches("accepted|valid")){
410 }else if (taxStatus
.matches(".*synonym|invalid")){
// NOTE(review): this "if" follows the closing brace without "else"; the patterns do not
// overlap, so it looks like a typo for "else if" rather than intended behavior
412 }if (taxStatus
.matches("misapplied")){
417 item
.remove(TermUri
.DWC_TAXONOMIC_STATUS
);
419 if (! CdmUtils
.isBlank(item
.get(TermUri
.DWC_ACCEPTED_NAME_USAGE_ID
))){
420 // acceptedNameUsageId = id
// NOTE(review): getSourceId(item) returns item.get("id"); if that can be null here this
// comparison throws a NullPointerException - confirm the id is always present
421 if (getSourceId(item
).equals(item
.get(TermUri
.DWC_ACCEPTED_NAME_USAGE_ID
))){
// accepted or misapplied marker: create a Taxon
427 if (status
.contains("A") || status
.contains("M")){
428 result
= Taxon
.NewInstance(name
, sec
);
// accepted and synonym markers at once (without misapplied) are contradictory: warn only
429 if (status
.contains("S") && ! status
.contains("M") ){
430 String message
= "Ambigous taxon status (%s)";
431 message
= String
.format(message
, status
);
432 fireWarningEvent(message
, item
, 6);
434 }else if (status
.contains("S")){
435 result
= Synonym
.NewInstance(name
, sec
);
// no marker at all: taxon with unknown status
437 result
= Taxon
.NewUnknownStatusInstance(name
, sec
);
444 // ********************** PARTITIONABLE ****************************************/
/**
 * Collects the foreign keys of one item that are needed for handling its partition.
 * For each of the terms namePublishedInID, namePublishedIn, nameAccordingToID and
 * nameAccordingTo the item's value is read and, if present, the key set for that term is
 * fetched from fkMap. The namePublishedIn string lookup is only done if the configuration
 * has isDeduplicateNamePublishedIn() set.
 * NOTE(review): the declarations of "value" and "key" and the statements that use
 * "keySet" are not visible in this excerpt; presumably the value is added to the key
 * set - confirm.
 *
 * @param item the record to read from
 * @param fkMap map from term key to the set of foreign key values collected so far
 */
448 protected void makeForeignKeysForItem(CsvStreamItem item
, Map
<String
, Set
<String
>> fkMap
) {
// key and value are assigned inside the condition, so both are set when the body runs
453 if ( hasValue(value
= item
.get(key
= TermUri
.DWC_NAME_PUBLISHED_IN_ID
.toString()))){
454 Set
<String
> keySet
= getKeySet(key
, fkMap
);
457 if (state
.getConfig().isDeduplicateNamePublishedIn()){
458 if ( hasValue(value
= item
.get(key
= TermUri
.DWC_NAME_PUBLISHED_IN
.toString()))){
459 Set
<String
> keySet
= getKeySet(key
, fkMap
);
465 if ( hasValue(value
= item
.get(key
= TermUri
.DWC_NAME_ACCORDING_TO_ID
.toString()))){
466 Set
<String
> keySet
= getKeySet(key
, fkMap
);
469 if ( hasValue(value
= item
.get(key
= TermUri
.DWC_NAME_ACCORDING_TO
.toString()))){
470 Set
<String
> keySet
= getKeySet(key
, fkMap
);
476 //** ***************************** TO STRING *********************************************/
/**
 * Returns the fully qualified class name of this converter.
 */
479 public String
toString(){
480 return this.getClass().getName();