Updates to DwcA import (namePublishedIn, accordingTo, etc.)
[cdmlib.git] / cdmlib-io / src / main / java / eu / etaxonomy / cdm / io / dwca / in / DwcTaxonCsv2CdmTaxonConverter.java
1 // $Id$
2 /**
3 * Copyright (C) 2009 EDIT
4 * European Distributed Institute of Taxonomy
5 * http://www.e-taxonomy.eu
6 *
7 * The contents of this file are subject to the Mozilla Public License Version 1.1
8 * See LICENSE.TXT at the top of this package for the full license terms.
9 */
10 package eu.etaxonomy.cdm.io.dwca.in;
11
12 import java.util.ArrayList;
13 import java.util.List;
14 import java.util.Map;
15 import java.util.Set;
16
17 import org.apache.commons.lang.StringUtils;
18 import org.apache.log4j.Logger;
19
20 import com.ibm.lsid.MalformedLSIDException;
21
22 import eu.etaxonomy.cdm.common.CdmUtils;
23 import eu.etaxonomy.cdm.io.dwca.TermUri;
24 import eu.etaxonomy.cdm.model.common.CdmBase;
25 import eu.etaxonomy.cdm.model.common.IdentifiableSource;
26 import eu.etaxonomy.cdm.model.common.LSID;
27 import eu.etaxonomy.cdm.model.name.NomenclaturalCode;
28 import eu.etaxonomy.cdm.model.name.NonViralName;
29 import eu.etaxonomy.cdm.model.name.Rank;
30 import eu.etaxonomy.cdm.model.name.TaxonNameBase;
31 import eu.etaxonomy.cdm.model.reference.Reference;
32 import eu.etaxonomy.cdm.model.reference.ReferenceFactory;
33 import eu.etaxonomy.cdm.model.taxon.Classification;
34 import eu.etaxonomy.cdm.model.taxon.Synonym;
35 import eu.etaxonomy.cdm.model.taxon.Taxon;
36 import eu.etaxonomy.cdm.model.taxon.TaxonBase;
37 import eu.etaxonomy.cdm.strategy.exceptions.UnknownCdmTypeException;
38 import eu.etaxonomy.cdm.strategy.parser.INonViralNameParser;
39 import eu.etaxonomy.cdm.strategy.parser.NonViralNameParserImpl;
40
41 /**
42 * @author a.mueller
43 * @date 22.11.2011
44 *
45 */
46 public class DwcTaxonCsv2CdmTaxonConverter extends PartitionableConverterBase<DwcaImportState> implements IPartitionableConverter<CsvStreamItem, IReader<CdmBase>, String>{
47 @SuppressWarnings("unused")
48 private static Logger logger = Logger.getLogger(DwcTaxonCsv2CdmTaxonConverter.class);
49
50 private static final String ID = "id";
51 // key for for case that no dataset information is supplied, TODO use something better
52 public static final String NO_DATASET = "no_dataset_jli773oebhjklw";
53
54
/**
 * Creates a converter working on the given import state.
 *
 * @param state the import state; stored in the inherited <code>state</code> field
 */
public DwcTaxonCsv2CdmTaxonConverter(DwcaImportState state) {
    this.state = state;
}
62
63
64 public IReader<MappedCdmBase> map(CsvStreamItem csvTaxonRecord){
65 List<MappedCdmBase> resultList = new ArrayList<MappedCdmBase>();
66
67 Reference<?> sourceReference = null;
68 String sourceReferenceDetail = null;
69
70 //taxon
71 TaxonBase<?> taxonBase = getTaxonBase(csvTaxonRecord);
72 MappedCdmBase mcb = new MappedCdmBase(csvTaxonRecord.term, csvTaxonRecord.get(ID), taxonBase);
73 resultList.add(mcb);
74
75 //original source
76 String id = csvTaxonRecord.get(ID);
77 IdentifiableSource source = taxonBase.addSource(id, "Taxon", sourceReference, sourceReferenceDetail);
78 MappedCdmBase mappedSource = new MappedCdmBase(csvTaxonRecord.get(ID), source);
79 resultList.add(mappedSource);
80 csvTaxonRecord.remove(ID);
81
82 //rank
83 NomenclaturalCode nomCode = getNomCode(csvTaxonRecord);
84 Rank rank = getRank(csvTaxonRecord, nomCode);
85
86 //name
87 TaxonNameBase<?,?> name = getScientificName(csvTaxonRecord, nomCode, rank, resultList);
88 taxonBase.setName(name);
89
90 //sec
91 Reference<?> sec = getNameAccordingTo(csvTaxonRecord, resultList);
92 taxonBase.setSec(sec);
93
94 //classification
95 handleDataset(csvTaxonRecord, resultList, sourceReference, sourceReferenceDetail);
96
97 //NON core
98 //term="http://purl.org/dc/terms/identifier"
99 //currently only LSIDs
100 handleIdentifier(csvTaxonRecord, taxonBase);
101
102
103
104 // <!-- Top level group; listed as kingdom but may be interpreted as domain or superkingdom
105 // The following eight groups are recognized: Animalia, Archaea, Bacteria, Chromista,
106 // Fungi, Plantae, Protozoa, Viruses -->
107 // <field index='10' term='http://rs.tdwg.org/dwc/terms/kingdom'/>
108
109 // <!-- Phylum in which the taxon has been classified -->
110 // <field index='11' term='http://rs.tdwg.org/dwc/terms/phylum'/>
111
112 // <!-- Class in which the taxon has been classified -->
113 // <field index='12' term='http://rs.tdwg.org/dwc/terms/class'/>
114
115 // <!-- Order in which the taxon has been classified -->
116 // <field index='13' term='http://rs.tdwg.org/dwc/terms/order'/>
117
118 // <!-- Family in which the taxon has been classified -->
119 // <field index='14' term='http://rs.tdwg.org/dwc/terms/family'/>
120
121 // <!-- Genus in which the taxon has been classified -->
122 // <field index='15' term='http://rs.tdwg.org/dwc/terms/genus'/>
123
124 // <!-- Subgenus in which the taxon has been classified -->
125 // <field index='16' term='http://rs.tdwg.org/dwc/terms/subgenus'/>
126 // <!-- Specific epithet; for hybrids, the multiplication symbol is included in the epithet -->
127
128 // <field index='17' term='http://rs.tdwg.org/dwc/terms/specificEpithet'/>
129 // <!-- Infraspecific epithet -->
130
131 // <field index='18' term='http://rs.tdwg.org/dwc/terms/infraspecificEpithet'/>
132 // <!-- Authorship -->
133
134 // <field index='19' term='http://rs.tdwg.org/dwc/terms/scientificNameAuthorship'/>
135 // ==> see scientific name
136 //
137 // <!-- Acceptance status published in -->
138 // <field index='20' term='http://purl.org/dc/terms/source'/>
139 // <!-- Reference in which the scientific name was first published -->
140 // <field index='21' term='http://rs.tdwg.org/dwc/terms/namePublishedIn'/>
141 // <!-- Taxon scrutinized by -->
142 // <field index='22' term='http://rs.tdwg.org/dwc/terms/nameAccordingTo'/> 
143 // <!-- Scrutiny date -->
144 // <field index='23' term='http://purl.org/dc/terms/modified'/>
145 // <!-- Additional data for the taxon -->
146 // <field index='24' term='http://purl.org/dc/terms/description'/>
147 // </core>
148
149 return new ListReader<MappedCdmBase>(resultList);
150 }
151
152
153
154 //TODO handle non LSIDs
155 //TODO handle LSIDs for names
156 private void handleIdentifier(CsvStreamItem csvTaxonRecord, TaxonBase<?> taxonBase) {
157 String identifier = csvTaxonRecord.get(TermUri.DC_IDENTIFIER);
158 if (StringUtils.isNotBlank(identifier)){
159 if (identifier.trim().startsWith("urn:lsid")){
160 try {
161 LSID lsid = new LSID(identifier);
162 taxonBase.setLsid(lsid);
163 } catch (MalformedLSIDException e) {
164 String message = "LSID is malformed and can't be handled as LSID: %s";
165 message = String.format(message, identifier);
166 fireWarningEvent(message, csvTaxonRecord, 4);
167 }
168 }else{
169 String message = "Identifier type not supported: %s";
170 message = String.format(message, identifier);
171 fireWarningEvent(message, csvTaxonRecord, 4);
172 }
173 }
174
175 }
176
177
178 private void handleDataset(CsvStreamItem csvTaxonRecord, List<MappedCdmBase> resultList, Reference<?> sourceReference, String sourceReferecenDetail) {
179 String datasetId = CdmUtils.Nz(csvTaxonRecord.get(TermUri.DWC_DATASET_ID)).trim();
180 String datasetName = CdmUtils.Nz(csvTaxonRecord.get(TermUri.DWC_DATASET_NAME)).trim();
181 if (CdmUtils.areBlank(datasetId, datasetName) ){
182 datasetId = NO_DATASET;
183 }
184
185 //check id
186 boolean classificationExists = state.exists(TermUri.DWC_DATASET_ID.toString() , datasetId, Classification.class);
187
188 //check name
189 if (!classificationExists){
190 classificationExists = state.exists(TermUri.DWC_DATASET_NAME.toString() , datasetName, Classification.class);
191 }
192
193 //if not exists, create new
194 if (! classificationExists){
195 String classificationName = StringUtils.isBlank(datasetName)? datasetId : datasetName;
196 if (classificationName.equals(NO_DATASET)){
197 classificationName = "Classification (no name)"; //TODO define by config or zipfile or metadata
198 }
199
200 String classificationId = StringUtils.isBlank(datasetId)? datasetName : datasetId;
201 Classification classification = Classification.NewInstance(classificationName);
202 //source
203 IdentifiableSource source = classification.addSource(classificationId, "Dataset", sourceReference, sourceReferecenDetail);
204 //add to result
205 resultList.add(new MappedCdmBase(TermUri.DWC_DATASET_ID, datasetId, classification));
206 resultList.add(new MappedCdmBase(TermUri.DWC_DATASET_NAME, datasetName, classification));
207 resultList.add(new MappedCdmBase(source));
208 //TODO this is not so nice but currently necessary as classifications are requested in the same partition
209 state.putMapping(TermUri.DWC_DATASET_ID.toString(), classificationId, classification);
210 state.putMapping(TermUri.DWC_DATASET_NAME.toString(), classificationName, classification);
211 }
212
213 //remove to later check if all attributes were used
214 csvTaxonRecord.remove(TermUri.DWC_DATASET_ID);
215 csvTaxonRecord.remove(TermUri.DWC_DATASET_NAME);
216
217 }
218
219
220 @Override
221 public String getSourceId(CsvStreamItem item) {
222 String id = item.get(ID);
223 return id;
224 }
225
226 private Reference<?> getNameAccordingTo(CsvStreamItem item, List<MappedCdmBase> resultList) {
227 TermUri idTerm = TermUri.DWC_NAME_ACCORDING_TO_ID;
228 TermUri strTerm = TermUri.DWC_NAME_ACCORDING_TO;
229 Reference<?> secRef = handleReference(item, resultList, idTerm, strTerm);
230 return secRef;
231
232 }
233
234 private NomenclaturalCode getNomCode(CsvStreamItem item) {
235 String strNomCode = getValue(item, TermUri.DWC_NOMENCLATURAL_CODE);
236 NomenclaturalCode nomCode = null;
237 // by Nomcenclatural Code
238 if (strNomCode != null){
239 nomCode = NomenclaturalCode.fromString(strNomCode);
240 if (nomCode == null){
241 String message = "NomCode '%s' not recognized";
242 message = String.format(message, strNomCode);
243 fireWarningEvent(message, item, 4);
244 }else{
245 return nomCode;
246 }
247 }
248 // by Kingdom
249 String strKingdom = getValue(item, TermUri.DWC_KINGDOM);
250 if (strKingdom.equalsIgnoreCase("Plantae")){
251 nomCode = NomenclaturalCode.ICBN;
252 }else if (strKingdom.equalsIgnoreCase("Animalia")){
253 nomCode = NomenclaturalCode.ICZN;
254 }else if (strKingdom.equalsIgnoreCase("Fungi")){
255 nomCode = NomenclaturalCode.ICBN;
256 }
257 //TODO further kingdoms
258 if (nomCode == null){
259 //TODO warning
260 }
261 return nomCode;
262 }
263
264
265 private TaxonNameBase<?,?> getScientificName(CsvStreamItem item, NomenclaturalCode nomCode, Rank rank, List<MappedCdmBase> resultList) {
266 TaxonNameBase<?,?> name = null;
267 String strScientificName = getValue(item, TermUri.DWC_SCIENTIFIC_NAME);
268 //Name
269 if (strScientificName != null){
270 INonViralNameParser<?> parser = NonViralNameParserImpl.NewInstance();
271 name = parser.parseFullName(strScientificName, nomCode, rank);
272 if (rank != null && name != null && name.getRank() != null &&
273 ! rank.equals(name.getRank())){
274 String message = "Parsed rank %s (%s) differs from rank %s given by fields 'taxonRank' or 'verbatimTaxonRank'";
275 message = String.format(message, name.getRank().getTitleCache(), strScientificName, rank.getTitleCache());
276 fireWarningEvent(message, item, 4);
277 }
278 checkAuthorship(name, item);
279 resultList.add(new MappedCdmBase(TermUri.DWC_SCIENTIFIC_NAME, strScientificName, name));
280 }
281 //By ID
282 String strScientificNameId = getValue(item, TermUri.DWC_SCIENTIFIC_NAME_ID);
283 if (strScientificNameId != null){
284 String message = "ScientificNameId not yet implemented: '%s'";
285 message = String.format(message, strScientificNameId);
286 fireWarningEvent(message, item, 4);
287 }
288
289 //namePublishedIn
290 TermUri idTerm = TermUri.DWC_NAME_PUBLISHED_IN_ID;
291 TermUri strTerm = TermUri.DWC_NAME_PUBLISHED_IN;
292 Reference<?> nomRef = handleReference(item, resultList, idTerm, strTerm);
293
294 if (name != null){
295 if (nomRef != null){
296 name.setNomenclaturalReference(nomRef); //check if name already has a nomRef, shouldn't be the case usually
297 }
298 }else{
299 if (nomRef != null){
300 String message = "NamePublishedIn information available but no name exists";
301 fireWarningEvent(message, item, 4);
302 }
303 }
304 return name;
305 }
306
307
308 private Reference<?> handleReference(CsvStreamItem item, List<MappedCdmBase> resultList, TermUri idTerm, TermUri strTerm) {
309
310 Reference result = null;
311 if (exists(idTerm, item) || exists(strTerm, item)){
312 String nomRefId = CdmUtils.Nz(item.get(idTerm)).trim();
313 String nomRefStr = CdmUtils.Nz(item.get(strTerm)).trim();
314 if (StringUtils.isNotBlank(nomRefId)){
315 List<Reference> nomRefs = state.get(idTerm.toString(), nomRefId, Reference.class);
316 if (nomRefs.size() == 0){
317 //references should already exist in store if not linking to external links like URLs
318 String message = "External namePublishedInIDs are not yet supported";
319 fireWarningEvent(message, item, 4);
320 }else{
321 //TODO handle list.size > 1 , do we need a list here ?
322 result = nomRefs.get(0);
323 }
324 }
325 if (result == null){
326 List<Reference> nomRefs = state.get(strTerm.toString(), nomRefStr, Reference.class);
327 if (nomRefs.size() > 0){
328 //TODO handle list.size > 1 , do we need a list here ?
329 result = nomRefs.get(0);
330 }else{
331 // new Reference
332 result = ReferenceFactory.newGeneric(); //TODO handle other types if possible
333 result.setTitleCache(nomRefStr, true);
334 //TODO distinguish available year, authorship, etc. if
335 resultList.add(new MappedCdmBase(strTerm, nomRefStr, result));
336 }
337 }
338 }
339 return result;
340 }
341
342
343 //TODO we may configure in configuration that scientific name never includes Authorship
344 private void checkAuthorship(TaxonNameBase nameBase, CsvStreamItem item) {
345 if (!nameBase.isInstanceOf(NonViralName.class)){
346 return;
347 }
348 NonViralName<?> nvName = CdmBase.deproxy(nameBase, NonViralName.class);
349 String strAuthors = getValue(item, TermUri.DWC_SCIENTIFIC_NAME_AUTHORS);
350
351 if (! nvName.isProtectedTitleCache()){
352 if (StringUtils.isBlank(nvName.getAuthorshipCache())){
353 //TODO some more sophisticated stuff can be done here like parsing etc.
354 nvName.setAuthorshipCache(strAuthors);
355 //TODO warning (scientific name should always include authorship)
356 }
357 }
358
359 }
360
361
362 private Rank getRank(CsvStreamItem csvTaxonRecord, NomenclaturalCode nomCode) {
363 boolean USE_UNKNOWN = true;
364 Rank rank = null;
365 String strRank = getValue(csvTaxonRecord,TermUri.DWC_TAXON_RANK);
366 String strVerbatimRank = getValue(csvTaxonRecord,TermUri.DWC_VERBATIM_TAXON_RANK);
367 if (strRank != null){
368 try {
369 rank = Rank.getRankByEnglishName(strRank, nomCode, USE_UNKNOWN);
370 if (rank.equals(Rank.UNKNOWN_RANK())){
371 rank = Rank.getRankByNameOrAbbreviation(strRank, USE_UNKNOWN);
372 if (rank.equals(Rank.UNKNOWN_RANK())){
373 String message = "Rank can not be defined for '%s'";
374 message = String.format(message, strRank);
375 fireWarningEvent(message, csvTaxonRecord, 4);
376 }
377 }
378 } catch (UnknownCdmTypeException e) {
379 //should not happen as USE_UNKNOWN is used
380 rank = Rank.UNKNOWN_RANK();
381 }
382 }
383 if ( (rank == null || rank.equals(Rank.UNKNOWN_RANK())) && strVerbatimRank != null){
384 try {
385 rank = Rank.getRankByNameOrAbbreviation(strVerbatimRank, USE_UNKNOWN);
386 if (rank.equals(Rank.UNKNOWN_RANK())){
387 String message = "Rank can not be defined for '%s'";
388 message = String.format(message, strVerbatimRank);
389 fireWarningEvent(message, csvTaxonRecord, 4);
390 }
391 } catch (UnknownCdmTypeException e) {
392 //should not happen as USE_UNKNOWN is used
393 rank = Rank.UNKNOWN_RANK();
394 }
395 }
396 return rank;
397 }
398
399
400 private TaxonBase<?> getTaxonBase(CsvStreamItem item) {
401 TaxonNameBase<?,?> name = null;
402 Reference<?> sec = null;
403 TaxonBase<?> result;
404 String taxStatus = item.get(TermUri.DWC_TAXONOMIC_STATUS);
405 String status = "";
406 boolean isMissaplied = false;
407 if (taxStatus != null){
408 if (taxStatus.matches("accepted|valid")){
409 status += "A";
410 }else if (taxStatus.matches(".*synonym|invalid")){
411 status += "S";
412 }if (taxStatus.matches("misapplied")){
413 status += "M";
414 }else{
415 status += "?";
416 }
417 item.remove(TermUri.DWC_TAXONOMIC_STATUS);
418 }
419 if (! CdmUtils.isBlank(item.get(TermUri.DWC_ACCEPTED_NAME_USAGE_ID))){
420 // acceptedNameUsageId = id
421 if (getSourceId(item).equals(item.get(TermUri.DWC_ACCEPTED_NAME_USAGE_ID))){
422 status += "A";
423 }else{
424 status += "S";
425 }
426 }
427 if (status.contains("A") || status.contains("M")){
428 result = Taxon.NewInstance(name, sec);
429 if (status.contains("S") && ! status.contains("M") ){
430 String message = "Ambigous taxon status (%s)";
431 message = String.format(message, status);
432 fireWarningEvent(message, item, 6);
433 }
434 }else if (status.contains("S")){
435 result = Synonym.NewInstance(name, sec);
436 }else{
437 result = Taxon.NewUnknownStatusInstance(name, sec);
438 }
439
440 return result;
441
442 }
443
444 // ********************** PARTITIONABLE ****************************************/
445
446
447 @Override
448 protected void makeForeignKeysForItem(CsvStreamItem item, Map<String, Set<String>> fkMap) {
449 String value;
450 String key;
451
452 //namePublishedIn
453 if ( hasValue(value = item.get(key = TermUri.DWC_NAME_PUBLISHED_IN_ID.toString()))){
454 Set<String> keySet = getKeySet(key, fkMap);
455 keySet.add(value);
456 }
457 if (state.getConfig().isDeduplicateNamePublishedIn()){
458 if ( hasValue(value = item.get(key = TermUri.DWC_NAME_PUBLISHED_IN.toString()))){
459 Set<String> keySet = getKeySet(key, fkMap);
460 keySet.add(value);
461 }
462 }
463
464 //nameAccordingTo
465 if ( hasValue(value = item.get(key = TermUri.DWC_NAME_ACCORDING_TO_ID.toString()))){
466 Set<String> keySet = getKeySet(key, fkMap);
467 keySet.add(value);
468 }
469 if ( hasValue(value = item.get(key = TermUri.DWC_NAME_ACCORDING_TO.toString()))){
470 Set<String> keySet = getKeySet(key, fkMap);
471 keySet.add(value);
472 }
473
474 }
475
476 //** ***************************** TO STRING *********************************************/
477
478 @Override
479 public String toString(){
480 return this.getClass().getName();
481 }
482
483
484
485 }