cdmlib-io/src/main/java/eu/etaxonomy/cdm/io/dwca/in/DwcTaxonCsv2CdmTaxonConverter.java

   1 // $Id$
   2 /**
   3 * Copyright (C) 2009 EDIT
   4 * European Distributed Institute of Taxonomy
   5 * http://www.e-taxonomy.eu
   6 *
   7 * The contents of this file are subject to the Mozilla Public License Version 1.1
   8 * See LICENSE.TXT at the top of this package for the full license terms.
   9 */
  10 package eu.etaxonomy.cdm.io.dwca.in;
  11
  12 import java.util.ArrayList;
  13 import java.util.List;
  14 import java.util.Map;
  15 import java.util.Set;
  16
  17 import org.apache.commons.lang.StringUtils;
  18 import org.apache.log4j.Logger;
  19
  20 import com.ibm.lsid.MalformedLSIDException;
  21
  22 import eu.etaxonomy.cdm.common.CdmUtils;
  23 import eu.etaxonomy.cdm.io.dwca.TermUri;
  24 import eu.etaxonomy.cdm.model.common.CdmBase;
  25 import eu.etaxonomy.cdm.model.common.IdentifiableSource;
  26 import eu.etaxonomy.cdm.model.common.LSID;
  27 import eu.etaxonomy.cdm.model.name.NomenclaturalCode;
  28 import eu.etaxonomy.cdm.model.name.NonViralName;
  29 import eu.etaxonomy.cdm.model.name.Rank;
  30 import eu.etaxonomy.cdm.model.name.TaxonNameBase;
  31 import eu.etaxonomy.cdm.model.reference.Reference;
  32 import eu.etaxonomy.cdm.model.reference.ReferenceFactory;
  33 import eu.etaxonomy.cdm.model.taxon.Classification;
  34 import eu.etaxonomy.cdm.model.taxon.Synonym;
  35 import eu.etaxonomy.cdm.model.taxon.Taxon;
  36 import eu.etaxonomy.cdm.model.taxon.TaxonBase;
  37 import eu.etaxonomy.cdm.strategy.exceptions.UnknownCdmTypeException;
  38 import eu.etaxonomy.cdm.strategy.parser.INonViralNameParser;
  39 import eu.etaxonomy.cdm.strategy.parser.NonViralNameParserImpl;
  40
  41 /**
  42  * @author a.mueller
  43  * @date 22.11.2011
  44  *
  45  */
  46 public class DwcTaxonCsv2CdmTaxonConverter extends PartitionableConverterBase<DwcaImportState> implements IPartitionableConverter<CsvStreamItem, IReader<CdmBase>, String>{
  47         @SuppressWarnings("unused")
  48         private static Logger logger = Logger.getLogger(DwcTaxonCsv2CdmTaxonConverter.class);
  49
  50         private static final String ID = "id";
        // key used in case no dataset information is supplied, TODO use something better
  52         public static final String NO_DATASET = "no_dataset_jli773oebhjklw";
  53
  54
  55         /**
  56          * @param state
  57          */
  58         public DwcTaxonCsv2CdmTaxonConverter(DwcaImportState state) {
  59                 super();
  60                 this.state = state;
  61         }
  62
  63
  64         public IReader<MappedCdmBase> map(CsvStreamItem csvTaxonRecord){
  65                 List<MappedCdmBase> resultList = new ArrayList<MappedCdmBase>();
  66
  67                 Reference<?> sourceReference = null;
  68                 String sourceReferenceDetail = null;
  69
  70                 //taxon
  71                 TaxonBase<?> taxonBase = getTaxonBase(csvTaxonRecord);
  72                 MappedCdmBase  mcb = new MappedCdmBase(csvTaxonRecord.term, csvTaxonRecord.get(ID), taxonBase);
  73                 resultList.add(mcb);
  74
  75                 //original source
  76                 String id = csvTaxonRecord.get(ID);
  77                 IdentifiableSource source = taxonBase.addSource(id, "Taxon", sourceReference, sourceReferenceDetail);
  78                 MappedCdmBase mappedSource = new MappedCdmBase(csvTaxonRecord.get(ID), source);
  79                 resultList.add(mappedSource);
  80                 csvTaxonRecord.remove(ID);
  81
  82                 //rank
  83                 NomenclaturalCode nomCode = getNomCode(csvTaxonRecord);
  84                 Rank rank = getRank(csvTaxonRecord, nomCode);
  85
  86                 //name
  87                 TaxonNameBase<?,?> name = getScientificName(csvTaxonRecord, nomCode, rank, resultList);
  88                 taxonBase.setName(name);
  89
  90                 //sec
  91                 Reference<?> sec = getNameAccordingTo(csvTaxonRecord, resultList);
  92                 taxonBase.setSec(sec);
  93
  94                 //classification
  95                 handleDataset(csvTaxonRecord, resultList, sourceReference, sourceReferenceDetail);
  96
  97                 //NON core
  98             //term="http://purl.org/dc/terms/identifier"
  99                 //currently only LSIDs
 100                 handleIdentifier(csvTaxonRecord, taxonBase);
 101
 102
 103
 104                 //                  <!-- Top level group; listed as kingdom but may be interpreted as domain or superkingdom
 105 //                       The following eight groups are recognized: Animalia, Archaea, Bacteria, Chromista,
 106 //                       Fungi, Plantae, Protozoa, Viruses -->
 107 //                  <field index='10' term='http://rs.tdwg.org/dwc/terms/kingdom'/>
 108
 109 //                  <!-- Phylum in which the taxon has been classified -->
 110 //                  <field index='11' term='http://rs.tdwg.org/dwc/terms/phylum'/>
 111
 112                 //                  <!-- Class in which the taxon has been classified -->
 113 //                  <field index='12' term='http://rs.tdwg.org/dwc/terms/class'/>
 114
 115                 //                  <!-- Order in which the taxon has been classified -->
 116 //                  <field index='13' term='http://rs.tdwg.org/dwc/terms/order'/>
 117
 118                 //                  <!-- Family in which the taxon has been classified -->
 119 //                  <field index='14' term='http://rs.tdwg.org/dwc/terms/family'/>
 120
 121                 //                  <!-- Genus in which the taxon has been classified -->
 122 //                  <field index='15' term='http://rs.tdwg.org/dwc/terms/genus'/>
 123
 124                 //                  <!-- Subgenus in which the taxon has been classified -->
 125 //                  <field index='16' term='http://rs.tdwg.org/dwc/terms/subgenus'/>
 126 //                  <!-- Specific epithet; for hybrids, the multiplication symbol is included in the epithet -->
 127
 128 //                  <field index='17' term='http://rs.tdwg.org/dwc/terms/specificEpithet'/>
 129 //                  <!-- Infraspecific epithet -->
 130
 131 //                  <field index='18' term='http://rs.tdwg.org/dwc/terms/infraspecificEpithet'/>
 132 //                  <!-- Authorship -->
 133
 134 //                  <field index='19' term='http://rs.tdwg.org/dwc/terms/scientificNameAuthorship'/>
 135 //              ==> see scientific name
 136 //
 137 //              <!-- Acceptance status published in -->
 138 //                  <field index='20' term='http://purl.org/dc/terms/source'/>
 139 //                  <!-- Reference in which the scientific name was first published -->
 140 //                  <field index='21' term='http://rs.tdwg.org/dwc/terms/namePublishedIn'/>
 141 //                  <!-- Taxon scrutinized by -->
 142 //                  <field index='22' term='http://rs.tdwg.org/dwc/terms/nameAccordingTo'/>
 143 //                  <!-- Scrutiny date -->
 144 //                  <field index='23' term='http://purl.org/dc/terms/modified'/>
 145 //                  <!-- Additional data for the taxon -->
 146 //                  <field index='24' term='http://purl.org/dc/terms/description'/>
 147 //                  </core>
 148
 149                 return new ListReader<MappedCdmBase>(resultList);
 150         }
 151
 152
 153
 154         //TODO handle non LSIDs
 155         //TODO handle LSIDs for names
 156         private void handleIdentifier(CsvStreamItem csvTaxonRecord, TaxonBase<?> taxonBase) {
 157                 String identifier = csvTaxonRecord.get(TermUri.DC_IDENTIFIER);
 158                 if (StringUtils.isNotBlank(identifier)){
 159                         if (identifier.trim().startsWith("urn:lsid")){
 160                                 try {
 161                                         LSID lsid = new LSID(identifier);
 162                                         taxonBase.setLsid(lsid);
 163                                 } catch (MalformedLSIDException e) {
 164                                         String message = "LSID is malformed and can't be handled as LSID: %s";
 165                                         message = String.format(message, identifier);
 166                                         fireWarningEvent(message, csvTaxonRecord, 4);
 167                                 }
 168                         }else{
 169                                 String message = "Identifier type not supported: %s";
 170                                 message = String.format(message, identifier);
 171                                 fireWarningEvent(message, csvTaxonRecord, 4);
 172                         }
 173                 }
 174
 175         }
 176
 177
 178         private void handleDataset(CsvStreamItem csvTaxonRecord, List<MappedCdmBase> resultList, Reference<?> sourceReference, String sourceReferecenDetail) {
 179                 String datasetId = CdmUtils.Nz(csvTaxonRecord.get(TermUri.DWC_DATASET_ID)).trim();
 180                 String datasetName = CdmUtils.Nz(csvTaxonRecord.get(TermUri.DWC_DATASET_NAME)).trim();
 181                 if (CdmUtils.areBlank(datasetId, datasetName) ){
 182                         datasetId = NO_DATASET;
 183                 }
 184
 185                 //check id
 186                 boolean classificationExists = state.exists(TermUri.DWC_DATASET_ID.toString() , datasetId, Classification.class);
 187
 188                 //check name
 189                 if (!classificationExists){
 190                         classificationExists = state.exists(TermUri.DWC_DATASET_NAME.toString() , datasetName, Classification.class);
 191                 }
 192
 193                 //if not exists, create new
 194                 if (! classificationExists){
 195                         String classificationName = StringUtils.isBlank(datasetName)? datasetId : datasetName;
 196                         if (classificationName.equals(NO_DATASET)){
 197                                 classificationName = "Classification (no name)";  //TODO define by config or zipfile or metadata
 198                         }
 199
 200                         String classificationId = StringUtils.isBlank(datasetId)? datasetName : datasetId;
 201                         Classification classification = Classification.NewInstance(classificationName);
 202                         //source
 203                         IdentifiableSource source = classification.addSource(classificationId, "Dataset", sourceReference, sourceReferecenDetail);
 204                         //add to result
 205                         resultList.add(new MappedCdmBase(TermUri.DWC_DATASET_ID, datasetId, classification));
 206                         resultList.add(new MappedCdmBase(TermUri.DWC_DATASET_NAME, datasetName, classification));
 207                         resultList.add(new MappedCdmBase(source));
 208                         //TODO this is not so nice but currently necessary as classifications are requested in the same partition
 209                         state.putMapping(TermUri.DWC_DATASET_ID.toString(), classificationId, classification);
 210                         state.putMapping(TermUri.DWC_DATASET_NAME.toString(), classificationName, classification);
 211                 }
 212
 213                 //remove to later check if all attributes were used
 214                 csvTaxonRecord.remove(TermUri.DWC_DATASET_ID);
 215                 csvTaxonRecord.remove(TermUri.DWC_DATASET_NAME);
 216
 217         }
 218
 219
 220         @Override
 221         public String getSourceId(CsvStreamItem item) {
 222                 String id = item.get(ID);
 223                 return id;
 224         }
 225
 226         private Reference<?> getNameAccordingTo(CsvStreamItem item, List<MappedCdmBase> resultList) {
 227                 TermUri idTerm = TermUri.DWC_NAME_ACCORDING_TO_ID;
 228                 TermUri strTerm = TermUri.DWC_NAME_ACCORDING_TO;
 229                 Reference<?> secRef = handleReference(item, resultList, idTerm, strTerm);
 230                 return secRef;
 231
 232         }
 233
 234         private NomenclaturalCode getNomCode(CsvStreamItem item) {
 235                 String strNomCode = getValue(item, TermUri.DWC_NOMENCLATURAL_CODE);
 236                 NomenclaturalCode nomCode = null;
 237                 // by Nomcenclatural Code
 238                 if (strNomCode != null){
 239                         nomCode = NomenclaturalCode.fromString(strNomCode);
 240                         if (nomCode == null){
 241                                 String message = "NomCode '%s' not recognized";
 242                                 message = String.format(message, strNomCode);
 243                                 fireWarningEvent(message, item, 4);
 244                         }else{
 245                                 return nomCode;
 246                         }
 247                 }
 248                 // by Kingdom
 249                 String strKingdom = getValue(item, TermUri.DWC_KINGDOM);
 250                 if (strKingdom.equalsIgnoreCase("Plantae")){
 251                         nomCode = NomenclaturalCode.ICBN;
 252                 }else if (strKingdom.equalsIgnoreCase("Animalia")){
 253                         nomCode = NomenclaturalCode.ICZN;
 254                 }else if (strKingdom.equalsIgnoreCase("Fungi")){
 255                         nomCode = NomenclaturalCode.ICBN;
 256                 }
 257                 //TODO further kingdoms
 258                 if (nomCode == null){
 259                         //TODO warning
 260                 }
 261                 return nomCode;
 262         }
 263
 264
 265         private TaxonNameBase<?,?> getScientificName(CsvStreamItem item, NomenclaturalCode nomCode, Rank rank, List<MappedCdmBase> resultList) {
 266                 TaxonNameBase<?,?> name = null;
 267                 String strScientificName = getValue(item, TermUri.DWC_SCIENTIFIC_NAME);
 268                 //Name
 269                 if (strScientificName != null){
 270                         INonViralNameParser<?> parser = NonViralNameParserImpl.NewInstance();
 271                         name = parser.parseFullName(strScientificName, nomCode, rank);
 272                         if (rank != null && name != null && name.getRank() != null &&
 273                                         ! rank.equals(name.getRank())){
 274                                 String message = "Parsed rank %s (%s) differs from rank %s given by fields 'taxonRank' or 'verbatimTaxonRank'";
 275                                 message = String.format(message, name.getRank().getTitleCache(), strScientificName, rank.getTitleCache());
 276                                 fireWarningEvent(message, item, 4);
 277                         }
 278                         checkAuthorship(name, item);
 279                         resultList.add(new MappedCdmBase(TermUri.DWC_SCIENTIFIC_NAME, strScientificName, name));
 280                 }
 281                 //By ID
 282                 String strScientificNameId = getValue(item, TermUri.DWC_SCIENTIFIC_NAME_ID);
 283                 if (strScientificNameId != null){
 284                         String message = "ScientificNameId not yet implemented: '%s'";
 285                         message = String.format(message, strScientificNameId);
 286                         fireWarningEvent(message, item, 4);
 287                 }
 288
 289                 //namePublishedIn
 290                 TermUri idTerm = TermUri.DWC_NAME_PUBLISHED_IN_ID;
 291                 TermUri strTerm = TermUri.DWC_NAME_PUBLISHED_IN;
 292                 Reference<?> nomRef = handleReference(item, resultList, idTerm, strTerm);
 293
 294                 if (name != null){
 295                         if (nomRef != null){
 296                                 name.setNomenclaturalReference(nomRef);  //check if name already has a nomRef, shouldn't be the case usually
 297                         }
 298                 }else{
 299                         if (nomRef != null){
 300                                 String message = "NamePublishedIn information available but no name exists";
 301                                 fireWarningEvent(message, item, 4);
 302                         }
 303                 }
 304                 return name;
 305         }
 306
 307
 308         private Reference<?> handleReference(CsvStreamItem item, List<MappedCdmBase> resultList, TermUri idTerm, TermUri strTerm) {
 309
 310                 Reference result = null;
 311                 if (exists(idTerm, item) || exists(strTerm, item)){
 312                         String nomRefId = CdmUtils.Nz(item.get(idTerm)).trim();
 313                         String nomRefStr = CdmUtils.Nz(item.get(strTerm)).trim();
 314                         if (StringUtils.isNotBlank(nomRefId)){
 315                                 List<Reference> nomRefs = state.get(idTerm.toString(), nomRefId, Reference.class);
 316                                 if (nomRefs.size() == 0){
 317                                         //references should already exist in store if not linking to external links like URLs
 318                                         String message = "External namePublishedInIDs are not yet supported";
 319                                         fireWarningEvent(message, item, 4);
 320                                 }else{
 321                                         //TODO handle list.size > 1 , do we need a list here ?
 322                                         result = nomRefs.get(0);
 323                                 }
 324                         }
 325                         if (result == null){
 326                                 List<Reference> nomRefs = state.get(strTerm.toString(), nomRefStr, Reference.class);
 327                                 if (nomRefs.size() > 0){
 328                                         //TODO handle list.size > 1 , do we need a list here ?
 329                                         result = nomRefs.get(0);
 330                                 }else{
 331                                         // new Reference
 332                                         result = ReferenceFactory.newGeneric();  //TODO handle other types if possible
 333                                         result.setTitleCache(nomRefStr, true);
 334                                         //TODO distinguish available year, authorship, etc. if
 335                                         resultList.add(new MappedCdmBase(strTerm, nomRefStr, result));
 336                                 }
 337                         }
 338                 }
 339                 return result;
 340         }
 341
 342
 343         //TODO we may configure in configuration that scientific name never includes Authorship
 344         private void checkAuthorship(TaxonNameBase nameBase, CsvStreamItem item) {
 345                 if (!nameBase.isInstanceOf(NonViralName.class)){
 346                         return;
 347                 }
 348                 NonViralName<?> nvName = CdmBase.deproxy(nameBase, NonViralName.class);
 349                 String strAuthors = getValue(item, TermUri.DWC_SCIENTIFIC_NAME_AUTHORS);
 350
 351                 if (! nvName.isProtectedTitleCache()){
 352                         if (StringUtils.isBlank(nvName.getAuthorshipCache())){
 353                                 //TODO some more sophisticated stuff can be done here like parsing etc.
 354                                 nvName.setAuthorshipCache(strAuthors);
 355                                 //TODO warning (scientific name should always include authorship)
 356                         }
 357                 }
 358
 359         }
 360
 361
 362         private Rank getRank(CsvStreamItem csvTaxonRecord, NomenclaturalCode nomCode) {
 363                 boolean USE_UNKNOWN = true;
 364                 Rank rank = null;
 365                 String strRank = getValue(csvTaxonRecord,TermUri.DWC_TAXON_RANK);
 366                 String strVerbatimRank = getValue(csvTaxonRecord,TermUri.DWC_VERBATIM_TAXON_RANK);
 367                 if (strRank != null){
 368                         try {
 369                                 rank = Rank.getRankByEnglishName(strRank, nomCode, USE_UNKNOWN);
 370                                 if (rank.equals(Rank.UNKNOWN_RANK())){
 371                                         rank = Rank.getRankByNameOrAbbreviation(strRank, USE_UNKNOWN);
 372                                         if (rank.equals(Rank.UNKNOWN_RANK())){
 373                                                 String message = "Rank can not be defined for '%s'";
 374                                                 message = String.format(message, strRank);
 375                                                 fireWarningEvent(message, csvTaxonRecord, 4);
 376                                         }
 377                                 }
 378                         } catch (UnknownCdmTypeException e) {
 379                                 //should not happen as USE_UNKNOWN is used
 380                                 rank = Rank.UNKNOWN_RANK();
 381                         }
 382                 }
 383                 if ( (rank == null || rank.equals(Rank.UNKNOWN_RANK())) && strVerbatimRank != null){
 384                         try {
 385                                 rank = Rank.getRankByNameOrAbbreviation(strVerbatimRank, USE_UNKNOWN);
 386                                 if (rank.equals(Rank.UNKNOWN_RANK())){
 387                                         String message = "Rank can not be defined for '%s'";
 388                                         message = String.format(message, strVerbatimRank);
 389                                         fireWarningEvent(message, csvTaxonRecord, 4);
 390                                 }
 391                         } catch (UnknownCdmTypeException e) {
 392                                 //should not happen as USE_UNKNOWN is used
 393                                 rank = Rank.UNKNOWN_RANK();
 394                         }
 395                 }
 396                 return rank;
 397         }
 398
 399
 400         private TaxonBase<?> getTaxonBase(CsvStreamItem item) {
 401                 TaxonNameBase<?,?> name = null;
 402                 Reference<?> sec = null;
 403                 TaxonBase<?> result;
 404                 String taxStatus = item.get(TermUri.DWC_TAXONOMIC_STATUS);
 405                 String status = "";
 406                 boolean isMissaplied = false;
 407                 if (taxStatus != null){
 408                         if (taxStatus.matches("accepted|valid")){
 409                                 status += "A";
 410                         }else if (taxStatus.matches(".*synonym|invalid")){
 411                                 status += "S";
 412                         }if (taxStatus.matches("misapplied")){
 413                                 status += "M";
 414                         }else{
 415                                 status += "?";
 416                         }
 417                         item.remove(TermUri.DWC_TAXONOMIC_STATUS);
 418                 }
 419                 if (! CdmUtils.isBlank(item.get(TermUri.DWC_ACCEPTED_NAME_USAGE_ID))){
 420                         // acceptedNameUsageId = id
 421                         if (getSourceId(item).equals(item.get(TermUri.DWC_ACCEPTED_NAME_USAGE_ID))){
 422                                 status += "A";
 423                         }else{
 424                                 status += "S";
 425                         }
 426                 }
 427                 if (status.contains("A") || status.contains("M")){
 428                         result = Taxon.NewInstance(name, sec);
 429                         if (status.contains("S") && ! status.contains("M") ){
 430                                 String message = "Ambigous taxon status (%s)";
 431                                 message = String.format(message, status);
 432                                 fireWarningEvent(message, item, 6);
 433                         }
 434                 }else if (status.contains("S")){
 435                         result = Synonym.NewInstance(name, sec);
 436                 }else{
 437                         result = Taxon.NewUnknownStatusInstance(name, sec);
 438                 }
 439
 440                 return result;
 441
 442         }
 443
 444 // ********************** PARTITIONABLE ****************************************/
 445
 446
 447         @Override
 448         protected void makeForeignKeysForItem(CsvStreamItem item, Map<String, Set<String>> fkMap) {
 449                 String value;
 450                 String key;
 451
 452                 //namePublishedIn
 453                 if ( hasValue(value = item.get(key = TermUri.DWC_NAME_PUBLISHED_IN_ID.toString()))){
 454                         Set<String> keySet = getKeySet(key, fkMap);
 455                         keySet.add(value);
 456                 }
 457                 if (state.getConfig().isDeduplicateNamePublishedIn()){
 458                         if ( hasValue(value = item.get(key = TermUri.DWC_NAME_PUBLISHED_IN.toString()))){
 459                                 Set<String> keySet = getKeySet(key, fkMap);
 460                                 keySet.add(value);
 461                         }
 462                 }
 463
 464                 //nameAccordingTo
 465                 if ( hasValue(value = item.get(key = TermUri.DWC_NAME_ACCORDING_TO_ID.toString()))){
 466                         Set<String> keySet = getKeySet(key, fkMap);
 467                         keySet.add(value);
 468                 }
 469                 if ( hasValue(value = item.get(key = TermUri.DWC_NAME_ACCORDING_TO.toString()))){
 470                         Set<String> keySet = getKeySet(key, fkMap);
 471                         keySet.add(value);
 472                 }
 473
 474         }
 475
 476 //** ***************************** TO STRING *********************************************/
 477
 478         @Override
 479         public String toString(){
 480                 return this.getClass().getName();
 481         }
 482
 483
 484
 485 }