cdmlib-io/src/main/java/eu/etaxonomy/cdm/io/dwca/in/DwcTaxonStreamItem2CdmTaxonConverter.java

   1 /**
   2 * Copyright (C) 2009 EDIT
   3 * European Distributed Institute of Taxonomy
   4 * http://www.e-taxonomy.eu
   5 *
   6 * The contents of this file are subject to the Mozilla Public License Version 1.1
   7 * See LICENSE.TXT at the top of this package for the full license terms.
   8 */
   9 package eu.etaxonomy.cdm.io.dwca.in;
  10
  11 import java.net.URI;
  12 import java.util.ArrayList;
  13 import java.util.HashSet;
  14 import java.util.List;
  15 import java.util.Map;
  16 import java.util.Set;
  17 import java.util.UUID;
  18
  19 import org.apache.commons.lang.StringUtils;
  20 import org.apache.log4j.Logger;
  21
  22 import com.ibm.lsid.MalformedLSIDException;
  23
  24 import eu.etaxonomy.cdm.common.CdmUtils;
  25 import eu.etaxonomy.cdm.io.common.mapping.UndefinedTransformerMethodException;
  26 import eu.etaxonomy.cdm.io.stream.IPartitionableConverter;
  27 import eu.etaxonomy.cdm.io.stream.IReader;
  28 import eu.etaxonomy.cdm.io.stream.ItemFilter;
  29 import eu.etaxonomy.cdm.io.stream.ListReader;
  30 import eu.etaxonomy.cdm.io.stream.MappedCdmBase;
  31 import eu.etaxonomy.cdm.io.stream.PartitionableConverterBase;
  32 import eu.etaxonomy.cdm.io.stream.StreamImportBase;
  33 import eu.etaxonomy.cdm.io.stream.StreamImportStateBase;
  34 import eu.etaxonomy.cdm.io.stream.StreamItem;
  35 import eu.etaxonomy.cdm.io.stream.terms.TermUri;
  36 import eu.etaxonomy.cdm.model.common.Annotation;
  37 import eu.etaxonomy.cdm.model.common.CdmBase;
  38 import eu.etaxonomy.cdm.model.common.Extension;
  39 import eu.etaxonomy.cdm.model.common.ExtensionType;
  40 import eu.etaxonomy.cdm.model.common.IdentifiableSource;
  41 import eu.etaxonomy.cdm.model.common.Identifier;
  42 import eu.etaxonomy.cdm.model.common.LSID;
  43 import eu.etaxonomy.cdm.model.common.Language;
  44 import eu.etaxonomy.cdm.model.common.Marker;
  45 import eu.etaxonomy.cdm.model.common.MarkerType;
  46 import eu.etaxonomy.cdm.model.description.CommonTaxonName;
  47 import eu.etaxonomy.cdm.model.description.Distribution;
  48 import eu.etaxonomy.cdm.model.description.PresenceAbsenceTerm;
  49 import eu.etaxonomy.cdm.model.description.TaxonDescription;
  50 import eu.etaxonomy.cdm.model.location.NamedArea;
  51 import eu.etaxonomy.cdm.model.name.NomenclaturalCode;
  52 import eu.etaxonomy.cdm.model.name.Rank;
  53 import eu.etaxonomy.cdm.model.name.TaxonName;
  54 import eu.etaxonomy.cdm.model.reference.OriginalSourceType;
  55 import eu.etaxonomy.cdm.model.reference.Reference;
  56 import eu.etaxonomy.cdm.model.reference.ReferenceFactory;
  57 import eu.etaxonomy.cdm.model.taxon.Classification;
  58 import eu.etaxonomy.cdm.model.taxon.Synonym;
  59 import eu.etaxonomy.cdm.model.taxon.Taxon;
  60 import eu.etaxonomy.cdm.model.taxon.TaxonBase;
  61 import eu.etaxonomy.cdm.model.term.DefinedTerm;
  62 import eu.etaxonomy.cdm.model.term.DefinedTermBase;
  63 import eu.etaxonomy.cdm.strategy.exceptions.StringNotParsableException;
  64 import eu.etaxonomy.cdm.strategy.exceptions.UnknownCdmTypeException;
  65 import eu.etaxonomy.cdm.strategy.parser.NonViralNameParserImpl;
  66
  67 /**
  68  * @author a.mueller
  69  * @since 22.11.2011
  70  */
  71 public class  DwcTaxonStreamItem2CdmTaxonConverter<CONFIG extends DwcaDataImportConfiguratorBase, STATE extends StreamImportStateBase<CONFIG, StreamImportBase>>
  72         extends PartitionableConverterBase<CONFIG, STATE>
  73         implements IPartitionableConverter<StreamItem, IReader<CdmBase>, String>, ItemFilter<StreamItem> {
  74
    // Log4j logger used by this converter (e.g. for the info message written when a common name is added).
    private static final Logger logger = Logger.getLogger(DwcTaxonStreamItem2CdmTaxonConverter.class);

    //if this converter is used as filter we may not want to delete item parts during evaluation
    //(set through the two-argument constructor, false otherwise)
    boolean isFilterOnly = false;

    // Key under which the record identifier is read from (and later removed from) a StreamItem.
    private static final String ID = "id";
        // temporary key for the case that no dataset information is supplied, TODO use something better
        public static final String NO_DATASET = "no_dataset_jli773oebhjklw";

        // Name parser used to turn the scientific name string of a record into a TaxonName.
        private final NonViralNameParserImpl parser = NonViralNameParserImpl.NewInstance();
  85
  86         public DwcTaxonStreamItem2CdmTaxonConverter(STATE state) {
  87                 super(state);
  88         }
  89
  90     public DwcTaxonStreamItem2CdmTaxonConverter(STATE state, boolean isFilter) {
  91         super(state);
  92         this.isFilterOnly = isFilter;
  93     }
  94
  95     @Override
  96     public boolean toBeRemovedFromStream(StreamItem item) {
  97         if (!config.isDoSplitRelationshipImport()){
  98             return false;
  99         }else{
 100             if (isSynonym(item)){
 101                 return ! this.config.isDoSynonymRelationships();
 102             }else{
 103                 NomenclaturalCode nomCode = getNomCode(item);
 104                 Rank rank = getRank(item, nomCode);
 105                 boolean isHigherRank = rank == null || rank.isHigher(Rank.SPECIES());
 106                 if (isHigherRank){
 107                     return ! config.isDoHigherRankRelationships();
 108                 }else{
 109                     return ! config.isDoLowerRankRelationships();
 110                 }
 111             }
 112         }
 113     }
 114
 115     private boolean isSynonym(StreamItem item) {
 116         TaxonBase<?> taxonBase = getTaxonBase(item);
 117         return taxonBase instanceof Synonym;
 118     }
 119
 120         @Override
 121     public IReader<MappedCdmBase<? extends CdmBase>> map(StreamItem csvTaxonRecord){
 122                 List<MappedCdmBase<? extends CdmBase>> resultList = new ArrayList<>();
 123
 124                 //TODO what if not transactional?
 125                 Reference sourceReference = state.getTransactionalSourceReference();
 126                 String sourceReferenceDetail = null;
 127
 128                 //taxon
 129                 TaxonBase<?> taxonBase = getTaxonBase(csvTaxonRecord);
 130                 MappedCdmBase<TaxonBase<?>>  mcb = new MappedCdmBase<>(csvTaxonRecord.term, csvTaxonRecord.get(ID), taxonBase);
 131                 resultList.add(mcb);
 132
 133                 //original source
 134                 String id = csvTaxonRecord.get(ID);
 135                 IdentifiableSource source = taxonBase.addSource(OriginalSourceType.Import, id, "Taxon", sourceReference, sourceReferenceDetail);
 136                 MappedCdmBase<IdentifiableSource> mappedSource = new MappedCdmBase<>(csvTaxonRecord.get(ID), source);
 137                 resultList.add(mappedSource);
 138                 csvTaxonRecord.remove(ID);
 139
 140                 //rank
 141                 NomenclaturalCode nomCode = getNomCode(csvTaxonRecord);
 142                 Rank rank = getRank(csvTaxonRecord, nomCode);
 143
 144                 //name && name published in
 145                 TaxonName name = getScientificName(csvTaxonRecord, nomCode, rank, resultList, sourceReference);
 146                 taxonBase.setName(name);
 147
 148                 //nameAccordingTo
 149                 MappedCdmBase<Reference> sec = getNameAccordingTo(csvTaxonRecord, resultList);
 150
 151                 if (sec == null && state.getConfig().isUseSourceReferenceAsSec()){
 152                         sec = new MappedCdmBase<>(state.getTransactionalSourceReference());
 153                 }
 154                 if (sec != null){
 155                         taxonBase.setSec(sec.getCdmBase());
 156                 }
 157
 158                 //classification
 159                 handleDataset(csvTaxonRecord, taxonBase, resultList, sourceReference, sourceReferenceDetail);
 160
 161                 //NON core
 162             //term="http://purl.org/dc/terms/identifier"
 163                 //currently only LSIDs or generic
 164                 handleIdentifier(csvTaxonRecord, taxonBase);
 165
 166                 //TaxonRemarks
 167                 handleTaxonRemarks(csvTaxonRecord, taxonBase);
 168
 169                 //TDWG_1
 170                 handleTdwgArea(csvTaxonRecord, taxonBase);
 171
 172                 //VernecularName
 173                 handleCommonNames(csvTaxonRecord, taxonBase);
 174
 175                 //External Sources, ID's and References
 176                 handleIdentifiableObjects(csvTaxonRecord, taxonBase);
 177
 178
 179                 //                  <!-- Top level group; listed as kingdom but may be interpreted as domain or superkingdom
 180 //                       The following eight groups are recognized: Animalia, Archaea, Bacteria, Chromista,
 181 //                       Fungi, Plantae, Protozoa, Viruses -->
 182 //                  <field index='10' term='http://rs.tdwg.org/dwc/terms/kingdom'/>
 183
 184 //                  <!-- Phylum in which the taxon has been classified -->
 185 //                  <field index='11' term='http://rs.tdwg.org/dwc/terms/phylum'/>
 186
 187                 //                  <!-- Class in which the taxon has been classified -->
 188 //                  <field index='12' term='http://rs.tdwg.org/dwc/terms/class'/>
 189
 190                 //                  <!-- Order in which the taxon has been classified -->
 191 //                  <field index='13' term='http://rs.tdwg.org/dwc/terms/order'/>
 192
 193                 //                  <!-- Family in which the taxon has been classified -->
 194 //                  <field index='14' term='http://rs.tdwg.org/dwc/terms/family'/>
 195
 196                 //                  <!-- Genus in which the taxon has been classified -->
 197 //                  <field index='15' term='http://rs.tdwg.org/dwc/terms/genus'/>
 198
 199                 //                  <!-- Subgenus in which the taxon has been classified -->
 200 //                  <field index='16' term='http://rs.tdwg.org/dwc/terms/subgenus'/>
 201 //                  <!-- Specific epithet; for hybrids, the multiplication symbol is included in the epithet -->
 202
 203 //                  <field index='17' term='http://rs.tdwg.org/dwc/terms/specificEpithet'/>
 204 //                  <!-- Infraspecific epithet -->
 205
 206 //                  <field index='18' term='http://rs.tdwg.org/dwc/terms/infraspecificEpithet'/>
 207 //                  <!-- Authorship -->
 208
 209 //                  <field index='19' term='http://rs.tdwg.org/dwc/terms/scientificNameAuthorship'/>
 210 //              ==> see scientific name
 211 //
 212 //              <!-- Acceptance status published in -->
 213 //                  <field index='20' term='http://purl.org/dc/terms/source'/>
 214 //                  <!-- Reference in which the scientific name was first published -->
 215 //                  <field index='21' term='http://rs.tdwg.org/dwc/terms/namePublishedIn'/>
 216 //                  <!-- Taxon scrutinized by -->
 217 //                  <field index='22' term='http://rs.tdwg.org/dwc/terms/nameAccordingTo'/>
 218 //                  <!-- Scrutiny date -->
 219 //                  <field index='23' term='http://purl.org/dc/terms/modified'/>
 220 //                  <!-- Additional data for the taxon -->
 221 //                  <field index='24' term='http://purl.org/dc/terms/description'/>
 222 //                  </core>
 223
 224                 handleModified(csvTaxonRecord, taxonBase);
 225
 226                 handleIsExtinct(csvTaxonRecord, taxonBase);
 227
 228
 229
 230                 return new ListReader<>(resultList);
 231         }
 232
 233
 234
 235     /**
 236      * @param csvTaxonRecord
 237      * @param taxonBase
 238      */
 239     private void handleIsExtinct(StreamItem item, TaxonBase<?> taxonBase) {
 240         String isExtinctStr = item.get(TermUri.GBIF_IS_EXTINCT);
 241         if (isBlank(isExtinctStr)){
 242             return;
 243         }
 244         Boolean isExtinct = getBoolean(isExtinctStr, item);
 245         if (isExtinct != null){
 246             try {
 247                 UUID isExtinctUuid = state.getTransformer().getMarkerTypeUuid("isExtinct");
 248                 MarkerType markerType = state.getCurrentIO().getMarkerType(state, isExtinctUuid, "extinct", "extinct", "extinct");
 249                 Marker.NewInstance(taxonBase, isExtinct, markerType);
 250
 251             } catch (UndefinedTransformerMethodException e) {
 252                 String message = "GetMarkerType not available for import. This should not happen. Please conntact developer";
 253                 fireWarningEvent(message, item.getLocation(), 8);
 254             }
 255         }
 256
 257     }
 258
 259     /**
 260      * @param item
 261      * @param isExtinctStr
 262      * @return
 263      */
 264     private Boolean getBoolean(String booleanStr, StreamItem item) {
 265         try {
 266             return Boolean.valueOf(booleanStr);
 267         } catch (Exception e) {
 268             String message = "Boolean value could not be parsed";
 269             fireWarningEvent(message, item, 4);
 270             return null;
 271         }
 272     }
 273
 274
 275
 276     /**
 277      * @param csvTaxonRecord
 278      * @param taxonBase
 279      */
 280     private void handleModified(StreamItem item, TaxonBase<?> taxonBase) {
 281         String modifiedStr = item.get(TermUri.DC_MODIFIED);
 282         if (isBlank(modifiedStr)){
 283             return;
 284         }
 285
 286         try {
 287             UUID modifiedUuid = state.getTransformer().getExtensionTypeUuid("modified");
 288             ExtensionType extensionType = state.getCurrentIO().getExtensionType(state, modifiedUuid, "modified", "modified", "modified");
 289             Extension.NewInstance(taxonBase, modifiedStr, extensionType);
 290
 291         } catch (UndefinedTransformerMethodException e) {
 292             String message = "GetMarkerType not available for import. This should not happen. Please conntact developer";
 293             fireWarningEvent(message, item.getLocation(), 8);
 294         }
 295
 296
 297     }
 298
 299     /**
 300          * @param item
 301          * @param taxonBase
 302          */
 303         private void handleIdentifiableObjects(StreamItem item,TaxonBase<?> taxonBase) {
 304
 305                 String references = item.get(TermUri.DC_REFERENCES);
 306
 307                 if (references == null || references == "") {
 308                         references = item.get(TermUri.DWC_NAME_PUBLISHED_IN_ID);//lorna temporary until Scratchpads move the reference to the correct place.
 309                 }
 310
 311                 if (StringUtils.isNotBlank(references)){
 312                         URI uri = makeUriIfIs(references);
 313                         if (uri != null){
 314                                 Extension.NewInstance(taxonBase, references, ExtensionType.URL());
 315                         }else{
 316                                 String message = "Non-URI Dublin Core References not yet handled for taxa. References is: %s";
 317                                 fireWarningEvent(String.format(message, references), item, 6);
 318                         }
 319                 }
 320
 321
 322                 //TODO: Finish properly
 323                 String id = item.get(TermUri.CDM_SOURCE_IDINSOURCE);
 324                 String idNamespace = item.get(TermUri.CDM_SOURCE_IDNAMESPACE);
 325                 String reference = item.get(TermUri.CDM_SOURCE_REFERENCE);
 326                 if(StringUtils.isNotBlank(id) && StringUtils.isNotBlank(idNamespace) && StringUtils.isNotBlank(reference)){
 327                         Reference ref = ReferenceFactory.newGeneric();
 328                         ref.setTitle(reference);
 329                         Taxon taxon = (Taxon) taxonBase;
 330                         taxon.addSource(OriginalSourceType.Import, id, idNamespace, ref, null);
 331                 }
 332
 333         }
 334
 335
 336         /**
 337          * If str is an uri it returns is as an {@link URI}. If not it returns <code>null</code>.
 338          * @param str
 339          * @return the URI.
 340          */
 341         private URI makeUriIfIs(String str) {
 342                 if (! str.startsWith("http:")){
 343                         return null;
 344                 }else{
 345                         try {
 346                                 URI uri = URI.create(str);
 347                                 return uri;
 348                         } catch (Exception e) {
 349                                 return null;
 350                         }
 351                 }
 352
 353         }
 354
 355
 356         /**
 357          * @param item
 358          * @param taxonBase
 359          */
 360         private void handleCommonNames(StreamItem item,TaxonBase<?> taxonBase) {
 361                 //TODO: handle comma separated values
 362                 String commonName = item.get(TermUri.DWC_VERNACULAR_NAME);
 363                 if (StringUtils.isNotBlank(commonName)){
 364
 365                         Language language = getLanguage(item);
 366                         CommonTaxonName commonTaxonName = CommonTaxonName.NewInstance(commonName, language);
 367                         if(taxonBase instanceof Taxon){
 368                                 Taxon taxon = (Taxon) taxonBase;
 369                                 TaxonDescription taxonDescription = getTaxonDescription(taxon, false);
 370                                 taxonDescription.addElement(commonTaxonName);
 371                                 logger.info("Common name " + commonName + " added to " + taxon.getTitleCache());
 372                         }
 373                 }
 374         }
 375
 376
 377
 378         /**
 379          * @param csvTaxonRecord
 380          * @param taxonBase
 381          */
 382         private void handleTdwgArea(StreamItem item, TaxonBase<?> taxonBase) {
 383                 String tdwg_area = item.get(TermUri.DWC_COUNTRY_CODE);
 384                 if (tdwg_area != null){
 385                 if(taxonBase instanceof Synonym){
 386                         Synonym synonym = CdmBase.deproxy(taxonBase, Synonym.class);
 387                         Taxon acceptedTaxon = synonym.getAcceptedTaxon();
 388                         if (acceptedTaxon != null){
 389                             TaxonDescription td = getTaxonDescription(acceptedTaxon, false);
 390                             NamedArea area = NamedArea.getAreaByTdwgAbbreviation(tdwg_area);
 391
 392                             if (area == null){
 393                                 area = NamedArea.getAreaByTdwgLabel(tdwg_area);
 394                             }
 395                             if (area != null){
 396                                 Distribution distribution = Distribution.NewInstance(area, PresenceAbsenceTerm.PRESENT());
 397                                 td.addElement(distribution);
 398                             }
 399                         }
 400                 }
 401                 if(!(taxonBase instanceof Synonym)){
 402                         Taxon taxon = CdmBase.deproxy(taxonBase, Taxon.class);
 403                         TaxonDescription td = getTaxonDescription(taxon, false);
 404                         NamedArea area = NamedArea.getAreaByTdwgAbbreviation(tdwg_area);
 405
 406                         if (area == null){
 407                                 area = NamedArea.getAreaByTdwgLabel(tdwg_area);
 408                         }
 409                         if (area != null){
 410                                 Distribution distribution = Distribution.NewInstance(area, PresenceAbsenceTerm.PRESENT());
 411                                 td.addElement(distribution);
 412                         }
 413                 }
 414         }
 415         }
 416
 417
 418         /**
 419          * @param item
 420          * @param taxonBase
 421          */
 422         private void handleTaxonRemarks(StreamItem item,TaxonBase<?> taxonBase) {
 423                 String comment = item.get(TermUri.DWC_TAXON_REMARKS);
 424                 Language language = getLanguage(item);
 425                 if(StringUtils.isNotBlank(comment)){
 426                                 Annotation annotation = Annotation.NewInstance(comment, language);
 427                                 taxonBase.addAnnotation(annotation);
 428                 }else{
 429 //                      String message = "Comment is empty or some error appeared while saving: %s";
 430 ////                    message = String.format(message);
 431 //                      fireWarningEvent(message, item, 1);
 432                 }
 433         }
 434
 435
 436         //TODO handle non LSIDs
 437         //TODO handle LSIDs for names
 438         private void handleIdentifier(StreamItem csvTaxonRecord, TaxonBase<?> taxonBase) {
 439                 String identifier = csvTaxonRecord.get(TermUri.DC_IDENTIFIER);
 440                 if (StringUtils.isNotBlank(identifier)){
 441                         if (identifier.trim().startsWith("urn:lsid")){
 442                                 try {
 443                                         LSID lsid = new LSID(identifier);
 444                                         taxonBase.setLsid(lsid);
 445                                 } catch (MalformedLSIDException e) {
 446                                         String message = "LSID is malformed and can't be handled as LSID: %s";
 447                                         message = String.format(message, identifier);
 448                                         fireWarningEvent(message, csvTaxonRecord, 4);
 449                                         Identifier.NewInstance(taxonBase, identifier, DefinedTermBase.getTermByClassAndUUID(DefinedTerm.class, DefinedTerm.uuidLsid));
 450                                 }
 451                         }else{
 452                                 Identifier.NewInstance(taxonBase, identifier, null);
 453                             String message = "Identifier type not recognized. Create generic identifier: %s";
 454                                 message = String.format(message, identifier);
 455                                 fireWarningEvent(message, csvTaxonRecord, 1);
 456                         }
 457                 }
 458
 459         }
 460
 461
 462         private void handleDataset(StreamItem item, TaxonBase<?> taxonBase,
 463                 List<MappedCdmBase<? extends CdmBase>> resultList,
 464                 Reference sourceReference,
 465                 String sourceReferecenDetail) {
 466
 467                 TermUri idTerm = TermUri.DWC_DATASET_ID;
 468                 TermUri strTerm = TermUri.DWC_DATASET_NAME;
 469
 470                 if (config.isDatasetsAsClassifications()){
 471                         String datasetId = CdmUtils.Nz(item.get(idTerm)).trim();
 472                         String datasetName = CdmUtils.Nz(item.get(strTerm)).trim();
 473                                 if (CdmUtils.areBlank(datasetId, datasetName) ){
 474                                 datasetId = NO_DATASET;
 475                         }
 476
 477                         //check id
 478                         boolean classificationExists = state.exists(idTerm.toString() , datasetId, Classification.class);
 479
 480                         //check name
 481                         if (!classificationExists){
 482                                 classificationExists = state.exists(strTerm.toString() , datasetName, Classification.class);
 483                         }
 484
 485                         //if not exists, create new
 486                         if (! classificationExists){
 487                                 String classificationName = StringUtils.isBlank(datasetName)? datasetId : datasetName;
 488                                 if (classificationName.equals(NO_DATASET)){
 489                                         classificationName = config.getClassificationName();
 490                                         //classificationName = "Classification (no name)";  //TODO define by config or zipfile or metadata
 491                                 }
 492
 493                                 String classificationId = StringUtils.isBlank(datasetId)? datasetName : datasetId;
 494                                 Classification classification = Classification.NewInstance(classificationName);
 495                                 //source
 496                                 IdentifiableSource source = classification.addSource(OriginalSourceType.Import, classificationId, "Dataset", sourceReference, sourceReferecenDetail);
 497                                 //add to result
 498                                 resultList.add(new MappedCdmBase<>(idTerm, datasetId, classification));
 499                                 resultList.add(new MappedCdmBase<>(strTerm, datasetName, classification));
 500                                 resultList.add(new MappedCdmBase<>(source));
 501                                 //TODO this is not so nice but currently necessary as classifications are requested in the same partition
 502                                 state.putMapping(idTerm.toString(), classificationId, classification);
 503                                 state.putMapping(strTerm.toString(), classificationName, classification);
 504                         }
 505                 }else if (config.isDatasetsAsSecundumReference() || config.isDatasetsAsOriginalSource()){
 506                         MappedCdmBase<Reference> mappedCitation = getReference(item, resultList, idTerm, strTerm, true);
 507                         if (mappedCitation != null){
 508                                 Reference ref = mappedCitation.getCdmBase();
 509                                 if (config.isDatasetsAsSecundumReference()){
 510                                         //dataset as secundum reference
 511                                         taxonBase.setSec(ref);
 512                                 }else{
 513                                         //dataset as original source
 514                                         taxonBase.addSource(OriginalSourceType.Import, null, null, ref, null);
 515                                 }
 516                         }
 517                 }else{
 518                         String message = "DatasetUse type not yet implemented. Can't import dataset information.";
 519                         fireWarningEvent(message, item, 4);
 520                 }
 521
 522                 //remove to later check if all attributes were used
 523                 removeItemInfo(item, idTerm);
 524                 removeItemInfo(item, strTerm);
 525         }
 526
 527
 528         @Override
 529         public String getSourceId(StreamItem item) {
 530                 String id = item.get(ID);
 531                 return id;
 532         }
 533
 534         private MappedCdmBase<Reference> getNameAccordingTo(StreamItem item, List<MappedCdmBase<? extends CdmBase>> resultList) {
 535                 if (config.isDatasetsAsSecundumReference()){
 536                         //TODO store nameAccordingTo info some where else or let the user define where to store it.
 537                         return null;
 538                 }else{
 539                         TermUri idTerm = TermUri.DWC_NAME_ACCORDING_TO_ID;
 540                         TermUri strTerm = TermUri.DWC_NAME_ACCORDING_TO;
 541                         MappedCdmBase<Reference> secRef = getReference(item, resultList, idTerm, strTerm, false);
 542                         return secRef;
 543                 }
 544         }
 545
 546         private NomenclaturalCode getNomCode(StreamItem item) {
 547                 String strNomCode = getValue(item, TermUri.DWC_NOMENCLATURAL_CODE);
 548                 NomenclaturalCode nomCode = null;
 549                 // by Nomcenclatural Code
 550                 if (strNomCode != null){
 551                         nomCode = NomenclaturalCode.fromString(strNomCode);
 552                         if (nomCode == null){
 553                                 String message = "NomCode '%s' not recognized";
 554                                 message = String.format(message, strNomCode);
 555                                 fireWarningEvent(message, item, 4);
 556                         }else{
 557                                 return nomCode;
 558                         }
 559                 }
 560                 // by Kingdom
 561                 String strKingdom = getValue(item, TermUri.DWC_KINGDOM);
 562                 if (strKingdom != null){
 563                         if (strKingdom.equalsIgnoreCase("Plantae")){
 564                                 nomCode = NomenclaturalCode.ICNAFP;
 565                         }else if (strKingdom.equalsIgnoreCase("Fungi")){
 566                                 nomCode = NomenclaturalCode.ICNAFP;
 567                         }else if (strKingdom.equalsIgnoreCase("Animalia")){
 568                                 nomCode = NomenclaturalCode.ICZN;
 569                         }else if (strKingdom.equalsIgnoreCase("Protozoa")){
 570                                 nomCode = NomenclaturalCode.ICZN;
 571                         }
 572                 }
 573
 574                 //TODO further kingdoms
 575                 if (nomCode == null){
 576                         //TODO warning
 577                         if (config.getNomenclaturalCode() != null){
 578                                 nomCode = config.getNomenclaturalCode();
 579                         }
 580                 }
 581                 return nomCode;
 582         }
 583
 584
 585         private TaxonName getScientificName(StreamItem item, NomenclaturalCode nomCode, Rank rank, List<MappedCdmBase<? extends CdmBase>> resultList, Reference sourceReference) {
 586                 TaxonName name = null;
 587                 String strScientificName = getValue(item, TermUri.DWC_SCIENTIFIC_NAME);
 588                 //Name
 589                 if (strScientificName != null){
 590                         name = (TaxonName)parser.parseFullName(strScientificName, nomCode, rank);
 591                         if ( rank != null && name != null && name.getRank() != null &&  ! rank.equals(name.getRank())){
 592                                 if (config.isValidateRankConsistency()){
 593                                         String message = "Parsed rank %s (%s) differs from rank %s given by fields 'taxonRank' or 'verbatimTaxonRank'";
 594                                         message = String.format(message, name.getRank().getTitleCache(), strScientificName, rank.getTitleCache());
 595                                         fireWarningEvent(message, item, 4);
 596                                 }
 597                         }
 598                         checkAuthorship(name, item);
 599                         resultList.add(new MappedCdmBase(TermUri.DWC_SCIENTIFIC_NAME, strScientificName, name));
 600                 }
 601                 //By ID
 602                 String strScientificNameId = getValue(item, TermUri.DWC_SCIENTIFIC_NAME_ID);
 603                 if (strScientificNameId != null){
 604                         if (config.isScientificNameIdAsOriginalSourceId()){
 605                                 if (name != null){
 606                                         IdentifiableSource source = IdentifiableSource.NewInstance(OriginalSourceType.Import, strScientificNameId, TermUri.DWC_SCIENTIFIC_NAME_ID.toString(), sourceReference, null);
 607                                         name.addSource(source);
 608                                 }
 609                         }else{
 610                                 String message = "ScientificNameId not yet implemented: '%s'";
 611                                 message = String.format(message, strScientificNameId);
 612                                 fireWarningEvent(message, item, 4);
 613                         }
 614                 }
 615
 616                 //namePublishedIn
 617                 TermUri idTerm = TermUri.DWC_NAME_PUBLISHED_IN_ID;
 618                 TermUri strTerm = TermUri.DWC_NAME_PUBLISHED_IN;
 619                 MappedCdmBase<Reference> nomRef = getReference(item, resultList, idTerm, strTerm, false);
 620
 621                 if (name != null){
 622                         if (nomRef != null){
 623                                 name.setNomenclaturalReference(nomRef.getCdmBase());  //check if name already has a nomRef, shouldn't be the case usually
 624                         }
 625                 }else{
 626                         if (nomRef != null){
 627                                 String message = "NamePublishedIn information available but no name exists";
 628                                 fireWarningEvent(message, item, 4);
 629                         }
 630                 }
 631                 return name;
 632         }
 633
 634
 635         /**
 636          * General method to handle references used for multiple attributes.
 637          * @param item
 638          * @param resultList
 639          * @param idTerm
 640          * @param strTerm
 641          * @param idIsInternal
 642          * @return
 643          */
 644         private MappedCdmBase<Reference> getReference(StreamItem item,
 645                 List<MappedCdmBase<? extends CdmBase>> resultList, TermUri idTerm,
 646                 TermUri strTerm, boolean idIsInternal) {
 647                 Reference newRef = null;
 648                 Reference sourceCitation = null;
 649
 650                 MappedCdmBase<Reference> result = null;
 651                 if (exists(idTerm, item) || exists(strTerm, item)){
 652                         String refId = CdmUtils.Nz(item.get(idTerm)).trim();
 653                         String refStr = CdmUtils.Nz(item.get(strTerm)).trim();
 654                         if (StringUtils.isNotBlank(refId)){
 655                                 List<Reference> references = state.get(idTerm.toString(), refId, Reference.class);
 656                                 if (references.size() == 0){
 657                                         if (! idIsInternal){
 658                                                 //references should already exist in store if not linking to external links like URLs
 659                                                 String message = "External namePublishedInIDs are not yet supported";
 660                                                 fireWarningEvent(message, item, 4);//set to DEBUG
 661                                         }else{
 662                                                 newRef = ReferenceFactory.newGeneric();  //TODO handle other types if possible
 663                                                 newRef.addSource(OriginalSourceType.Import, refId, idTerm.toString(), sourceCitation, null);
 664                                                 MappedCdmBase<Reference> idResult = new MappedCdmBase<>(idTerm, refId, newRef);
 665                                                 resultList.add(idResult);
 666                                         }
 667                                 }else{
 668                                         //TODO handle list.size > 1 , do we need a list here ?
 669                                         result = new MappedCdmBase<Reference>(idTerm, refId , references.get(0));
 670                                 }
 671                         }
 672                         if (result == null){
 673                                 List<Reference> nomRefs = state.get(strTerm.toString(), refStr, Reference.class);
 674                                 if (nomRefs.size() > 0){
 675                                         //TODO handle list.size > 1 , do we need a list here ?
 676                                         result = new MappedCdmBase<>(strTerm, refStr , nomRefs.get(0));
 677                                 }else{
 678                                         // new Reference
 679                                         if (newRef == null){
 680                                                 newRef = ReferenceFactory.newGeneric();  //TODO handle other types if possible
 681                                         }
 682                                         newRef.setTitleCache(refStr, true);
 683                                         //TODO distinguish available year, authorship, etc. if
 684                                         result = new MappedCdmBase<>(strTerm, refStr, newRef);
 685                                         resultList.add(result);
 686                                 }
 687                         }
 688                 }
 689                 return result;
 690         }
 691
 692
 693         //TODO we may configure in configuration that scientific name never includes Authorship
 694         private void checkAuthorship(TaxonName nameBase, StreamItem item) {
 695                 if (nameBase.isViral()){
 696                         return;
 697                 }
 698                 String strAuthors = getValue(item, TermUri.DWC_SCIENTIFIC_NAME_AUTHORS);
 699
 700                 if (! nameBase.isProtectedTitleCache()){
 701                         if (isBlank(nameBase.getAuthorshipCache())){
 702                                 if (nameBase.isBotanical() || nameBase.isZoological()){
 703                                         //TODO can't we also parse NonViralNames correctly ?
 704                                         try {
 705                                                 parser.parseAuthors(nameBase, strAuthors);
 706                                         } catch (StringNotParsableException e) {
 707                                             nameBase.setAuthorshipCache(strAuthors);
 708                                         }
 709                                 }else{
 710                                     nameBase.setAuthorshipCache(strAuthors);
 711                                 }
 712                                 //TODO throw warning (scientific name should always include authorship) by DwC definition
 713                         }
 714                 }
 715
 716         }
 717
 718
 719         private Rank getRank(StreamItem csvTaxonRecord, NomenclaturalCode nomCode) {
 720                 boolean USE_UNKNOWN = true;
 721                 Rank rank = null;
 722                 String strRank = getValue(csvTaxonRecord,TermUri.DWC_TAXON_RANK);
 723                 String strVerbatimRank = getValue(csvTaxonRecord,TermUri.DWC_VERBATIM_TAXON_RANK);
 724                 if (strRank != null){
 725                         try {
 726                                 rank = Rank.getRankByEnglishName(strRank, nomCode, USE_UNKNOWN);
 727                                 if (rank.equals(Rank.UNKNOWN_RANK())){
 728                                         rank = Rank.getRankByNameOrIdInVoc(strRank, USE_UNKNOWN);
 729                                         if (rank.equals(Rank.UNKNOWN_RANK())){
 730                                                 String message = "Rank can not be defined for '%s'";
 731                                                 message = String.format(message, strRank);
 732                                                 fireWarningEvent(message, csvTaxonRecord, 4);
 733                                         }
 734                                 }
 735                         } catch (UnknownCdmTypeException e) {
 736                                 //should not happen as USE_UNKNOWN is used
 737                                 rank = Rank.UNKNOWN_RANK();
 738                         }
 739                 }
 740                 if ( (rank == null || rank.equals(Rank.UNKNOWN_RANK())) && strVerbatimRank != null){
 741                         try {
 742                                 rank = Rank.getRankByNameOrIdInVoc(strVerbatimRank, USE_UNKNOWN);
 743                                 if (rank.equals(Rank.UNKNOWN_RANK())){
 744                                         String message = "Rank can not be defined for '%s'";
 745                                         message = String.format(message, strVerbatimRank);
 746                                         fireWarningEvent(message, csvTaxonRecord, 4);
 747                                 }
 748                         } catch (UnknownCdmTypeException e) {
 749                                 //should not happen as USE_UNKNOWN is used
 750                                 rank = Rank.UNKNOWN_RANK();
 751                         }
 752                 }
 753                 return rank;
 754         }
 755
 756
 757         /**
 758          * Creates an empty taxon object with a given status.
 759          * <i>Empty</i> taxon means, without a defined name or sec.
 760          * @param item
 761          * @return
 762          */
 763         private TaxonBase<?> getTaxonBase(StreamItem item) {
 764                 TaxonName name = null;
 765                 Reference sec = null;
 766                 TaxonBase<?> result;
 767                 String taxStatus = item.get(TermUri.DWC_TAXONOMIC_STATUS);
 768                 String status = "";
 769
 770                 if (taxStatus != null){
 771                         if (taxStatus.matches("accepted.*|valid")){
 772                                 status += "A";
 773                         } else if (taxStatus.matches(".*synonym|invalid|not accepted")){   //not accepted comes from scratchpads
 774                                 status += "S";
 775                         } else if (taxStatus.matches("misapplied.*")){
 776                                 status += "M";
 777                         } else{
 778                                 status += "?";
 779                         }
 780                         removeItemInfo(item, TermUri.DWC_TAXONOMIC_STATUS);
 781                 }
 782                 if (! CdmUtils.isBlank(item.get(TermUri.DWC_ACCEPTED_NAME_USAGE_ID))){
 783                         // acceptedNameUsageId = id
 784                         if (getSourceId(item).equals(item.get(TermUri.DWC_ACCEPTED_NAME_USAGE_ID))){
 785                                 status += "A";
 786                         }else{
 787                                 status += "S";
 788                         }
 789                 }
 790                 if (status.contains("A") || status.contains("M")){
 791                         result = Taxon.NewInstance(name, sec);
 792                         if (status.contains("S") && ! status.contains("M") ){
 793                                 String message = "Ambigous taxon status (%s)";
 794                                 message = String.format(message, status);
 795                                 fireWarningEvent(message, item, 6);
 796                         }
 797                 } else if (status.contains("S")){
 798                         result = Synonym.NewInstance(name, sec);
 799                 } else{
 800                         result = Taxon.NewUnknownStatusInstance(name, sec);
 801                 }
 802
 803                 return result;
 804
 805         }
 806
 807
 808
 809     /**
 810          * @param item
 811          * @return
 812          */
 813         private Language getLanguage(StreamItem item) {
 814                 String langItem = item.get(TermUri.DC_LANGUAGE);
 815                 Language language = null;
 816
 817                 if(StringUtils.equalsIgnoreCase(langItem, "de")){
 818                         language = Language.GERMAN();
 819                 }else if(StringUtils.equalsIgnoreCase(langItem, "en")){
 820                         language = Language.ENGLISH();
 821                 }else{
 822                         language = Language.DEFAULT();
 823                 }
 824                 return language;
 825         }
 826
 827 // ********************** PARTITIONABLE ****************************************/
 828
 829
 830         @Override
 831         protected void makeForeignKeysForItem(StreamItem item, Map<String, Set<String>> fkMap) {
 832                 String value;
 833                 String key;
 834
 835                 //namePublishedIn
 836                 if ( hasValue(value = item.get(key = TermUri.DWC_NAME_PUBLISHED_IN_ID.toString()))){
 837                         Set<String> keySet = getKeySet(key, fkMap);
 838                         keySet.add(value);
 839                 }
 840                 if (config.isDeduplicateNamePublishedIn()){
 841                         if ( hasValue(value = item.get(key = TermUri.DWC_NAME_PUBLISHED_IN.toString()))){
 842                                 Set<String> keySet = getKeySet(key, fkMap);
 843                                 keySet.add(value);
 844                         }
 845                 }
 846
 847                 //nameAccordingTo
 848                 if (! config.isDatasetsAsSecundumReference()){
 849                         if ( hasValue(value = item.get(key = TermUri.DWC_NAME_ACCORDING_TO_ID.toString()))){
 850                                 Set<String> keySet = getKeySet(key, fkMap);
 851                                 keySet.add(value);
 852                         }
 853                         if ( hasValue(value = item.get(key = TermUri.DWC_NAME_ACCORDING_TO.toString()))){
 854                                 Set<String> keySet = getKeySet(key, fkMap);
 855                                 keySet.add(value);
 856                         }
 857                 }
 858
 859                 //dataset
 860                 if ( hasValue(value = item.get(key = TermUri.DWC_DATASET_ID.toString()))){
 861                         Set<String> keySet = getKeySet(key, fkMap);
 862                         keySet.add(value);
 863                 }
 864                 if ( hasValue(value = item.get(key = TermUri.DWC_DATASET_NAME.toString()))){
 865                         Set<String> keySet = getKeySet(key, fkMap);
 866                         keySet.add(value);
 867                 }
 868
 869         }
 870
 871
 872         @Override
 873         public Set<String> requiredSourceNamespaces() {
 874                 Set<String> result = new HashSet<>();
 875                 result.add(TermUri.DWC_NAME_PUBLISHED_IN_ID.toString());
 876                 result.add(TermUri.DWC_NAME_PUBLISHED_IN.toString());
 877                 if (!config.isDatasetsAsSecundumReference()){
 878                         result.add(TermUri.DWC_NAME_ACCORDING_TO_ID.toString());
 879                         result.add(TermUri.DWC_NAME_ACCORDING_TO.toString());
 880                 }
 881                 result.add(TermUri.DWC_DATASET_ID.toString());
 882                 result.add(TermUri.DWC_DATASET_NAME.toString());
 883                 return result;
 884         }
 885
 886
 887     /**
 888      * @param item
 889      * @param dwcTaxonomicStatus
 890      */
 891     private void removeItemInfo(StreamItem item, TermUri dwcTaxonomicStatus) {
 892         if (!isFilterOnly){
 893             item.remove(dwcTaxonomicStatus);
 894         }
 895     }
 896
 897
 898 //** ***************************** TO STRING *********************************************/
 899
 900         @Override
 901         public String toString(){
 902                 return this.getClass().getName();
 903         }
 904 }