Project

General

Profile

« Previous | Next » 

Revision 14295b4b

Added by Andreas Müller over 5 years ago

ref #7798 implement and improve occurrence.nameUsedInSource import for freetext and referenced names

View differences:

app-import/src/main/java/eu/etaxonomy/cdm/io/berlinModel/in/BerlinModelOccurrenceSourceImport.java
13 13
import java.sql.SQLException;
14 14
import java.util.HashMap;
15 15
import java.util.HashSet;
16
import java.util.List;
16 17
import java.util.Map;
17 18
import java.util.Set;
18 19

  
......
26 27
import eu.etaxonomy.cdm.io.common.Source;
27 28
import eu.etaxonomy.cdm.model.common.CdmBase;
28 29
import eu.etaxonomy.cdm.model.common.OriginalSourceType;
30
import eu.etaxonomy.cdm.model.common.RelationshipBase.Direction;
29 31
import eu.etaxonomy.cdm.model.description.DescriptionElementBase;
30 32
import eu.etaxonomy.cdm.model.description.DescriptionElementSource;
31 33
import eu.etaxonomy.cdm.model.description.Distribution;
32 34
import eu.etaxonomy.cdm.model.description.TaxonDescription;
35
import eu.etaxonomy.cdm.model.name.NameRelationshipType;
33 36
import eu.etaxonomy.cdm.model.name.TaxonName;
34 37
import eu.etaxonomy.cdm.model.reference.Reference;
38
import eu.etaxonomy.cdm.model.taxon.Synonym;
35 39
import eu.etaxonomy.cdm.model.taxon.Taxon;
40
import eu.etaxonomy.cdm.model.taxon.TaxonBase;
41
import eu.etaxonomy.cdm.model.taxon.TaxonNode;
36 42

  
37 43

  
38 44
/**
......
42 48
@Component
43 49
public class BerlinModelOccurrenceSourceImport  extends BerlinModelImportBase {
44 50

  
51
    /**
52
     *
53
     */
54
    private static final String EXACT = "(exact) ";
55

  
45 56
    private static final long serialVersionUID = 1139543760239436841L;
46 57

  
47 58
    private static final Logger logger = Logger.getLogger(BerlinModelOccurrenceSourceImport.class);
......
94 105
		}
95 106
		super.doInvoke(state);
96 107
		sourceNumberRefIdMap = null;
108
		nameCache2NameIdMap = null;
97 109
		if (notFoundReferences.size()>0){
98 110
			String unfound = "'" + CdmUtils.concat("','", notFoundReferences.toArray(new String[]{})) + "'";
99 111
			logger.warn("Not found references: " + unfound);
......
234 246
    private Set<String> handleOldNames(Set<String> oldNamesSet) {
235 247
        Set<String> oldNameIdSet = new HashSet<>();
236 248

  
237
        for(String oldName : oldNamesSet){
238
            if (isNotBlank(oldName)){
239
                Set<Integer> nameIds = nameCache2NameIdMap.get(oldName);
240
                for (Integer nameId : nameIds){
241
                    oldNameIdSet.add(String.valueOf(nameId));
249
        try {
250
            for(String oldName : oldNamesSet){
251
                if (isNotBlank(oldName)){
252
                    Set<Integer> nameIds = nameCache2NameIdMap.get(oldName);
253
                    if (nameIds != null){
254
                        for (Integer nameId : nameIds){
255
                            oldNameIdSet.add(String.valueOf(nameId));
256
                        }
257
                    }
242 258
                }
243 259
            }
260
        } catch (Exception e) {
261
            e.printStackTrace();
262
            logger.error("Exception in handleOldNames" + e.getMessage());
244 263
        }
245 264
        return oldNameIdSet;
246 265
    }
......
253 272
	 * @param oldNameFk
254 273
	 * @return
255 274
	 */
256
	boolean isFirstTimeNoNameByService = true;
257 275
	private TaxonName getName(BerlinModelImportState state, String oldName, Integer oldNameFk, Integer occSourceId, Distribution distribution) {
258 276
		TaxonName taxonName = (TaxonName)state.getRelatedObject(BerlinModelTaxonNameImport.NAMESPACE, String.valueOf(oldNameFk));
259 277
		if (oldNameFk != null && taxonName == null){
......
261 279
		}
262 280
		if (isNotBlank(oldName)){
263 281
		    if (taxonName == null){
264
		        if (isFirstTimeNoNameByService){
265
		            logger.warn("oldName not checked against names in BerlinModel. Just take it as a string");
266
		            isFirstTimeNoNameByService = false;
267
		        }
268
		        Set<TaxonName> names = getOldNames(state, oldName);
269
		        if (names.isEmpty()){
270
		            logger.warn("No name found for freetext oldName '"+oldName+"'; occSourceId: " + occSourceId);
271
		            //taxonName = nameParser.parseSimpleName(oldName);
272
		            return null;
273
		        }else {
274
		            if (names.size()> 1){
275
		                TaxonName synName = getFirstSynonymName(state, names, distribution, occSourceId);
276
		                if (synName == null){
277
		                    logger.warn("There is more than one matching oldName for '"+oldName+"' but none of them is a synonym of the accepted taxon. Take arbitrary one. OccSourceId: " + occSourceId);
278
		                    return names.iterator().next();
279
		                }else{
280
		                    return synName;
281
		                }
282
		            }else{
283
                        return names.iterator().next();
284
		            }
285
		        }
282
		        return handleOldFreetextNameOnly(state, oldName, occSourceId, distribution);
286 283
		    }else if (!oldName.equals(taxonName.getNameCache())){
287 284
		        logger.warn("Old name freetext and linked name nameCache are not equal: " + oldName + "/" + taxonName.getNameCache() + "; occSourceId: " +  occSourceId);
285
		        checkSynonymie(state, oldNameFk, occSourceId, distribution, taxonName);
288 286
	            return taxonName;
289 287
		    }else{
290
		        return taxonName;
288
		        checkSynonymie(state, oldNameFk, occSourceId, distribution, taxonName);
289
	            return taxonName;
290
		    }
291
		}else{ //taxonName != null
292
		    if (taxonName != null){
293
		        checkSynonymie(state, oldNameFk, occSourceId, distribution, taxonName);
291 294
		    }
292
		}else{
293 295
		    return taxonName;
294 296
		}
295 297
	}
296 298

  
299
    /**
300
     * @param state
301
     * @param oldName
302
     * @param occSourceId
303
     * @param distribution
304
     * @return
305
     */
306
    protected TaxonName handleOldFreetextNameOnly(BerlinModelImportState state, String oldName, Integer occSourceId,
307
            Distribution distribution) {
308
        Set<TaxonName> names = getOldNames(state, oldName);
309
        if (names.isEmpty()){
310
            if (getNameIds(oldName).isEmpty()){
311
                if (state.getConfig().isLogNotMatchingOldNames()){
312
                    logger.warn("No name found for freetext oldName '"+oldName+"'; occSourceId: " + occSourceId);
313
                }
314
            }else{
315
                if (state.getConfig().isLogMatchingNotExportedOldNames()){
316
                    logger.warn("Matching name exists in BM but not in CDM. OldName: " + oldName + "; Taxon: "+getTaxonStr(distribution)+"; occSourceId: " + occSourceId);
317
                }
318
            }
319
            return null;
320
        }else {
321
            TaxonName result = names.iterator().next();
322
            boolean checkOldNameIsSynonym = state.getConfig().isCheckOldNameIsSynonym();
323
            if (names.size()> 1){
324
                TaxonName synName = getFirstSynonymName(state, names, distribution, null, occSourceId, true);
325
                if (synName == null){
326
                    //TODO should we really use a name if not available in synonymy?
327
                    String message = "There is more than one matching oldName for '"+oldName+"' but none of them is a synonym of the accepted taxon '"+getTaxonStr(distribution)+"'.";
328
                    message += (!checkOldNameIsSynonym ? "Take arbitrary one. ":"") + "OccSourceId: " + occSourceId;
329
                    logger.warn(message);
330
                    return checkOldNameIsSynonym ? null : result;
331
                }else{
332
                    return synName;
333
                }
334
            }else{
335
                //names.size() = 1
336
                if (checkOldNameIsSynonym){
337
                    TaxonName synName = getFirstSynonymName(state, names, distribution, null, occSourceId, true);
338
                    if (synName == null){
339
                        if (state.getConfig().isCheckOldNameIsSynonym()){
340
//                            logger.warn("There is a matching oldName for '"+oldName+"' but it is not a synonym/misapplication of the accepted taxon '"+getTaxonStr(distribution)+"'. OccSourceId: " + occSourceId);
341
                            return null;
342
                        }else{
343
                            return result;
344
                        }
345
                    }else if (!synName.equals(result)){
346
                        //TODO strange, how can this happen if it is the only matching?
347
                        logger.warn("There is a matching oldName for '"+oldName+"'("+result.getUuid()+") but another matching name "+synName.getUuid()+"exists in the synonymy of the accepted taxon '"+getTaxonStr(distribution)+"'. OccSourceId: " + occSourceId);
348
                        return synName;
349
                    }else{
350
                        return result;
351
                    }
352
                }else{
353
                    return result;
354
                }
355
            }
356
        }
357
    }
358

  
359
    /**
360
     * @param state
361
     * @param oldNameFk
362
     * @param occSourceId
363
     * @param distribution
364
     * @param taxonName
365
     */
366
    protected void checkSynonymie(BerlinModelImportState state, Integer oldNameFk, Integer occSourceId,
367
            Distribution distribution, TaxonName taxonName) {
368
        if (state.getConfig().isCheckOldNameIsSynonym()){
369
            Set<TaxonName> names = new HashSet<>();
370
            names.add(taxonName);
371
            boolean hasTaxon = !taxonName.getTaxonBases().isEmpty();
372
            String orphaned = hasTaxon ? "" : "Orphaned name: ";
373
            TaxonName synName = getFirstSynonymName(state, names, distribution, null, occSourceId, false);
374
            if (synName == null){
375
                Set<TaxonName> existingNames = getOldNames(state, taxonName.getNameCache());
376
                existingNames.remove(taxonName);
377
                if (existingNames.isEmpty()){
378
                    logger.warn(orphaned + "NameInSource (" + oldNameFk + " - " +taxonName.getTitleCache() + ") could not be found in synonymy of "+getTaxonStr(distribution)+". OccSourceId: " + occSourceId);
379
                }else{
380
                    TaxonName existingSynonym = getFirstSynonymName(state, existingNames, distribution, null, occSourceId, false);
381
                    if (existingSynonym != null){
382
                        boolean isExact = CdmUtils.nullSafeEqual(existingSynonym.getTitleCache(),taxonName.getTitleCache());
383
                        String exact = isExact ? EXACT : "";
384
                        logger.warn(exact + orphaned + "A similar name ("+existingSynonym.getUuid()+") can be found in synonymy but is not the nameInSource (" + oldNameFk + " - " +taxonName.getTitleCache() + "); Taxon: "+getTaxonStr(distribution)+". OccSourceId: " + occSourceId);
385
                    }else{
386
                        TaxonName existingMisapplication = getFirstMisapplication(state, existingNames, distribution, occSourceId);
387
                        if (existingMisapplication != null){
388
                            boolean isExact = CdmUtils.nullSafeEqual(existingMisapplication.getTitleCache(),taxonName.getTitleCache());
389
                            String exact = isExact ? EXACT : "";
390

  
391
                            logger.warn(exact + orphaned + "A similar misapplied name ("+existingMisapplication.getUuid()+") can be found in misapplications but not is not the nameInSource (" + oldNameFk + " - " +taxonName.getTitleCache() + "); Taxon: "+getTaxonStr(distribution)+". OccSourceId: " + occSourceId);
392
                        }
393
                    }
394
                }
395
            }
396
        }
397
    }
398

  
297 399
	/**
298 400
     * @param state
299 401
     * @param names
402
	 * @param taxon
300 403
     * @param taxon
301 404
     * @return
302 405
     */
303
    private TaxonName getFirstSynonymName(BerlinModelImportState state, Set<TaxonName> names, Distribution distribution, Integer occSourceId) {
304
        Taxon taxon = CdmBase.deproxy(distribution.getInDescription(), TaxonDescription.class).getTaxon();
305
        Set<TaxonName> synonyms = taxon.getSynonymNames();
406
    private TaxonName getFirstSynonymName(BerlinModelImportState state, Set<TaxonName> names, Distribution distribution, Taxon taxon, Integer occSourceId, boolean includeMisapplications) {
407
        TaxonName result = null;
408
        taxon = taxon == null ? getTaxon(distribution): taxon;
409
        Set<Synonym> synonyms = taxon.getSynonyms();
410
        Set<TaxonName> synonymNames = new HashSet<>();
411

  
412
        synonymNames.add(taxon.getName());
413
        synonymNames.addAll(getOrthographicVariants(taxon));
414

  
415
        for (Synonym synonym : synonyms){
416
            synonymNames.add(synonym.getName());
417
            synonymNames.addAll(getOrthographicVariants(synonym));
418
        }
419
        for (TaxonName name : names){
420
            if (synonymNames.contains(name)){
421
                if (result != null){
422
                    logger.warn("There is more than 1 matching synonym/taxon for " + name.getNameCache() + "; occSourceId: " + occSourceId);
423
                }
424
                result = name;
425
            }
426
        }
427

  
428
        //parent
429
        if (result == null){
430
            if (taxon.getName().isInfraSpecific()){
431
                if (!taxon.getTaxonNodes().isEmpty()){
432
                    TaxonNode parent = taxon.getTaxonNodes().iterator().next().getParent();
433
                    if (parent != null && parent.getTaxon() != null){
434
                        Set<TaxonName> parentNames = new HashSet<>();
435
                        TaxonName parentName = parent.getTaxon().getName();
436
                        parentNames.add(parentName);
437
                        parentNames.addAll(getOrthographicVariants(parent.getTaxon()));
438

  
439
                        for (TaxonName name : names){
440
                            if (parentNames.contains(name)){
441
                                if (result != null){
442
                                    logger.warn("There is more than 1 matching parent for " + name.getNameCache() + "; occSourceId: " + occSourceId);
443
                                }
444
                                result = name;
445
                            }
446
                        }
447
                        if (result == null){
448
                            TaxonName parentSyn = getFirstSynonymName(state, names, distribution, parent.getTaxon(), occSourceId, includeMisapplications);
449
                            if (parentSyn != null){
450
                                result = parentSyn;
451
                            }
452
                        }
453
                    }
454
                }
455
            }
456
        }
457

  
458
        //child
459
        if (result == null){
460
            if (taxon.getName().isSpecies() || taxon.getName().isSupraSpecific()){
461
                if (!taxon.getTaxonNodes().isEmpty()){
462
                    List<TaxonNode> children = taxon.getTaxonNodes().iterator().next().getChildNodes();
463
                    Set<TaxonName> childNames = new HashSet<>();
464
                    for (TaxonNode child : children){
465
                        childNames.add(child.getTaxon().getName());
466
                        childNames.addAll(getOrthographicVariants(child.getTaxon()));
467
                    }
468
                    for (TaxonName name : names){
469
                        if (childNames.contains(name)){
470
                            if (result != null){
471
                                logger.warn("There is more than 1 matching child for " + name.getNameCache() + "; occSourceId: " + occSourceId);
472
                            }
473
                            result = name;
474
                        }
475
                    }
476
                }
477
            }
478
        }
479

  
480
        if (result == null && includeMisapplications){
481
            result = getFirstMisapplication(state, synonymNames, distribution, occSourceId);
482
        }
483

  
484
        return result;
485
    }
486

  
487
    /**
488
     * @param state
489
     * @param names
490
     * @param taxon
491
     * @return
492
     */
493
    private TaxonName getFirstMisapplication(BerlinModelImportState state, Set<TaxonName> names, Distribution distribution, Integer occSourceId) {
306 494
        TaxonName result = null;
495
        Taxon taxon = getTaxon(distribution);
496

  
497
        //MAN
498
        Set<Taxon> misappliedTaxa = taxon.getMisappliedNames(true);
499
        misappliedTaxa.addAll(taxon.getInvalidDesignations());
500
        Set<TaxonName> misappliedNames = new HashSet<>();
501
        for (Taxon misTaxon : misappliedTaxa){
502
            misappliedNames.add(misTaxon.getName());
503
            misappliedNames.addAll(getOrthographicVariants(misTaxon));
504
        }
505

  
307 506
        for (TaxonName name : names){
308
            if (synonyms.contains(name)){
507
            if (misappliedNames.contains(name)){
309 508
                if (result != null){
310
                    logger.warn("There is more than 1 matching synonym for " + name.getNameCache() + "; occSourceId: " + occSourceId);
509
                    logger.warn("There is more than 1 matching misapplied name or invalid designation for " + name.getNameCache() + "; occSourceId: " + occSourceId);
311 510
                }
312 511
                result = name;
313 512
            }
......
315 514
        return result;
316 515
    }
317 516

  
517
    /**
518
     * @param taxon
519
     * @return
520
     */
521
    protected Set<TaxonName> getOrthographicVariants(TaxonBase<?> taxonBase) {
522
        Set<TaxonName> result = taxonBase.getName().getRelatedNames(Direction.relatedTo, NameRelationshipType.ORTHOGRAPHIC_VARIANT());
523
        result.addAll(taxonBase.getName().getRelatedNames(Direction.relatedTo, NameRelationshipType.MISSPELLING()));
524
        result.addAll(taxonBase.getName().getRelatedNames(Direction.relatedTo, NameRelationshipType.ORIGINAL_SPELLING()));
525
        return result;
526
    }
527

  
528
    /**
529
     * @param distribution
530
     * @return
531
     */
532
    protected String getTaxonStr(Distribution distribution) {
533
        Taxon taxon = CdmBase.deproxy(distribution.getInDescription(), TaxonDescription.class).getTaxon();
534
        String areaStr = distribution.getArea().getIdInVocabulary();
535
        return areaStr + ": " + taxon.getName().getTitleCache();
536
    }
537

  
538
    protected Taxon getTaxon(Distribution distribution) {
539
        Taxon taxon = CdmBase.deproxy(distribution.getInDescription(), TaxonDescription.class).getTaxon();
540
        return taxon;
541
    }
542

  
318 543
    /**
319 544
     * @param state
320 545
     * @param oldName
321 546
     * @return
322 547
     */
323 548
    private Set<TaxonName> getOldNames(BerlinModelImportState state, String oldName) {
324
        Set<Integer> nameIds = nameCache2NameIdMap.get(oldName);
325
        Set<TaxonName> names = new HashSet<>(nameIds.size());
549
        Set<TaxonName> names = new HashSet<>();
550
        Set<Integer> nameIds = getNameIds(oldName);
326 551
        for (Integer id : nameIds){
327 552
            TaxonName name = (TaxonName)state.getRelatedObject(BerlinModelTaxonNameImport.NAMESPACE, String.valueOf(id));
328
            names.add(name);
553
            if (name != null){
554
                names.add(name);
555
            }
329 556
        }
330 557
        return names;
331 558
    }
332 559

  
560
    /**
561
     * @param oldName
562
     * @return
563
     */
564
    private Set<Integer> getNameIds(String oldName) {
565
        Set<Integer> result = nameCache2NameIdMap.get(oldName);
566
        return result == null ? new HashSet<>(): result;
567
    }
568

  
333 569
    /**
334 570
	 * Creates a map which maps source numbers on references
335 571
	 * @param state
......
367 603
     */
368 604
    private Map<String, Set<Integer>> makeNameCache2NameIdMap(BerlinModelImportState state) throws SQLException {
369 605
        Map<String, Set<Integer>> result = new HashMap<>();
606
        try {
607

  
608
            Source source = state.getConfig().getSource();
609
            String strQuery = " SELECT NameId, nameCache " +
610
                              " FROM Name " +
611
                              " WHERE (nameCache IS NOT NULL) AND (nameCache NOT LIKE '') ";
370 612

  
371
        Source source = state.getConfig().getSource();
372
        String strQuery = " SELECT NameId, nameCache " +
373
                          " FROM Name " +
374
                          " WHERE (nameCache IS NOT NULL) AND (nameCache NOT LIKE '') ";
375

  
376
        ResultSet rs = source.getResultSet(strQuery) ;
377
        while (rs.next()){
378
            int nameId = rs.getInt("NameId");
379
            String nameCache = rs.getString("nameCache");
380
            if (isNotBlank(nameCache)){
381
                nameCache = nameCache.trim();
382
                Set<Integer> set = result.get(nameCache);
383
                if (set == null){
384
                    set = new HashSet<>();
385
                    result.put(nameCache, set);
613
            ResultSet rs = source.getResultSet(strQuery) ;
614
            while (rs.next()){
615
                int nameId = rs.getInt("NameId");
616
                String nameCache = rs.getString("nameCache");
617
                if (isNotBlank(nameCache)){
618
                    nameCache = nameCache.trim();
619
                    Set<Integer> set = result.get(nameCache);
620
                    if (set == null){
621
                        set = new HashSet<>();
622
                        result.put(nameCache, set);
623
                    }
624
                    set.add(nameId);
386 625
                }
387
                set.add(nameId);
388 626
            }
627
        } catch (Exception e) {
628
            e.printStackTrace();
629
            logger.error("Exception in makeNameCache2NameIdMap" + e.getMessage());
389 630
        }
390 631
        return result;
391 632
    }
392 633

  
393

  
394

  
395 634
	@Override
396 635
	protected boolean doCheck(BerlinModelImportState state){
397 636
		IOValidator<BerlinModelImportState> validator = new BerlinModelOccurrenceSourceImportValidator();

Also available in: Unified diff