Project

General

Profile

« Previous | Next » 

Revision d5bca6b4

Added by Andreas Müller over 2 years ago

ref #9889 add direct matching on persistence to deduplication helper and replace for reference.author and .inReference

View differences:

cdmlib-io/src/main/java/eu/etaxonomy/cdm/io/common/utils/ImportDeduplicationHelper.java
32 32
import eu.etaxonomy.cdm.model.agent.TeamOrPersonBase;
33 33
import eu.etaxonomy.cdm.model.common.CdmBase;
34 34
import eu.etaxonomy.cdm.model.common.ICdmBase;
35
import eu.etaxonomy.cdm.model.common.IdentifiableEntity;
35 36
import eu.etaxonomy.cdm.model.media.Rights;
36 37
import eu.etaxonomy.cdm.model.media.RightsType;
37 38
import eu.etaxonomy.cdm.model.name.HybridRelationship;
......
42 43
import eu.etaxonomy.cdm.model.reference.Reference;
43 44
import eu.etaxonomy.cdm.strategy.match.DefaultMatchStrategy;
44 45
import eu.etaxonomy.cdm.strategy.match.IMatchStrategyEqual;
46
import eu.etaxonomy.cdm.strategy.match.IMatchable;
45 47
import eu.etaxonomy.cdm.strategy.match.MatchException;
46 48
import eu.etaxonomy.cdm.strategy.match.MatchMode;
47 49

  
......
49 51
 * Helper class for deduplicating authors, references, names, etc.
50 52
 * during import.
51 53
 *
54
 * Note 2021: Was originally used as a fast deduplication tool for command-line imports
55
 * into empty databases. Currently it is being transformed into a deduplication tool that
56
 * can be used during application-based imports.
57
 *
52 58
 * @author a.mueller
53 59
 * @since 11.02.2017
54 60
 */
......
58 64

  
59 65
    private ICdmRepository repository;
60 66

  
67
    //for possible future use
68
    @SuppressWarnings("unused")
61 69
    private ImportStateBase<?,?> state;
62 70

  
63
    boolean referenceMapIsInitialized = false;
64
    boolean nameMapIsInitialized = false;
65
    boolean agentMapIsInitialized = false;
66
    boolean copyrightMapIsInitialized = false;
67
    boolean collectionMapIsInitialized = false;
71
    public static final int NEVER_USE_MAP = 0;
72
    public static final int ALWAYS_USE_MAP = -1;
73
    //Should deduplication use maps indexing the full database content? If yes, up to which maximum number of records?
74
    //If more records exist, deduplication is done on the fly.
75
    //0 = never use map
76
    //-1 = always use map
77
    private int maxCountFullLoad = ALWAYS_USE_MAP;
78
    public int getMaxCountFullLoad() {
79
        return maxCountFullLoad;
80
    }
81
    public void setMaxCountFullLoad(int maxCountFullLoad) {
82
        this.maxCountFullLoad = maxCountFullLoad;
83
    }
84

  
85
    private enum Status{
86
        NOT_INIT,
87
        USE_MAP,
88
        USE_REPO;
89
    }
90

  
91
    private class DedupInfo<S extends IdentifiableEntity>{
92
        Class<S> clazz;
93
        IMatchStrategyEqual matcher;
94
        Map<String, Set<S>> map = new HashMap<>();
95
        Status status = Status.NOT_INIT;
96

  
97
        @SuppressWarnings("unchecked")
98
        private DedupInfo(Class<S> clazz, DedupMap dedupMap){
99
            this.clazz = clazz;
100
            if (IMatchable.class.isAssignableFrom(clazz)) {
101
                matcher = DefaultMatchStrategy.NewInstance((Class<IMatchable>)clazz);
102
            }
103
            dedupMap.put(clazz, this);
104
        }
105
        @Override
106
        public String toString() {
107
            return clazz.getSimpleName() + ":" + status.name()+":mapsize=" + map.size()+":"+ (matcher == null?"without":"with") + " matcher";
108
        }
109
    }
110

  
111
    private class DedupMap<T extends IdentifiableEntity> extends HashMap<Class<T>, DedupInfo<T>>{
112
        private static final long serialVersionUID = 3757206594833330646L;
113
    }
114
    private DedupMap<? extends IdentifiableEntity> dedupMap = new DedupMap<>();
68 115

  
116
    private DedupInfo<Reference> referenceDedupInfo = new DedupInfo<>(Reference.class, dedupMap);
117
    private DedupInfo<Person> personDedupInfo = new DedupInfo<>(Person.class, dedupMap);
118
    private DedupInfo<Team> teamDedupInfo = new DedupInfo<>(Team.class, dedupMap);
119
    private DedupInfo<TaxonName> nameDedupInfo = new DedupInfo<>(TaxonName.class, dedupMap);
69 120

  
70
    private Map<String, Set<Reference>> refMap = new HashMap<>();
71
    private Map<String, Set<Team>> teamMap = new HashMap<>();
72
    private Map<String, Set<Person>> personMap = new HashMap<>();
73
    private Map<String, Institution> institutionMap = new HashMap<>();
121

  
122
    private Status institutionStatus = Status.NOT_INIT;
123
    private Status copyrightStatus = Status.NOT_INIT;
124
    private Status collectionStatus = Status.NOT_INIT;
125

  
126
    private Map<String, Set<Institution>> institutionMap = new HashMap<>();
74 127
    //using titleCache
75
    private Map<String, Set<INonViralName>> nameMap = new HashMap<>();
76 128
    private Map<String, Set<Rights>> copyrightMap = new HashMap<>();
77 129
    private Map<String, Set<Collection>> collectionMap = new HashMap<>();
78 130

  
......
80 132
     * Clears all internal maps.
81 133
     */
82 134
    public void reset() {
83
        refMap.clear();
84
        teamMap.clear();
85
        personMap.clear();
135
        dedupMap.values().forEach(di->di.map.clear());
86 136
        institutionMap.clear();
87
        nameMap.clear();
88 137
        copyrightMap.clear();
89 138
        collectionMap.clear();
90 139
    }
91 140

  
92
    private IMatchStrategyEqual referenceMatcher = DefaultMatchStrategy.NewInstance(Reference.class);
93 141
//    private IMatchStrategy collectionMatcher = DefaultMatchStrategy.NewInstance(Collection.class);
94
    private IMatchStrategyEqual nameMatcher = DefaultMatchStrategy.NewInstance(TaxonName.class);
95
    private IMatchStrategyEqual personMatcher = DefaultMatchStrategy.NewInstance(Person.class);
96
    private IMatchStrategyEqual teamMatcher = DefaultMatchStrategy.NewInstance(Team.class);
97

  
98 142

  
99 143
 // ************************** FACTORY *******************************/
100 144

  
101
     /**
102
      * @param repository
103
      * @param state
104
      * @return
105
      */
106 145
     public static <STATE extends ImportStateBase<?,?>> ImportDeduplicationHelper NewInstance(ICdmRepository repository, STATE state){
107 146
         return new ImportDeduplicationHelper(repository, state);
108 147
     }
......
119 158
         }
120 159
         this.state = state;
121 160
         try {
122
             referenceMatcher.setMatchMode("title", MatchMode.EQUAL);
123
             teamMatcher.setMatchMode("nomenclaturalTitleCache", MatchMode.EQUAL_OR_SECOND_NULL);
161
             dedupMap.get(Reference.class).matcher.setMatchMode("title", MatchMode.EQUAL);
162
             dedupMap.get(Team.class).matcher.setMatchMode("nomenclaturalTitleCache", MatchMode.EQUAL_OR_SECOND_NULL);
124 163
         } catch (MatchException e) {
125 164
             throw new RuntimeException(e);  //should not happen
126 165
         }
......
130 169
        restartSession(repository, null);
131 170
    }
132 171

  
172
    /**
173
     * Clears all internal maps and loads them with the same data as before but in the current session.
174
     */
133 175
    public void restartSession(ICdmRepository repository, ImportResult importResult){
134 176
        if (repository == null){
135 177
            return;
136 178
        }
137
        refMap = refreshSetMap(refMap, (IService)repository.getReferenceService(), importResult);
138
        personMap = refreshSetMap(personMap, (IService)repository.getAgentService(), importResult);
139
        teamMap = refreshSetMap(teamMap, (IService)repository.getAgentService(), importResult);
140
        institutionMap = refreshMap(institutionMap, (IService)repository.getAgentService(), importResult);
179
        referenceDedupInfo.map = refreshSetMap(referenceDedupInfo.map, (IService)repository.getReferenceService(), importResult);
180
        personDedupInfo.map = refreshSetMap(personDedupInfo.map, (IService)repository.getAgentService(), importResult);
181
        teamDedupInfo.map = refreshSetMap(teamDedupInfo.map, (IService)repository.getAgentService(), importResult);
182
        institutionMap = refreshSetMap(institutionMap, (IService)repository.getAgentService(), importResult);
141 183

  
142
        nameMap = refreshSetMap(nameMap, (IService)repository.getNameService(), importResult);
184
        nameDedupInfo.map = refreshSetMap(nameDedupInfo.map, (IService)repository.getNameService(), importResult);
143 185
        collectionMap = refreshSetMap(collectionMap, (IService)repository.getCollectionService(), importResult);
144 186
        //TODO copyright ?
145 187
    }
146 188

  
189
    //maybe this was used for Institution before
147 190
    private <T extends ICdmBase> Map<String, T> refreshMap(Map<String, T> oldMap,
148 191
            IService<T> service, ImportResult importResult) {
192

  
149 193
        Map<String, T> newMap = new HashMap<>();
150 194
        for (String key : oldMap.keySet()){
151 195
            T old = oldMap.get(key);
......
210 254

  
211 255
//************************ PUTTER / GETTER *****************************/
212 256

  
213
    //REFERENCES
214
    private void putReference(String title, Reference ref){
215
        Set<Reference> refs = refMap.get(title);
216
        if (refs == null){
217
            refs = new HashSet<>();
218
            refMap.put(title, refs);
257
    //ENTITY
258
    private <S extends IdentifiableEntity<?>> void putEntity(String title, S entity, Map<String,Set<S>> map){
259
        Set<S> entitySet = map.get(title);
260
        if (entitySet == null){
261
            entitySet = new HashSet<>();
262
            map.put(title, entitySet);
219 263
        }
220
        refs.add(CdmBase.deproxy(ref));
264
        entitySet.add(CdmBase.deproxy(entity));
221 265
    }
222
    private Set<Reference> getReferences(String title){
223
        return refMap.get(title);
266

  
267
    private <S extends IdentifiableEntity> Set<S> getEntityByTitle(String title, DedupInfo<S> dedupInfo){
268
        return dedupInfo.map.get(title);
224 269
    }
225 270

  
226
    private Optional<Reference> getMatchingReference(Reference newReference){
227
        Predicate<Reference> matchFilter = reference ->{
271
    private <S extends IdentifiableEntity> Optional<S> getMatchingEntity(S entityOrig, DedupInfo<S> dedupInfo){
272
        S entity = CdmBase.deproxy(entityOrig);
273
        Predicate<S> matchFilter = reference ->{
228 274
            try {
229
                return referenceMatcher.invoke(reference, newReference).isSuccessful();
275
                return dedupInfo.matcher.invoke((IMatchable)reference, (IMatchable)entity).isSuccessful();
230 276
            } catch (MatchException e) {
231 277
                throw new RuntimeException(e);
232 278
            }
233 279
        };
234
        return Optional.ofNullable(getReferences(newReference.getTitleCache()))
280
        Optional<S> result = Optional.ofNullable(getEntityByTitle(entity.getTitleCache(), dedupInfo))
235 281
                .orElse(new HashSet<>())
236 282
                .stream()
237 283
                .filter(matchFilter)
238 284
                .findAny();
285
        if (result.isPresent() || dedupInfo.status == Status.USE_MAP  || repository == null){
286
            return result;
287
        }else {
288
            try {
289
                return (Optional<S>)repository.getCommonService().findMatching((IMatchable)entity, dedupInfo.matcher).stream().findFirst();
290
            } catch (MatchException e) {
291
                throw new RuntimeException(e);
292
            }
293
        }
239 294
    }
240 295

  
241 296
    // AGENTS
242 297
    private void putAgentBase(String title, AgentBase<?> agent){
243 298
        if (agent.isInstanceOf(Person.class) ){
244
            putAgent(title, CdmBase.deproxy(agent, Person.class), personMap);
299
            putEntity(title, CdmBase.deproxy(agent, Person.class), personDedupInfo.map);
245 300
        }else if (agent.isInstanceOf(Team.class)){
246
            putAgent(title, CdmBase.deproxy(agent, Team.class), teamMap);
301
            putEntity(title, CdmBase.deproxy(agent, Team.class), teamDedupInfo.map);
247 302
        }else{
248
//            putAgent(title, CdmBase.deproxy(agent, Institution.class), institutionMap);
249
            institutionMap.put(title, CdmBase.deproxy(agent, Institution.class));
250
        }
251
    }
252
    //put agent
253
    private <T extends AgentBase> void putAgent(String title, T agent, Map<String, Set<T>> map){
254
        Set<T> items = map.get(title);
255
        if (items == null){
256
            items = new HashSet<>();
257
            map.put(title, items);
303
            putEntity(title, CdmBase.deproxy(agent, Institution.class), institutionMap);
258 304
        }
259
        items.add(CdmBase.deproxy(agent));
260
    }
261

  
262
    private Optional<Person> getMatchingPerson(Person newPerson){
263
        Person newPersonDeproxy = CdmBase.deproxy(newPerson);
264
        Predicate<Person> matchFilter = (person) ->{
265
            try {
266
                return personMatcher.invoke(person, newPersonDeproxy).isSuccessful();
267
            } catch (MatchException e) {
268
                throw new RuntimeException(e);
269
            }
270
        };
271

  
272
        return Optional.ofNullable(getPersons(newPerson.getTitleCache()))
273
                .orElse(new HashSet<>())
274
                .stream()
275
                .filter(matchFilter)
276
                .findAny();
277 305
    }
278 306

  
279 307
    private <T extends TeamOrPersonBase<?>> T getTeamOrPerson(T agent){
280 308
        T result = agent;
281 309
        if (agent.isInstanceOf(Person.class)){
282
            result = (T)getMatchingPerson(CdmBase.deproxy(agent, Person.class)).orElse(null) ; // personMap.get(title);
310
            result = (T)getMatchingEntity(CdmBase.deproxy(agent, Person.class), personDedupInfo).orElse(null) ; // personMap.get(title);
283 311
        }else if (agent.isInstanceOf(Team.class)) {
284
            result = (T)getMatchingTeam(CdmBase.deproxy(agent, Team.class)).orElse(null); // teamMap.get(title);
312
            result = (T)getMatchingEntity(CdmBase.deproxy(agent, Team.class), teamDedupInfo).orElse(null); // teamMap.get(title);
285 313
        }
286 314
        return result;
287 315
    }
288 316

  
289
    private Optional<Team> getMatchingTeam(Team newTeam){
290
        Team newTeamDeproxy = CdmBase.deproxy(newTeam);
291
        Predicate<Team> matchFilter = (team) ->{
292
            try {
293
                return teamMatcher.invoke(team, newTeamDeproxy).isSuccessful();
294
            } catch (MatchException e) {
295
                throw new RuntimeException(e);
296
            }
297
        };
298
        //TODO better adapt matching strategy
299
//        newTeam.getNomenclaturalTitle();
300
        return Optional.ofNullable(getTeams(newTeam.getTitleCache()))
301
                .orElse(new HashSet<>())
302
                .stream()
303
                .filter(matchFilter)
304
                .findAny();
305
    }
306
    private Set<Person> getPersons(String title){
307
        return personMap.get(title);
308
    }
309
    private Set<Team> getTeams(String title){
310
        return teamMap.get(title);
311
    }
312

  
313
    //NAMES
314
    private void putName(String title, INonViralName name){
315
        Set<INonViralName> names = nameMap.get(title);
316
        if (names == null){
317
            names = new HashSet<>();
318
            nameMap.put(title, names);
319
        }
320
        names.add(CdmBase.deproxy(name));
321
    }
322
    private Set<INonViralName> getNames(String title){
323
        return nameMap.get(title);
324
    }
325

  
326
    private Optional<INonViralName> getMatchingName(INonViralName existing){
327
        Predicate<INonViralName> matchFilter = name ->{
328
            try {
329
                return nameMatcher.invoke(name, existing).isSuccessful();
330
            } catch (MatchException e) {
331
                throw new RuntimeException(e);
332
            }
333
        };
334
        return Optional.ofNullable(getNames(existing.getTitleCache()))
335
                .orElse(new HashSet<>())
336
                .stream()
337
                .filter(matchFilter)
338
                .findAny();
339
    }
340

  
341 317
    //COLLECTIONS
342
    private void putCollection(String title, Collection collection){
343
        Set<Collection> collections = collectionMap.get(title);
344
        if (collections == null){
345
            collections = new HashSet<>();
346
            collectionMap.put(title, collections);
347
        }
348
        collections.add(CdmBase.deproxy(collection));
349
    }
350

  
351 318
    private Set<Collection> getCollections(String title){
352 319
        return collectionMap.get(title);
353 320
    }
......
405 372
            nomRef.setAuthorship(getExistingAuthor(refAuthor));
406 373

  
407 374
            Reference existingRef = getExistingReference((Reference)nomRef);
375
            //TODO AM: why do we need to check null here (we don't do this for authors, maybe because it is an original source?)
408 376
            if (existingRef != null){
409 377
                name.setNomenclaturalReference(existingRef);
410 378
            }
411 379
        }
412 380
    }
413 381

  
382
    public void replaceReferenceRelatedData(Reference ref) {
383

  
384
        TeamOrPersonBase<?> author = ref.getAuthorship();
385
        ref.setAuthorship(getExistingAuthor(author));
386

  
387
        ref.setInReference(getExistingReference(ref.getInReference()));
388
    }
389

  
414 390
    public <T extends TeamOrPersonBase<?>> T getExistingAuthor(T author) {
415 391
        if (author == null){
416 392
            return null;
417 393
        }else{
418
            initAgentMap();
394
            //TODO
395
            init(personDedupInfo);
396
            init(teamDedupInfo);
419 397
            initAuthorTitleCaches(author);
420 398
            T result = getTeamOrPerson(author);
421 399
            if (result == null){
......
460 438
        } else if (agent.isInstanceOf(TeamOrPersonBase.class)){
461 439
            return getExistingAuthor(CdmBase.deproxy(agent, TeamOrPersonBase.class));
462 440
        }else{
463
            initAgentMap();
464
            Institution result = institutionMap.get(agent.getTitleCache());
465
            if (result == null){
466
                putAgentBase(agent.getTitleCache(), agent);
467
                result = CdmBase.deproxy(agent, Institution.class);
468
            }
469
            return result;
441
            throw new RuntimeException("Institution matching not yet implemented");
442
//            initInstitutionMap();
443
//            Set<Institution> result = institutionMap.get(agent.getTitleCache());
444
//            if (result == null){
445
//                result = putEntity(agent.getTitleCache(), CdmBase.deproxy(agent, Institution.class), institutionMap);
446
//            }
447
//            return result;
470 448
        }
471 449
    }
472 450

  
473
    @SuppressWarnings("rawtypes")
474
    private void initAgentMap() {
475
        if (!agentMapIsInitialized && repository != null){
476
            List<String> propertyPaths = Arrays.asList("");
477
            List<AgentBase> existingAgents = repository.getAgentService().list(null, null, null, null, propertyPaths);
478
            for (AgentBase agent : existingAgents){
479
                putAgentBase(agent.getTitleCache(), CdmBase.deproxy(agent));
451
    private <S extends IdentifiableEntity<?>> void init(DedupInfo<S> dedupInfo) {
452
        dedupInfo.status = init(dedupInfo.clazz, dedupInfo.status, dedupInfo.map);
453
    }
454

  
455
    private <S extends IdentifiableEntity<?>> Status init(Class<S> clazz, Status status, Map<String,Set<S>> map) {
456

  
457
        if (status == Status.NOT_INIT && repository != null){
458
            if (maxCountFullLoad != NEVER_USE_MAP){
459
                long nExisting = -2;
460
                if (maxCountFullLoad != ALWAYS_USE_MAP){
461
                    nExisting = repository.getCommonService().count(clazz);
462
                }
463
                if (nExisting <= maxCountFullLoad ){
464
                    List<String> propertyPaths = Arrays.asList("");
465
                    List<S> existingEntities = repository.getCommonService().list(clazz, null, null, null, propertyPaths);
466
                    for (S ref : existingEntities){
467
                        putEntity(ref.getTitleCache(), ref, map);
468
                    }
469
                    return Status.USE_MAP;
470
                }else{
471
                    return Status.USE_REPO;
472
                }
473
            }else{
474
                return Status.USE_REPO;
480 475
            }
481
            agentMapIsInitialized = true;
482 476
        }
477
        return status;
483 478
    }
484 479

  
485 480
    private void handleTeam(Team team) {
486 481
        List<Person> members = team.getTeamMembers();
487 482
        for (int i =0; i< members.size(); i++){
488 483
            Person person = CdmBase.deproxy(members.get(i));
489
            Person existingPerson = getMatchingPerson(person).orElse(null);
484
            Person existingPerson = getMatchingEntity(person, personDedupInfo).orElse(null);
490 485
            if (existingPerson != null){
491 486
                members.set(i, existingPerson);
492 487
            }else{
......
503 498
            Collection result = getMatchingCollections(collection).orElse(null);
504 499
            if (result == null){
505 500
                result = collection;
506
                putCollection(result.getTitleCache(), result);
501
                putEntity(result.getTitleCache(), result, collectionMap);
507 502
            }else{
508 503
                if(logger.isDebugEnabled()) {
509 504
                    logger.debug("Matches");
......
514 509
    }
515 510

  
516 511
    private void initCollectionMap() {
517
        if (!collectionMapIsInitialized && repository != null){
518
            List<String> propertyPaths = Arrays.asList("");
519
            List<Collection> existingCollections = repository.getCollectionService().list(null, null, null, null, propertyPaths);
520
            for (Collection collection : existingCollections){
521
                putCollection(collection.getTitleCache(), collection);
522
            }
523
            collectionMapIsInitialized = true;
524
        }
512
        collectionStatus = init(Collection.class, collectionStatus, collectionMap);
525 513
    }
526 514

  
527 515
   public Reference getExistingReference(Reference ref) {
528 516
       if (ref == null){
529 517
           return null;
530 518
       }else{
531
           initRerenceMap();
519
           init(referenceDedupInfo);
532 520
           initReferenceCaches(ref);
533
           Reference result = getMatchingReference(ref).orElse(null);
521
           Reference result = getMatchingEntity(ref, referenceDedupInfo).orElse(null);
534 522
           if (result == null){
535 523
               result = ref;
536 524
               Reference inRef = result.getInReference();
537 525
               if (inRef != null){
538 526
                   result.setInReference(getExistingReference(result.getInReference()));
539 527
               }
540
               putReference(result.getTitleCache(), result);
528
               putEntity(result.getTitleCache(), result, referenceDedupInfo.map);
541 529
           }else{
542
               if(logger.isDebugEnabled()) {
543
                   logger.debug("Matches");
544
                }
530
               if(logger.isDebugEnabled()) {logger.debug("Matches");}
545 531
           }
546 532
           return result;
547 533
       }
548 534
   }
549 535

  
550
   private void initRerenceMap() {
551
       if (!referenceMapIsInitialized && repository != null){
552
           List<String> propertyPaths = Arrays.asList("");
553
           List<Reference> existingReferences = repository.getReferenceService().list(null, null, null, null, propertyPaths);
554
           for (Reference ref : existingReferences){
555
               putReference(ref.getTitleCache(), ref);
556
           }
557
           referenceMapIsInitialized = true;
558
       }
559
   }
560

  
561
   public <NAME extends INonViralName> NAME getExistingName(NAME name) {
536
   public TaxonName getExistingName(TaxonName name) {
562 537
       if (name == null){
563 538
           return null;
564 539
       }else{
565
           initNameMap();
566
           @SuppressWarnings("unchecked")
567
           NAME result = (NAME)getMatchingName(name).orElse(null);
540
           init(nameDedupInfo);
541
           TaxonName result = getMatchingEntity(name, nameDedupInfo).orElse(null);
568 542
           if (result == null){
569 543
               result = name;
570 544
               Set<HybridRelationship> parentRelations = result.getHybridChildRelations();
571 545
               for (HybridRelationship rel : parentRelations){
572
                   INonViralName parent = rel.getParentName();
546
                   TaxonName parent = rel.getParentName();
573 547
                   if (parent != null){
574 548
                       rel.setParentName(getExistingName(parent));
575 549
                   }
576 550
               }
577
               putName(result.getTitleCache(), result);
551
               putEntity(result.getTitleCache(), result, nameDedupInfo.map);
578 552
           }else{
579 553
               if(logger.isDebugEnabled()) {
580 554
                   logger.debug("Matches");
......
584 558
       }
585 559
   }
586 560

  
587
   private void initNameMap() {
588
       if (!nameMapIsInitialized && repository != null){
589
           List<String> propertyPaths = Arrays.asList("");
590
           List<TaxonName> existingNames = repository.getNameService().list(null, null, null, null, propertyPaths);
591
           for (TaxonName name : existingNames){
592
               putName(name.getTitleCache(), name);
593
           }
594
          nameMapIsInitialized = true;
595
       }
596
   }
597

  
598 561
   public Rights getExistingCopyright(Rights right) {
599 562
       if (right == null || !RightsType.COPYRIGHT().equals(right.getType())){
600 563
           return null;
......
614 577
   }
615 578

  
616 579
    private void initCopyrightMap() {
617
        if (!copyrightMapIsInitialized && repository != null){
580
        if (copyrightStatus == Status.NOT_INIT && repository != null){
618 581
            List<String> propertyPaths = Arrays.asList("");
619 582
            List<Rights> existingRights = repository.getRightsService().list(null, null, null, null, propertyPaths);
620 583
            for (Rights right : existingRights){
......
622 585
                    putCopyright(makeCopyrightKey(right), right);
623 586
                }
624 587
            }
625
            copyrightMapIsInitialized = true;
588
            copyrightStatus = Status.USE_MAP;
626 589
        }
627 590
    }
628 591

  
cdmlib-io/src/main/java/eu/etaxonomy/cdm/io/reference/ris/in/RisReferenceImport.java
24 24
import eu.etaxonomy.cdm.common.DOI;
25 25
import eu.etaxonomy.cdm.common.URI;
26 26
import eu.etaxonomy.cdm.io.common.CdmImportBase;
27
import eu.etaxonomy.cdm.io.common.utils.ImportDeduplicationHelper;
27 28
import eu.etaxonomy.cdm.io.reference.ris.in.RisRecordReader.RisValue;
28 29
import eu.etaxonomy.cdm.model.agent.Person;
29 30
import eu.etaxonomy.cdm.model.agent.Team;
......
69 70
                try {
70 71
                    location = recordLocation(state, next);
71 72
                    ref = handleSingleReference(state, next);
72
                    referencesToSave.add(ref);
73
                    if (ref.getInReference() != null){
74
                        referencesToSave.add(ref.getInReference());
73

  
74
                    Reference existingRef = state.getDeduplicationHelper().getExistingReference(ref);
75
                    if (existingRef == ref){ //reference does not yet exist so the identical reference has been returned
76
                        state.getDeduplicationHelper().replaceReferenceRelatedData(ref);
77
                        referencesToSave.add(ref);
78
                        if (ref.getInReference() != null){
79
                            referencesToSave.add(ref.getInReference());
80
                        }
81
                    }else{
82
                        //merge ?
75 83
                    }
76 84
                } catch (Exception e) {
77 85
                    String message = "Unexpected exception during RIS Reference Import";
......
510 518
        return cdmType;
511 519
    }
512 520

  
521
    @Override
522
    public ImportDeduplicationHelper createDeduplicationHelper(RisReferenceImportState state){
523
        ImportDeduplicationHelper result = super.createDeduplicationHelper(state);
524
        result.setMaxCountFullLoad(state.getConfig().getDeduplicationMaxCountForFullLoad());
525
        return result;
526
    }
527

  
513 528
    @Override
514 529
    protected boolean doCheck(RisReferenceImportState state) {
515 530
        return true;
cdmlib-io/src/main/java/eu/etaxonomy/cdm/io/reference/ris/in/RisReferenceImportConfigurator.java
32 32
    private static final long serialVersionUID = -5982826645441621962L;
33 33
//    private static IInputTransformer defaultTransformer = null;
34 34

  
35
    private int deduplicationMaxCountForFullLoad = 200;
36

  
37
//********************************** FACTORY ***********************************/
38

  
35 39
    public static RisReferenceImportConfigurator NewInstance(URI uri, ICdmDataSource cdm) {
36 40
        RisReferenceImportConfigurator result = new RisReferenceImportConfigurator(uri, cdm);
37 41
        return result;
......
41 45
        InputStream stream = url.openStream();
42 46
        InputStreamReader reader = new InputStreamReader(stream, "UTF8");
43 47

  
44
        RisReferenceImportConfigurator result = new RisReferenceImportConfigurator();
48
        RisReferenceImportConfigurator result = new RisReferenceImportConfigurator(null, cdm);
45 49
        result.setStream(IOUtils.toByteArray(reader, Charset.defaultCharset()));
46 50
        return result;
47 51
    }
......
52 56
        return result;
53 57
    }
54 58

  
55
    protected RisReferenceImportConfigurator() {
56
        super(null,null);
57
    }
59
//************************ CONSTRUCTOR ****************************************/
58 60

  
59
    protected RisReferenceImportConfigurator(URI uri, ICdmDataSource cdm) {
61
    private RisReferenceImportConfigurator(URI uri, ICdmDataSource cdm) {
60 62
        super(uri, cdm, null);
61 63
    }
62 64

  
65
// ************************* GETTER / SETTER *************************************/
66

  
67
    public int getDeduplicationMaxCountForFullLoad() {
68
        return deduplicationMaxCountForFullLoad;
69
    }
70
    public void setDeduplicationMaxCountForFullLoad(int deduplicationMaxCountForFullLoad) {
71
        this.deduplicationMaxCountForFullLoad = deduplicationMaxCountForFullLoad;
72
    }
73

  
74
//********************** METHODS ********************************************/
75

  
76
    @SuppressWarnings("unchecked")
63 77
    @Override
64 78
    public RisReferenceImportState getNewState() {
65 79
        return new RisReferenceImportState(this);
......
91 105
        return true;
92 106
    }
93 107

  
94

  
95
}
108
}
cdmlib-io/src/test/java/eu/etaxonomy/cdm/io/referenceris/in/RisReferenceImportTest.java
13 13
import java.io.FileNotFoundException;
14 14
import java.io.IOException;
15 15
import java.net.URL;
16
import java.util.Collections;
16 17
import java.util.List;
17 18

  
18 19
import org.junit.Assert;
......
22 23
import org.unitils.spring.annotation.SpringBeanByName;
23 24
import org.unitils.spring.annotation.SpringBeanByType;
24 25

  
26
import eu.etaxonomy.cdm.api.service.IAgentService;
25 27
import eu.etaxonomy.cdm.api.service.IReferenceService;
26 28
//import eu.etaxonomy.cdm.common.DOI;
27 29
import eu.etaxonomy.cdm.io.common.CdmApplicationAwareDefaultImport;
28 30
import eu.etaxonomy.cdm.io.common.ImportResult;
29 31
import eu.etaxonomy.cdm.io.reference.ris.in.RisReferenceImportConfigurator;
32
import eu.etaxonomy.cdm.model.agent.Institution;
30 33
import eu.etaxonomy.cdm.model.agent.Person;
31 34
import eu.etaxonomy.cdm.model.agent.Team;
32 35
import eu.etaxonomy.cdm.model.agent.TeamOrPersonBase;
33 36
import eu.etaxonomy.cdm.model.common.CdmBase;
37
import eu.etaxonomy.cdm.model.common.IdentifiableEntity;
34 38
import eu.etaxonomy.cdm.model.common.VerbatimTimePeriod;
35 39
import eu.etaxonomy.cdm.model.reference.Reference;
36 40
import eu.etaxonomy.cdm.model.reference.ReferenceType;
......
48 52
	@SpringBeanByType
49 53
	private IReferenceService referenceService;
50 54

  
51
	private RisReferenceImportConfigurator configurator;
52
    private RisReferenceImportConfigurator configLong;
55
	@SpringBeanByType
56
    private IAgentService agentService;
53 57

  
54 58
	@Before
55
	public void setUp() {
56
		String inputFile = "/eu/etaxonomy/cdm/io/reference/ris/in/RisReferenceImportTest-input.ris";
57

  
58
        try {
59
            URL url = this.getClass().getResource(inputFile);
60
            assertNotNull("URL for the test file '" + inputFile + "' does not exist", url);
61

  
62
            String inputFileLong = "/eu/etaxonomy/cdm/io/reference/ris/in/Acantholimon.ris";
63
            URL urlLong = this.getClass().getResource(inputFileLong);
64
            assertNotNull("URL for the test file '" + inputFileLong + "' does not exist", urlLong);
65

  
66
			configurator = RisReferenceImportConfigurator.NewInstance(url, null);
67
			configLong = RisReferenceImportConfigurator.NewInstance(urlLong, null);
68

  
69
		} catch (Exception e) {
70
			e.printStackTrace();
71
			Assert.fail();
72
		}
73
		assertNotNull("Configurator could not be created", configurator);
74
	    assertNotNull("Configurator could not be created", configLong);
75
	    assertNotNull("nameService should not be null", referenceService);
76
	}
59
	public void setUp() {}
77 60

  
78 61
//***************************** TESTS *************************************//
79 62

  
......
81 64
	@DataSet( value="/eu/etaxonomy/cdm/database/ClearDBDataSet.xml", loadStrategy=CleanSweepInsertLoadStrategy.class)
82 65
	//@Ignore
83 66
    public void testShort() {
84

  
67
	    RisReferenceImportConfigurator configurator = getConfigurator("RisReferenceImportTest-input.ris");
85 68
		ImportResult result = defaultImport.invoke(configurator);
86 69
		String report = result.createReport().toString();
87 70
		Assert.assertTrue(report.length() > 0);
......
142 125
	}
143 126

  
144 127
	@Test
145
	public void testChapter() throws IOException{
146
        String inputFileLong = "/eu/etaxonomy/cdm/io/reference/ris/in/Arias2012.ris";
147
        URL urlLong = this.getClass().getResource(inputFileLong);
148
        configurator = RisReferenceImportConfigurator.NewInstance(urlLong, null);
128
	public void testChapter() {
129

  
130
	    final RisReferenceImportConfigurator configurator = getConfigurator("Arias2012.ris");
149 131

  
150 132
        ImportResult result = defaultImport.invoke(configurator);
151 133
        String report = result.createReport().toString();
......
155 137
        Integer expected = 2;
156 138
        Assert.assertEquals(expected, result.getNewRecords(Reference.class));
157 139

  
158
        List<Reference> list = referenceService.list(Reference.class, null, null, null, null);
140
        List<Reference> referenceList = referenceService.list(Reference.class, null, null, null, null);
159 141
        Assert.assertEquals("There should be 3 references, the book-section, the book and the source reference",
160
                3, list.size());
142
                3, referenceList.size());
161 143

  
162 144
        //book section
163
        Reference bookSection = list.stream().filter(r->r.getType() == ReferenceType.BookSection).findFirst().get();
145
        Reference bookSection = referenceList.stream().filter(r->r.getType() == ReferenceType.BookSection).findFirst().get();
164 146
        //... title
165 147
        Assert.assertEquals("Cactaceae", bookSection.getTitle());
166 148
        //... author
......
182 164
        Assert.assertEquals("1-235", bookSection.getPages());
183 165

  
184 166
        //book
185
        Reference book = list.stream().filter(r->r.getType() == ReferenceType.Book).findFirst().get();
167
        Reference book = referenceList.stream().filter(r->r.getType() == ReferenceType.Book).findFirst().get();
186 168
        //... title
187 169
        Assert.assertEquals("Flora del Valle de Tehuac\u00E1n-Cuicatl\u00E1n", book.getTitle());
188 170
        Assert.assertEquals("Fasc\u00EDculo 95", book.getVolume());
......
190 172
        Assert.assertEquals("Instituto de Biolog\u00EDa, Universidad Nacional Aut\u00F3noma de M\u00E9xico", book.getPublisher());
191 173

  
192 174
        //source reference
193
        Reference sourceRef = list.stream().filter(r->r.equals(configurator.getSourceReference())).findFirst().get();
175
        Reference sourceRef = referenceList.stream().filter(r->r.equals(configurator.getSourceReference())).findFirst().get();
194 176
        Assert.assertNotNull(sourceRef);
195 177
        //TODO cont.
178

  
179
        List<Person> personList = agentService.list(Person.class, null, null, null, null);
180
        Assert.assertEquals("There should be 5 persons", 5, personList.size());
181

  
182
        List<Team> teamList = agentService.list(Team.class, null, null, null, null);
183
        Assert.assertEquals("There should be 1 team", 1, teamList.size());
184

  
185

  
186
        //test deduplication by running it again
187
        result = defaultImport.invoke(configurator);
188
        report = result.createReport().toString();
189
        Assert.assertTrue(report.contains("Reference: 0"));
190
        Assert.assertEquals(0, result.getErrors().size() + result.getExceptions().size() + result.getWarnings().size());
191
        referenceList = referenceService.list(Reference.class, null, null, null, null);
192
        Assert.assertEquals("There should still be 3 references, the book-section, the book and the source reference",
193
                3, referenceList.size());
194

  
195
        personList = agentService.list(Person.class, null, null, null, null);
196
        Assert.assertEquals("There should still be 5 persons", 5, personList.size());
197

  
198
        teamList = agentService.list(Team.class, null, null, null, null);
199
        Assert.assertEquals("There should still be 1 team", 1, teamList.size());
200

  
201
        //test deduplication by running another chapter
202
        RisReferenceImportConfigurator configurator2 = getConfigurator("Arias2012_2.ris");
203
        result = defaultImport.invoke(configurator2);
204
        report = result.createReport().toString();
205
//        Assert.assertTrue(report.contains("Reference: 0"));
206
        Assert.assertEquals(0, result.getErrors().size() + result.getExceptions().size() + result.getWarnings().size());
207
        referenceList = referenceService.list(Reference.class, null, null, null, null);
208
        Assert.assertEquals("There should be 5 references, 2 book-sections, the book and 2 source references",
209
                5, referenceList.size());
210

  
211
        personList = agentService.list(Person.class, null, null, null, null);
212
        Assert.assertEquals("There should be 6 persons now", 6, personList.size());
213

  
214
        teamList = agentService.list(Team.class, null, null, null, null);
215
        Assert.assertEquals("There should be 2 teams now", 2, teamList.size());
216

  
196 217
	}
197 218

  
198
	@Test
199
	//@Ignore
200
    public void testLongFile() {
201
        ImportResult result = defaultImport.invoke(configLong);
202
        String report = result.createReport().toString();
203
        System.out.println(report);
219
    private RisReferenceImportConfigurator getConfigurator(String fileName) {
220
        String inputFile = "/eu/etaxonomy/cdm/io/reference/ris/in/" + fileName;
221
        URL url = this.getClass().getResource(inputFile);
222
        assertNotNull("URL for the test file '" + inputFile + "' does not exist", url);
223
        try {
224
            RisReferenceImportConfigurator result = RisReferenceImportConfigurator.NewInstance(url, null);
225
            result.setDeduplicationMaxCountForFullLoad(1);
226
            return result;
227
        } catch (IOException e) {
228
            Assert.fail("IOException while creating configurator: " + e.getMessage());
229
            return null;
230
        }
231
    }
204 232

  
205
        Integer expected = 118;  //did not count yet
206
        Assert.assertEquals(expected, result.getNewRecords(Reference.class));
233
    @Test
234
    public void testLongFile() {
207 235

  
208
        List<Reference> list = referenceService.list(Reference.class, null, null, null, null);
209
//        Assert.assertEquals("There should be 119 references (still need to count them)", 119, list.size());
210
        //TODO deduplication
236
        RisReferenceImportConfigurator configurator = getConfigurator("Acantholimon.ris");
237
        ImportResult result = defaultImport.invoke(configurator);
211 238

  
212
        Reference ref58 = list.stream().filter(r->hasId(r, "58", false)).findFirst().get();
239
        @SuppressWarnings("unused")
240
        String report = result.createReport().toString();
241
//        System.out.println(report);
242

  
243
//        Integer expectedWithoutDeduplication = 118;  //did not count yet
244
        Integer expectedDeduplicated = 104;  //did not count yet
245
        Assert.assertEquals(expectedDeduplicated, result.getNewRecords(Reference.class));
246
//        System.out.println("Person: "+ result.getNewRecords(Person.class));
247
//        System.out.println("Team: "+ result.getNewRecords(Team.class));
248

  
249
        List<Reference> refList = referenceService.list(Reference.class, null, null, null, null);
250
//        Assert.assertEquals("There should be 119 references (still need to count them)", 119, refList.size());
251
        Collections.sort(refList, (r1,r2) -> r1.getTitleCache().compareTo(r2.getTitleCache()));
252
        printList(refList);
253
        List<Person> personList = agentService.list(Person.class, null, null, null, null);
254
        printList(personList);
255
        Assert.assertEquals(99, personList.size());
256
        List<Team> teamList = agentService.list(Team.class, null, null, null, null);
257
        printList(teamList);
258
        Assert.assertEquals(33, teamList.size());
259
        List<Institution> institutionList = agentService.list(Institution.class, null, null, null, null);
260
        printList(institutionList);
261
        Assert.assertEquals(0, institutionList.size());
262

  
263

  
264
        Reference ref58 = refList.stream().filter(r->hasId(r, "58", false)).findFirst().get();
213 265
        Assert.assertNotNull("", ref58);
214 266
        Assert.assertEquals((Integer)2003, ref58.getDatePublished().getStartYear());
215 267

  
216
        Reference ref53 = list.stream().filter(r->hasId(r, "53", false)).findFirst().get();
268
        Reference ref53 = refList.stream().filter(r->hasId(r, "53", false)).findFirst().get();
217 269
        Assert.assertNotNull("", ref53);
218 270
        Assert.assertEquals(ReferenceType.BookSection, ref53.getType());
219 271
        Assert.assertNotNull("", ref53.getInReference());
......
265 317
//        }
266 318
    }
267 319

  
320
    private void printList(List<? extends IdentifiableEntity<?>> list) {
321
        if (!logger.isDebugEnabled()){
322
            return;
323
        }
324
        System.out.println(list.size());
325
        Collections.sort(list, (p1,p2) -> p1.getTitleCache().compareTo(p2.getTitleCache()));
326
        list.stream().forEach(r->System.out.println(r.getTitleCache()));
327
    }
328

  
268 329
    private boolean hasId(Reference ref, String idStr, boolean getInRef) {
269 330
        if (ref.getSources().size() != 1){
270 331
            return false;
cdmlib-io/src/test/resources/eu/etaxonomy/cdm/io/reference/ris/in/Arias2012_2.ris
1
TY  - CHAP
2
A2  - Medina, L. R.
3
AU  - Arias, S.
4
AU  - Gama-Cruz, A.
5
CY  - México D. F.
6
PB  - Instituto de Biología, Universidad Nacional Autónoma de México
7
PY  - 2012
8
SP  - 236-247
9
ST  - Cactaceae2
10
T2  - Flora del Valle de Tehuacán-Cuicatlán
11
TI  - Cactaceae2
12
VL  - Fascículo 95
13
ID  - 3825
14
ER  - 

Also available in: Unified diff