Project

General

Profile

« Previous | Next » 

Revision db652f5c

Added by Andreas Müller over 5 years ago

ref #7801 and ref #3787 deduplicate reference.authorstring and reference itself

View differences:

app-import/src/main/java/eu/etaxonomy/cdm/io/berlinModel/in/BerlinModelReferenceImport.java
36 36
import java.util.Map;
37 37
import java.util.Set;
38 38
import java.util.UUID;
39
import java.util.regex.Matcher;
40
import java.util.regex.Pattern;
39 41

  
40 42
import org.apache.log4j.Logger;
41 43
import org.springframework.stereotype.Component;
......
71 73
import eu.etaxonomy.cdm.model.reference.IPrintSeries;
72 74
import eu.etaxonomy.cdm.model.reference.Reference;
73 75
import eu.etaxonomy.cdm.model.reference.ReferenceFactory;
76
import eu.etaxonomy.cdm.strategy.cache.agent.PersonDefaultCacheStrategy;
74 77
import eu.etaxonomy.cdm.strategy.cache.agent.TeamDefaultCacheStrategy;
75 78

  
76 79
/**
......
170 173
	private class RefCounter{
171 174
		RefCounter() {refCount = 0;}
172 175
		int refCount;
176
		int dedupCount;
173 177

  
174 178
		@Override
175
        public String toString(){return String.valueOf(refCount) ;};
179
        public String toString(){return String.valueOf(refCount) + "/" + String.valueOf(dedupCount) ;}
176 180
	}
177 181

  
178 182
	@Override
......
199 203
		String strSelectId = " SELECT Reference.RefId as refId ";
200 204
		String strSelectFull =
201 205
			" SELECT Reference.* ,InReference.RefCategoryFk as InRefCategoryFk, RefSource.RefSource " ;
202
		String strFrom =  " FROM %s  " +
203
		    	" LEFT OUTER JOIN Reference as InReference ON InReference.refId = Reference.inRefFk " +
204
				" LEFT OUTER JOIN RefSource ON Reference.RefSourceFk = RefSource.RefSourceId " +
206
		String strFrom =
207
		        " FROM %s  " +
208
		    	    " LEFT OUTER JOIN Reference as InReference ON InReference.refId = Reference.inRefFk " +
209
		    	    " LEFT OUTER JOIN RefSource ON Reference.RefSourceFk = RefSource.RefSourceId " +
205 210
		    	" WHERE (1=1) ";
211
		String strOrderBy = " ORDER BY InReference.inRefFk, Reference.inRefFk "; //to make in-references available in first run
206 212
		String strWherePartitioned = " AND (Reference.refId IN ("+ ID_LIST_TOKEN + ") ) ";
207 213

  
208 214
		String referenceTable = CdmUtils.Nz(state.getConfig().getReferenceIdTable());
......
213 219
		if (! referenceFilter.isEmpty()){
214 220
			referenceFilter = " AND " + referenceFilter + " ";
215 221
		}
216
		referenceFilter = "";  //don't use it for now
222
		referenceFilter = "";  //don't use it for now, in E+M the table is directly used
217 223

  
218
		String strIdQueryFirstPath = strSelectId + strIdFrom ;
224
		String strIdQueryFirstPath = strSelectId + strIdFrom + strOrderBy ;
219 225
		String strIdQuerySecondPath = strSelectId + strIdFrom + " AND (Reference.InRefFk is NOT NULL) ";
220 226

  
221 227
//		if (config.getDoReferences() == CONCEPT_REFERENCES){
222 228
//			strIdQueryNoInRef += " AND ( Reference.refId IN ( SELECT ptRefFk FROM PTaxon) ) " + referenceFilter;
223 229
//		}
224 230

  
225
		String strRecordQuery = strSelectFull + String.format(strFrom, " Reference ") + strWherePartitioned;
231
		String strRecordQuery = strSelectFull + String.format(strFrom, " Reference ") + strWherePartitioned + strOrderBy;
226 232

  
227 233
		int recordsPerTransaction = config.getRecordsPerTransaction();
228 234
		try{
......
235 241
			logger.info("end make references without in-references ... " + getSuccessString(success));
236 242
			state.setReferenceSecondPath(true);
237 243

  
238
//			if (config.getDoReferences() == ALL || config.getDoReferences() == NOMENCLATURAL){
239

  
240 244
			//secondPath
241
			partitioner = ResultSetPartitioner.NewInstance(source, strIdQuerySecondPath, strRecordQuery, recordsPerTransaction);
242
			while (partitioner.nextPartition()){
243
				partitioner.doPartition(this, state);
244
			}
245
			logger.info("end make references with no 1 in-reference ... " + getSuccessString(success));
246
			state.setReferenceSecondPath(false);
247

  
245
//			partitioner = ResultSetPartitioner.NewInstance(source, strIdQuerySecondPath, strRecordQuery, recordsPerTransaction);
246
//			while (partitioner.nextPartition()){
247
//			    //currently not used as inRef assignment fully works through sorting of idQuery now, at least in E+M
248
//				partitioner.doPartition(this, state);
248 249
//			}
250
//			logger.info("end make references with no 1 in-reference ... " + getSuccessString(success));
251
			state.setReferenceSecondPath(false);
249 252

  
250 253
		} catch (SQLException e) {
251 254
			logger.error("SQLException:" +  e);
......
271 274

  
272 275
		Map<Integer, Reference> refToSave = new HashMap<>();
273 276

  
274
		@SuppressWarnings("unchecked")
275
        Map<String, Reference> relatedReferences = partitioner.getObjectMap(REFERENCE_NAMESPACE);
277
//		@SuppressWarnings("unchecked")
278
//        Map<String, Reference> relatedReferences = partitioner.getObjectMap(REFERENCE_NAMESPACE);
276 279

  
277 280
		BerlinModelImportConfigurator config = state.getConfig();
278 281

  
......
286 289
			while (rs.next()){
287 290
				if ((i++ % modCount) == 0 && i!= 1 ){ logger.info("References handled: " + (i-1) + " in round -" );}
288 291

  
289
				success &= makeSingleReferenceRecord(rs, state, partitioner, refToSave, relatedReferences, refCounter);
292
				success &= makeSingleReferenceRecord(rs, state, partitioner, refToSave, refCounter);
290 293
			} // end resultSet
291 294

  
292 295
			//for the concept reference a fixed uuid may be needed -> change uuid
......
299 302
			}
300 303

  
301 304
			//save and store in map
302
			logger.info("Save references (" + refCounter.refCount + ")");
305
			logger.warn("Save references (" + refCounter.toString() + ")");  //set preliminary to warn for printing dedup count
306

  
303 307
			getReferenceService().saveOrUpdate(refToSave.values());
304 308

  
305 309
//			logger.info("end makeReferences ..." + getSuccessString(success));;
......
354 358
							    thisRef.setTitleCache(null);
355 359
							    thisRef.getTitleCache();
356 360
							}
361
						}else{
362
						    logger.warn("Reference which has an inReference not found in DB. RefId: " + refId);
357 363
						}
358 364
						if(inRefFk.equals(0)){
359 365
						    logger.warn("InRefFk is 0 for refId "+ refId);
......
363 369
				} // end resultSet
364 370

  
365 371
				//save and store in map
366
				logger.info("Save references (" + refCounter.refCount + ")");
372
				logger.info("Save in references (" + refCounter.toString() + ")");
367 373
				getReferenceService().saveOrUpdate(refToSave.values());
368 374

  
369 375
//			}//end resultSetList
......
449 455
				BerlinModelImportState state,
450 456
				ResultSetPartitioner<BerlinModelImportState> partitioner,
451 457
				Map<Integer, Reference> refToSave,
452
				Map<String, Reference> relatedReferences,
453 458
				RefCounter refCounter){
454 459

  
455 460
	    boolean success = true;
......
503 508
			//created, updated, notes
504 509
			doCreatedUpdatedNotes(state, reference, rs);
505 510

  
506
			//idInSource
511
			//idInSource (import from older source to berlin model)
512
			//TODO do we want this being imported? Maybe as alternative identifier?
507 513
			String idInSource = (String)valueMap.get("IdInSource".toLowerCase());
508 514
			if (isNotBlank(idInSource)){
509 515
				IdentifiableSource source = IdentifiableSource.NewDataImportInstance(idInSource);
......
545 551
				Reference ref,
546 552
				RefCounter refCounter,
547 553
				Map<Integer, Reference> refToSave
548
				) throws SQLException{
554
			) throws SQLException{
549 555

  
550 556
		@SuppressWarnings("unchecked")
551 557
        Map<String, Team> teamMap = partitioner.getObjectMap(BerlinModelAuthorTeamImport.NAMESPACE);
552 558

  
553
		String refCache = rs.getString("refCache");
554
		String nomRefCache = rs.getString("nomRefCache");
555
		String title = rs.getString("title");
556
		String nomTitleAbbrev = rs.getString("nomTitleAbbrev");
559
		String refCache = trim(rs.getString("refCache"));
560
		String nomRefCache = trim(rs.getString("nomRefCache"));
561
		String title = trim(rs.getString("title"));
562
		String nomTitleAbbrev = trim(rs.getString("nomTitleAbbrev"));
557 563
		boolean isPreliminary = rs.getBoolean("PreliminaryFlag");
558
		String refAuthorString = rs.getString("refAuthorString");
564
		String refAuthorString = trim(rs.getString("refAuthorString"));
559 565
		Integer nomAuthorTeamFk = nullSafeInt(rs, "NomAuthorTeamFk");
566
		Integer inRefFk = nullSafeInt(rs, "inRefFk");
567

  
560 568

  
561 569
		TeamOrPersonBase<?> nomAuthor = null;
562 570
		if (nomAuthorTeamFk != null){
563 571
		    String strNomAuthorTeamFk = String.valueOf(nomAuthorTeamFk);
564 572
		    nomAuthor = teamMap.get(strNomAuthorTeamFk);
565 573
		    if (nomAuthor == null){
566
		        logger.warn("NomAuthor ("+strNomAuthorTeamFk+") not found in teamMap for " + refId);
574
		        logger.warn("NomAuthor ("+strNomAuthorTeamFk+") not found in teamMap (but it should exist) for " + refId);
567 575
		    }
568 576
		}
569 577

  
......
587 595
		TeamOrPersonBase<?> author = getAuthorship(state, refAuthorString, nomAuthor, refId);
588 596
		ref.setAuthorship(author);
589 597

  
598
		//inRef
599
		Reference inRef = null;
600
		if (inRefFk != null){
601
		    @SuppressWarnings({"unchecked" })
602
		    Map<String, Reference>  relatedReferences = partitioner.getObjectMap(REFERENCE_NAMESPACE);
603
		    inRef = relatedReferences.get(String.valueOf(inRefFk));
604
		    if (inRef == null){
605
		        inRef = refToSave.get(inRefFk);
606
		    }
607
		    if (inRef == null){
608
		        logger.warn("InRef not (yet) found. RefId: " + refId + "; InRef: "+ inRefFk);
609
		    }else{
610
		        ref.setInReference(inRef);
611
		    }
612
		}
613

  
614
		Reference result = deduplicateReference(state, ref);
615
		if(ref != result){
616
		    //dedup not possible at this point because inRef exists but is not yet defined
617
		    if (inRefFk != null && inRef == null){
618
		        result = ref;
619
		        logger.warn("Ref has deduplication candidate but inRef is still missing. " + inRef);
620
		    }else{
621
		        logger.debug("Reference was deduplicated. RefId: " + refId);
622
		        //FIXME also check annotations etc. for deduplication
623
		        refCounter.dedupCount++;
624
		    }
625
		}else{
626
		    refCounter.refCount++;
627
		}
628

  
590 629
		//save
591 630
		if (! refToSave.containsKey(refId)){
592
			refToSave.put(refId, ref);
631
			refToSave.put(refId, result);
593 632
		}else{
633
		    //should not happen
594 634
			logger.warn("Duplicate refId in Berlin Model database. Second reference was not imported !!");
595 635
		}
596
		refCounter.refCount++;
636

  
597 637

  
598 638
		//refId
599
		ImportHelper.setOriginalSource(ref, sourceReference, refId, REFERENCE_NAMESPACE);
639
		ImportHelper.setOriginalSource(result, sourceReference, refId, REFERENCE_NAMESPACE);
600 640

  
601 641
		if (commonNameRefSet != null && commonNameRefSet.contains(refId)){
602
            ref.addMarker(Marker.NewInstance(MarkerType.COMMON_NAME_REFERENCE(), true));
642
		    result.addMarker(Marker.NewInstance(MarkerType.COMMON_NAME_REFERENCE(), true));
603 643
        }
604 644

  
605 645
		return true;
606 646
	}
607 647

  
608 648
	/**
649
     * @param string
650
     * @return
651
     */
652
    private String trim(String string) {
653
        if (string == null){
654
            return null;
655
        }else{
656
            return string.trim();
657
        }
658
    }
659

  
660
    /**
609 661
	 * Copies the created and updated information from the nomReference to the cloned bibliographic reference
610 662
	 * @param referenceBase
611 663
	 * @param nomReference
......
912 964
	}
913 965

  
914 966

  
915
	private static TeamOrPersonBase<?> getAuthorship(BerlinModelImportState state, String refAuthorString,
967
	private TeamOrPersonBase<?> getAuthorship(BerlinModelImportState state, String refAuthorString,
916 968
	        TeamOrPersonBase<?> nomAuthor, Integer refId){
917 969

  
918 970
	    TeamOrPersonBase<?> result;
919 971
		if (nomAuthor != null){
920 972
			result = nomAuthor;
921 973
			if (isNotBlank(refAuthorString) && !nomAuthor.getTitleCache().equals(refAuthorString)){
922
			    boolean isSimilar = handleSimilarAuthors(state, refAuthorString, nomAuthor);
974
			    boolean isSimilar = handleSimilarAuthors(state, refAuthorString, nomAuthor, refId);
923 975
			    if (! isSimilar){
924
			        logger.warn("refAuthorString differs from nomAuthor.titleCache: " + refAuthorString
925
			                + " <-> " + nomAuthor.getTitleCache() + "; RefId: " + refId);
976
			        String message = "refAuthorString differs from nomAuthor.titleCache: " + refAuthorString
977
                            + " <-> " + nomAuthor.getTitleCache() + "; RefId: " + refId;
978
			        logger.warn(message);
926 979
			    }
927 980
			}
928

  
929
		} else if (isNotBlank(refAuthorString)){
981
		} else if (isNotBlank(refAuthorString)){//only RefAuthorString exists
930 982
		    refAuthorString = refAuthorString.trim();
931 983
			//TODO match with existing Persons/Teams
932 984
		    TeamOrPersonBase<?> author = state.getRelatedObject(REF_AUTHOR_NAMESPACE, refAuthorString, TeamOrPersonBase.class);
933 985
			if (author == null){
934 986
			    if (!BerlinModelAuthorTeamImport.hasTeamSeparator(refAuthorString)){
935
			        author = makePerson(refAuthorString, refId);
987
			        author = makePerson(refAuthorString, false, refId);
936 988
			    }else{
937 989
			        author = makeTeam(state, refAuthorString, refId);
938 990
			    }
939 991
			    state.addRelatedObject(REF_AUTHOR_NAMESPACE, refAuthorString, author);
940
			    author.addImportSource(refAuthorString, REF_AUTHOR_NAMESPACE, state.getTransactionalSourceReference(), null);
992
			    result = deduplicatePersonOrTeam(state, author);
993

  
994
			    if (result != author){
995
                    logger.debug("RefAuthorString author deduplicated " + author);
996
                }else{
997
                    if (!importSourceExists(author, refAuthorString, REF_AUTHOR_NAMESPACE, state.getTransactionalSourceReference() )){
998
                        author.addImportSource(refAuthorString, REF_AUTHOR_NAMESPACE, state.getTransactionalSourceReference(), null);
999
                    }
1000
                }
1001
			}else{
1002
			    logger.debug("RefAuthor loaded from map");
941 1003
			}
942 1004
			result = author;
943 1005
		}else{
......
947 1009
		return result;
948 1010
	}
949 1011

  
1012

  
950 1013
    /**
951 1014
     * @param state
952 1015
     * @param refAuthorString
953 1016
     * @param refId
954 1017
     * @return
955 1018
     */
956
    private static Team makeTeam(BerlinModelImportState state, String refAuthorString, Integer refId) {
1019
    private TeamOrPersonBase<?> makeTeam(BerlinModelImportState state, String refAuthorString, Integer refId) {
957 1020
        Team team = Team.NewInstance();
1021
        boolean hasDedupMember = false;
958 1022
        if (containsEdOrColon(refAuthorString)){
959 1023
            team.setTitleCache(refAuthorString, true);
960 1024
        }else{
961
            String[] fullTeams = BerlinModelAuthorTeamImport.splitTeam(refAuthorString);
1025
            String[] refAuthorTeams = BerlinModelAuthorTeamImport.splitTeam(refAuthorString);
962 1026
            boolean lastWasInitials = false;
963
            for (int i = 0; i< fullTeams.length ;i++){
1027
            for (int i = 0; i< refAuthorTeams.length ;i++){
964 1028
                if (lastWasInitials){
965 1029
                    lastWasInitials = false;
966 1030
                    continue;
967 1031
                }
968
                String fullTeam = fullTeams[i].trim();
1032
                String fullTeam = refAuthorTeams[i].trim();
969 1033
                String initials = null;
970
                if (fullTeams.length > i+1){
971
                    String nextSplit = fullTeams[i+1].trim();
1034
                if (refAuthorTeams.length > i+1){
1035
                    String nextSplit = refAuthorTeams[i+1].trim();
972 1036
                    if (isInitial(nextSplit)){
973 1037
                        lastWasInitials = true;
974 1038
                        initials = nextSplit;
975 1039
                    }
976 1040
                }
977
                Person member = makePerson(fullTeam, refId);
978

  
979
                if (initials != null && !member.isProtectedTitleCache()){
980
                    member.setInitials(initials);
981
                }else if (initials != null){
982
                    member.setTitleCache(member.getTitleCache() + ", " + initials, true);
1041
                Person member = makePerson(fullTeam, isNotBlank(initials), refId);
1042

  
1043
                if (initials != null){
1044
                    if (member.getInitials() != null){
1045
                        logger.warn("Initials already set: " + refId);
1046
                    }else if (!member.isProtectedTitleCache()){
1047
                        member.setInitials(initials);
1048
                    }else {
1049
                        member.setTitleCache(member.getTitleCache() + ", " + initials, true);
1050
                    }
983 1051
                }
984 1052

  
985
                if (i == fullTeams.length -1 && BerlinModelAuthorTeamImport.isEtAl(member)){
1053
                if (i == refAuthorTeams.length -1 && BerlinModelAuthorTeamImport.isEtAl(member)){
986 1054
                    team.setHasMoreMembers(true);
987 1055
                }else{
988
                    Person dedupMember = deduplicatePerson(state, member);
1056
                    Person dedupMember = deduplicatePersonOrTeam(state, member);
989 1057
                    if (dedupMember != member){
990
                        logger.debug("Member deduplicated: " + refId);
1058
                        hasDedupMember = true;
991 1059
                    }else{
992
                        member.addImportSource(refAuthorString, REF_AUTHOR_NAMESPACE, state.getTransactionalSourceReference(), null);
1060
                        if (!importSourceExists(member, refAuthorString, REF_AUTHOR_NAMESPACE, state.getTransactionalSourceReference())){
1061
                            member.addImportSource(refAuthorString, REF_AUTHOR_NAMESPACE, state.getTransactionalSourceReference(), null);
1062
                        }
993 1063
                    }
994
                    //TODO add idInBM
1064

  
995 1065
                    team.addTeamMember(dedupMember);
996 1066
                }
997 1067
            }
998 1068
        }
999 1069

  
1070
        TeamOrPersonBase<?> result = team;
1071
        if (team.getTeamMembers().size() == 1 && !team.isHasMoreMembers()){
1072
            Person person = team.getTeamMembers().get(0);
1073
            checkPerson(person, refAuthorString, hasDedupMember, refId);
1074
            result = person;
1075
        }else{
1076
            checkTeam(team, refAuthorString, refId);
1077
            result = team;
1078
        }
1079

  
1080
        return result;
1081
    }
1082

  
1083
    /**
1084
     * @param team
1085
     * @param refAuthorString
1086
     * @param refId
1087
     */
1088
    private static void checkTeam(Team team, String refAuthorString, Integer refId) {
1000 1089
        TeamDefaultCacheStrategy formatter = (TeamDefaultCacheStrategy) team.getCacheStrategy();
1001 1090
        formatter.setEtAlPosition(100);
1002 1091
        if (formatter.getTitleCache(team).equals(refAuthorString)){
......
1009 1098
            team.setProtectedTitleCache(false);
1010 1099
        }else if (containsEdOrColon(refAuthorString)){
1011 1100
            //nothing to do, it is expected to be protected
1101

  
1012 1102
        }else{
1013 1103
            team.setTitleCache(refAuthorString, true);
1014 1104
            logger.warn("Creation of titleCache for team with members did not (fully) work: " + refAuthorString + " <-> " + formatter.getTitleCache(team)+ " : " + refId);
1015 1105
        }
1016
        return team;
1106

  
1107
    }
1108

  
1109
    /**
1110
     * @param hasDedupMember
1111
     * @param result
1112
     * @return
1113
     */
1114
    private static void checkPerson(Person person, String refAuthorString, boolean hasDedupMember, Integer refId) {
1115
        PersonDefaultCacheStrategy formatter = (PersonDefaultCacheStrategy) person.getCacheStrategy();
1116

  
1117
        String oldTitleCache = person.getTitleCache();
1118
        boolean oldTitleCacheProtected = person.isProtectedTitleCache();
1119

  
1120
        if (! oldTitleCache.equals(refAuthorString)){
1121
            logger.error("Old titleCache does not equal refAuthorString this should not happen. "+ oldTitleCache + " <-> " + refAuthorString + "; refId = " + refId);
1122
        }
1123

  
1124
        boolean protect = true;
1125
        person.setProtectedTitleCache(false);
1126
        if (refAuthorString.equals(formatter.getTitleCache(person))){
1127
            protect = false;
1128
        }else if(formatter.getFullTitle(person).equals(refAuthorString)){
1129
            //.. or teams with initials first
1130
            protect = false;
1131
        }else{
1132
            //keep protected, see below
1133
        }
1134

  
1135
        if (hasDedupMember){
1136
            //restore
1137
            //TODO maybe even do not use dedup for testing
1138
            person.setTitleCache(oldTitleCache, oldTitleCacheProtected);
1139
            if (protect != oldTitleCacheProtected){
1140
                logger.warn("Deduplicated person protection requirement unclear for "+refAuthorString+". New:"+protect+"/Old:"+oldTitleCacheProtected+"; RefId: " + refId);
1141
            }
1142
        }else{
1143
            if (protect){
1144
                logger.warn("Creation of titleCache for person (converted from team) with members did not (fully) work: " + refAuthorString + " <-> " + formatter.getTitleCache(person)+ " : " + refId);
1145
                person.setTitleCache(refAuthorString, protect);
1146
            }else{
1147
                //keep unprotected
1148
            }
1149
        }
1017 1150
    }
1018 1151

  
1019 1152
    /**
......
1023 1156
    private static boolean containsEdOrColon(String str) {
1024 1157
        if (str.contains(" ed.") || str.contains(" Ed.") || str.contains("(ed.")
1025 1158
                || str.contains("[ed.") || str.contains("(Eds)") || str.contains("(Eds.)") ||
1026
                str.contains("(eds.)") || str.contains(":")|| str.contains(";")){
1159
                str.contains("(eds.)") || str.contains(":")|| str.contains(";") || str.contains("Publ. & Inform. Directorate")
1160
                || str.contains("Anonymous [Department of Botany, Faculty of Science, FER-ZPR, University of Zagreb]")
1161
                || str.contains("Davis, P. H. (Güner, A. & al.)")){
1027 1162
            return true;
1028 1163
        }else{
1029 1164
            return false;
......
1038 1173
        if (str == null){
1039 1174
            return false;
1040 1175
        }
1041
        boolean matches = str.trim().matches("(\\p{javaUpperCase}|Yu|Th|Ch|Lj|Sz|Dz|Sh)\\.?(\\s*[-\\s]\\s*(\\p{javaUpperCase}|Yu)\\.?)*(\\s+(van|von))?");
1176
        boolean matches = str.trim().matches("(\\p{javaUpperCase}|Yu|Ya|Th|Ch|Lj|Sz|Dz|Sh|Ju|R. M. da S)\\.?"
1177
                + "(\\s*[-\\s]\\s*(\\p{javaUpperCase}|Yu|Ja|Kh|Tz|Ya|Th|Ju)\\.?)*(\\s+(van|von|de|de la|del|da|van der))?");
1042 1178
        return matches;
1043 1179
    }
1044 1180

  
1045
    private static Person deduplicatePerson(BerlinModelImportState state, Person person) {
1046
        Person result = deduplicationHelper.getExistingAuthor(state, person);
1181
    private <T extends TeamOrPersonBase<?>> T deduplicatePersonOrTeam(BerlinModelImportState state,T author) {
1182
        T result = deduplicationHelper.getExistingAuthor(state, author);
1047 1183
        return result;
1048 1184
    }
1049 1185

  
1050
    private static Person makePerson(String full, Integer refId) {
1186
    private Reference deduplicateReference(BerlinModelImportState state,Reference ref) {
1187
        Reference result = deduplicationHelper.getExistingReference(state, ref);
1188
        return result;
1189
    }
1190

  
1191
    private static Person makePerson(String full, boolean followedByInitial, Integer refId) {
1051 1192
        Person person = Person.NewInstance();
1052 1193
        person.setTitleCache(full, true);
1053 1194
        if (!full.matches(".*[\\s\\.].*")){
1054 1195
            person.setFamilyName(full);
1055 1196
            person.setProtectedTitleCache(false);
1056
        }else if (full.matches("(\\p{javaUpperCase}|Kh)\\.(\\s\\p{javaUpperCase}\\.)*\\s\\p{javaUpperCase}\\p{javaLowerCase}{2,}")){
1057
            String[] splits = full.split("\\s");
1058
            person.setFamilyName(splits[splits.length-1]);
1059
            String initials = splits[0];
1060
            for (int i = 1; i < splits.length -1; i++ ){
1061
                initials += " " + splits[i];
1062
            }
1063
            person.setInitials(initials);
1064
            person.setProtectedTitleCache(false);
1197
        }else{
1198
            parsePerson(person, full, true, followedByInitial);
1065 1199
        }
1200

  
1066 1201
        if ((full.length() <= 2 && !full.matches("(Li|Bo|Em|Ay|Ma)")) || (full.length() == 3 && full.endsWith(".") && !full.equals("al.")) ){
1067
//            if (!full.matches("((L|Sm|DC|al|Sw|Qz|Fr|Ib)\\.|Hu|Ma|Hy|Wu)")){
1068
                logger.warn("Unexpected short nom author name part: " + full + "; " + refId);
1069
//            }
1202
            logger.warn("Unexpected short nom author name part: " + full + "; " + refId);
1070 1203
        }
1071 1204

  
1072 1205
        return person;
1073 1206
    }
1074 1207

  
1208
    private static void parsePerson(Person person, String str, boolean preliminary, boolean followedByInitial) {
1209
        String capWord = "\\p{javaUpperCase}\\p{javaLowerCase}{2,}";
1210
        String famStart = "(Le |D'|'t |Mc|Mac|Des |d'|Du |De |Al-)";
1211
        String regEx = "((\\p{javaUpperCase}|Ya|Th|Ju|Kh|An)\\.([\\s-]\\p{javaUpperCase}\\.)*(\\s(de|del|da|von|van|van der|v.|af|zu|von M. Und L.))?\\s)("
1212
                + famStart + "?" + capWord + "((-| y | i | é | de | de la )" + capWord + ")?)";
1213
        Matcher matcher = Pattern.compile(regEx).matcher(str);
1214
        if (matcher.matches()){
1215
            person.setProtectedTitleCache(false);
1216
            String familyName = matcher.group(6).trim();
1217
            person.setFamilyName(familyName);
1218
            person.setInitials(matcher.group(1).trim());
1219
        }else{
1220
            String regEx2 = "("+ capWord + "\\s" + capWord + "|Le Sueur|Beck von Mannagetta|Di Martino|Galán de Mera|Van Der Maesen|Farga i Arquimbau|Perez de Paz|Borzatti de Loewenstern|Lo Giudice|Perez de Paz)";
1221
            Matcher matcher2 = Pattern.compile(regEx2).matcher(str);
1222
            if (followedByInitial && matcher2.matches()){
1223
                person.setFamilyName(str);
1224
                person.setProtectedTitleCache(false);
1225
            }else{
1226
                person.setTitleCache(str, preliminary);
1227
            }
1228
        }
1229
    }
1230

  
1075 1231
    /**
1076 1232
     * @param state
1077 1233
     * @param refAuthorString
......
1079 1235
     * @return
1080 1236
     */
1081 1237
    private static boolean handleSimilarAuthors(BerlinModelImportState state, String refAuthorString,
1082
            TeamOrPersonBase<?> nomAuthor) {
1238
            TeamOrPersonBase<?> nomAuthor, int refId) {
1239
        String nomTitle = nomAuthor.getTitleCache();
1240

  
1083 1241
        if (refAuthorString.equals(nomAuthor.getNomenclaturalTitle())){
1084 1242
            //nomTitle equal
1085 1243
            return true;
1086 1244
        }else{
1087
            String nomTitle = nomAuthor.getTitleCache();
1088 1245
            if (refAuthorString.replace(" & ", ", ").equals(nomTitle.replace(" & ", ", "))){
1089 1246
                //nomTitle equal except for "&"
1090 1247
                return true;
1091 1248
            }
1092

  
1093
            if (refAuthorString.replace(" & ", ", ").equals(nomAuthor.getFullTitle().replace(" & ", ", "))){
1249
            String nomFullTitle = nomAuthor.getFullTitle();
1250
            if (refAuthorString.replace(" & ", ", ").equals(nomFullTitle.replace(" & ", ", "))){
1094 1251
                return true;
1095 1252
            }
1096 1253

  
1097
            if (refAuthorString.contains(",") && !nomTitle.contains(",") && nomAuthor.isInstanceOf(Person.class)){
1098
                String[] splits = refAuthorString.split(",");
1254
            if (nomAuthor.isInstanceOf(Person.class)){
1099 1255
                Person person = CdmBase.deproxy(nomAuthor, Person.class);
1100
                if (splits.length == 2){
1101
                    String newMatch = splits[1].trim() + " " + splits[0].trim();
1102
                    if (newMatch.equals(nomTitle)){
1103
                        if (isBlank(person.getFamilyName())){
1104
                            person.setFamilyName(splits[0].trim());
1105
                        }
1106
                        if (isBlank(person.getInitials())){
1107
                            person.setInitials(splits[1].trim());
1256

  
1257
                //refAuthor has initials behind, nom Author in front // the other way round is handled in firstIsFullNameOfInitialName
1258
                if (refAuthorString.contains(",") && !nomTitle.contains(",") ){
1259
                    String[] splits = refAuthorString.split(",");
1260
                    if (splits.length == 2){
1261
                        String newMatch = splits[1].trim() + " " + splits[0].trim();
1262
                        if (newMatch.equals(nomTitle)){
1263
                            if (isBlank(person.getFamilyName())){
1264
                                person.setFamilyName(splits[0].trim());
1265
                            }
1266
                            if (isBlank(person.getInitials())){
1267
                                person.setInitials(splits[1].trim());
1268
                            }
1269
                            return true;
1108 1270
                        }
1109
                        return true;
1110 1271
                    }
1111 1272
                }
1273

  
1274
                if (refAuthorIsFamilyAuthorOfNomAuthor(state, refAuthorString, person)){
1275
                    return true;
1276
                }
1277

  
1278
                if (firstIsFullNameOfInitialName(state, refAuthorString, person, refId)){
1279
                    return true;
1280
                }
1281
            }
1282

  
1283
        }
1284
        return false;
1285
    }
1286

  
1287
    /**
1288
     * @param state
1289
     * @param refAuthorString
1290
     * @param person
1291
     * @return
1292
     */
1293
    private static boolean refAuthorIsFamilyAuthorOfNomAuthor(BerlinModelImportState state, String refAuthorString,
1294
            Person person) {
1295
        if (refAuthorString.equals(person.getFamilyName())){
1296
            return true;
1297
        }else{
1298
            return false;
1299
        }
1300
    }
1301

  
1302
    /**
1303
     * @param state
1304
     * @param refAuthorString
1305
     * @param nomAuthor
1306
     * @return
1307
     */
1308
    private static boolean firstIsFullNameOfInitialName(BerlinModelImportState state, String fullName,
1309
            Person initialAuthor, int refId) {
1310
        String initialName = initialAuthor.getTitleCache();
1311

  
1312
        String[] fullSplits = fullName.split(",");
1313
        String[] initialSplits = initialName.split(",");
1314

  
1315
        if (fullSplits.length == 2 && initialSplits.length == 2){
1316
            String[] fullGivenName = fullSplits[1].trim().split(" ");
1317
            String[] initialsGivenName = initialSplits[1].trim().split(" ");
1318
            boolean result = compareFamilyAndInitials(fullSplits[0], initialSplits[0], fullGivenName, initialsGivenName);
1319
            if (result){
1320
                setGivenName(state, fullSplits[1], initialAuthor, refId);
1321
            }
1322
            return result;
1323
        }else if (fullSplits.length == 1 && initialSplits.length == 2){
1324
            String[] fullSingleSplits = fullName.split(" ");
1325
            String fullFamily = fullSingleSplits[fullSingleSplits.length-1];
1326
            String[] fullGivenName = Arrays.copyOfRange(fullSingleSplits, 0, fullSingleSplits.length-1);
1327
            String[] initialsGivenName = initialSplits[1].trim().split(" ");
1328
            boolean result =  compareFamilyAndInitials(fullFamily, initialSplits[0], fullGivenName, initialsGivenName);
1329
            if (result){
1330
                if(hasAtLeastOneFullName(fullGivenName)){
1331
                    setGivenName(state, CdmUtils.concat(" ", fullGivenName), initialAuthor, refId);
1332
                }
1333
            }
1334
            return result;
1335
        }else if (fullSplits.length == 1 && initialAuthor.getInitials() == null){
1336
            //don't if this will be implemented, initialAuthors with only nomencl.Author set
1337
        }
1338

  
1339
        return false;
1340
    }
1341

  
1342
    /**
1343
     * @param fullGivenName
1344
     * @return
1345
     */
1346
    private static boolean hasAtLeastOneFullName(String[] fullGivenName) {
1347
        for (String singleName : fullGivenName){
1348
            if (!singleName.endsWith(".") && singleName.length() > 2 && !singleName.matches("(von|van)") ){
1349
                return true;
1112 1350
            }
1113 1351
        }
1114 1352
        return false;
1115 1353
    }
1116 1354

  
1117 1355
    /**
1356
     * @param state
1357
     * @param string
1358
     * @param initialAuthor
1359
     */
1360
    private static void setGivenName(BerlinModelImportState state, String givenName, Person person, int refId) {
1361
        givenName = givenName.trim();
1362
        if(person.getGivenName() == null || person.getGivenName().equals(givenName)){
1363
            person.setGivenName(givenName);
1364
        }else{
1365
            logger.warn("RefAuthor given name and existing given name differ: " + givenName + " <-> " + person.getGivenName() + "; RefId + " + refId);
1366
        }
1367
    }
1368

  
1369
    /**
1370
     * @param fullGivenName
1371
     * @param initialsGivenName
1372
     */
1373
    protected static boolean compareFamilyAndInitials(String fullFamilyName, String initialsFamilyName,
1374
            String[] fullGivenName, String[] initialsGivenName) {
1375
        if (!fullFamilyName.equals(initialsFamilyName)){
1376
            return false;
1377
        }
1378
        if (fullGivenName.length == initialsGivenName.length){
1379
            for (int i =0; i< fullGivenName.length ; i++){
1380
                if (fullGivenName[i].length() == 0  //comma ending not allowed
1381
                        || initialsGivenName[i].length() != 2 //only K. or similar allowed
1382
                        || fullGivenName[i].length() < initialsGivenName[i].length()  //fullFirstName must be longer than abbrev Name
1383
                        || !initialsGivenName[i].endsWith(".") //initials must end with "."
1384
                        || !fullGivenName[i].startsWith(initialsGivenName[i].replace(".", ""))){ //start with same letter
1385
                    if (fullGivenName[i].matches("(von|van|de|zu)") && fullGivenName[i].equals(initialsGivenName[i])){
1386
                        continue;
1387
                    }else{
1388
                        return false;
1389
                    }
1390
                }
1391
            }
1392
            return true;
1393
        }else{
1394
            return false;
1395
        }
1396
    }
1397

  
1398
    /**
1118 1399
	 * @param lowerCase
1119 1400
	 * @param config
1120 1401
	 * @return

Also available in: Unified diff