Revision db652f5c
Added by Andreas Müller over 5 years ago
app-import/src/main/java/eu/etaxonomy/cdm/io/berlinModel/in/BerlinModelReferenceImport.java | ||
---|---|---|
36 | 36 |
import java.util.Map; |
37 | 37 |
import java.util.Set; |
38 | 38 |
import java.util.UUID; |
39 |
import java.util.regex.Matcher; |
|
40 |
import java.util.regex.Pattern; |
|
39 | 41 |
|
40 | 42 |
import org.apache.log4j.Logger; |
41 | 43 |
import org.springframework.stereotype.Component; |
... | ... | |
71 | 73 |
import eu.etaxonomy.cdm.model.reference.IPrintSeries; |
72 | 74 |
import eu.etaxonomy.cdm.model.reference.Reference; |
73 | 75 |
import eu.etaxonomy.cdm.model.reference.ReferenceFactory; |
76 |
import eu.etaxonomy.cdm.strategy.cache.agent.PersonDefaultCacheStrategy; |
|
74 | 77 |
import eu.etaxonomy.cdm.strategy.cache.agent.TeamDefaultCacheStrategy; |
75 | 78 |
|
76 | 79 |
/** |
... | ... | |
170 | 173 |
private class RefCounter{ |
171 | 174 |
RefCounter() {refCount = 0;} |
172 | 175 |
int refCount; |
176 |
int dedupCount; |
|
173 | 177 |
|
174 | 178 |
@Override |
175 |
public String toString(){return String.valueOf(refCount) ;};
|
|
179 |
public String toString(){return String.valueOf(refCount) + "/" + String.valueOf(dedupCount) ;}
|
|
176 | 180 |
} |
177 | 181 |
|
178 | 182 |
@Override |
... | ... | |
199 | 203 |
String strSelectId = " SELECT Reference.RefId as refId "; |
200 | 204 |
String strSelectFull = |
201 | 205 |
" SELECT Reference.* ,InReference.RefCategoryFk as InRefCategoryFk, RefSource.RefSource " ; |
202 |
String strFrom = " FROM %s " + |
|
203 |
" LEFT OUTER JOIN Reference as InReference ON InReference.refId = Reference.inRefFk " + |
|
204 |
" LEFT OUTER JOIN RefSource ON Reference.RefSourceFk = RefSource.RefSourceId " + |
|
206 |
String strFrom = |
|
207 |
" FROM %s " + |
|
208 |
" LEFT OUTER JOIN Reference as InReference ON InReference.refId = Reference.inRefFk " + |
|
209 |
" LEFT OUTER JOIN RefSource ON Reference.RefSourceFk = RefSource.RefSourceId " + |
|
205 | 210 |
" WHERE (1=1) "; |
211 |
String strOrderBy = " ORDER BY InReference.inRefFk, Reference.inRefFk "; //to make in-references available in first run |
|
206 | 212 |
String strWherePartitioned = " AND (Reference.refId IN ("+ ID_LIST_TOKEN + ") ) "; |
207 | 213 |
|
208 | 214 |
String referenceTable = CdmUtils.Nz(state.getConfig().getReferenceIdTable()); |
... | ... | |
213 | 219 |
if (! referenceFilter.isEmpty()){ |
214 | 220 |
referenceFilter = " AND " + referenceFilter + " "; |
215 | 221 |
} |
216 |
referenceFilter = ""; //don't use it for now |
|
222 |
referenceFilter = ""; //don't use it for now, in E+M the tabelle is directly used
|
|
217 | 223 |
|
218 |
String strIdQueryFirstPath = strSelectId + strIdFrom ; |
|
224 |
String strIdQueryFirstPath = strSelectId + strIdFrom + strOrderBy ;
|
|
219 | 225 |
String strIdQuerySecondPath = strSelectId + strIdFrom + " AND (Reference.InRefFk is NOT NULL) "; |
220 | 226 |
|
221 | 227 |
// if (config.getDoReferences() == CONCEPT_REFERENCES){ |
222 | 228 |
// strIdQueryNoInRef += " AND ( Reference.refId IN ( SELECT ptRefFk FROM PTaxon) ) " + referenceFilter; |
223 | 229 |
// } |
224 | 230 |
|
225 |
String strRecordQuery = strSelectFull + String.format(strFrom, " Reference ") + strWherePartitioned; |
|
231 |
String strRecordQuery = strSelectFull + String.format(strFrom, " Reference ") + strWherePartitioned + strOrderBy;
|
|
226 | 232 |
|
227 | 233 |
int recordsPerTransaction = config.getRecordsPerTransaction(); |
228 | 234 |
try{ |
... | ... | |
235 | 241 |
logger.info("end make references without in-references ... " + getSuccessString(success)); |
236 | 242 |
state.setReferenceSecondPath(true); |
237 | 243 |
|
238 |
// if (config.getDoReferences() == ALL || config.getDoReferences() == NOMENCLATURAL){ |
|
239 |
|
|
240 | 244 |
//secondPath |
241 |
partitioner = ResultSetPartitioner.NewInstance(source, strIdQuerySecondPath, strRecordQuery, recordsPerTransaction); |
|
242 |
while (partitioner.nextPartition()){ |
|
243 |
partitioner.doPartition(this, state); |
|
244 |
} |
|
245 |
logger.info("end make references with no 1 in-reference ... " + getSuccessString(success)); |
|
246 |
state.setReferenceSecondPath(false); |
|
247 |
|
|
245 |
// partitioner = ResultSetPartitioner.NewInstance(source, strIdQuerySecondPath, strRecordQuery, recordsPerTransaction); |
|
246 |
// while (partitioner.nextPartition()){ |
|
247 |
// //currently not used as inRef assignment fully works through sorting of idQuery now, at least in E+M |
|
248 |
// partitioner.doPartition(this, state); |
|
248 | 249 |
// } |
250 |
// logger.info("end make references with no 1 in-reference ... " + getSuccessString(success)); |
|
251 |
state.setReferenceSecondPath(false); |
|
249 | 252 |
|
250 | 253 |
} catch (SQLException e) { |
251 | 254 |
logger.error("SQLException:" + e); |
... | ... | |
271 | 274 |
|
272 | 275 |
Map<Integer, Reference> refToSave = new HashMap<>(); |
273 | 276 |
|
274 |
@SuppressWarnings("unchecked") |
|
275 |
Map<String, Reference> relatedReferences = partitioner.getObjectMap(REFERENCE_NAMESPACE); |
|
277 |
// @SuppressWarnings("unchecked")
|
|
278 |
// Map<String, Reference> relatedReferences = partitioner.getObjectMap(REFERENCE_NAMESPACE);
|
|
276 | 279 |
|
277 | 280 |
BerlinModelImportConfigurator config = state.getConfig(); |
278 | 281 |
|
... | ... | |
286 | 289 |
while (rs.next()){ |
287 | 290 |
if ((i++ % modCount) == 0 && i!= 1 ){ logger.info("References handled: " + (i-1) + " in round -" );} |
288 | 291 |
|
289 |
success &= makeSingleReferenceRecord(rs, state, partitioner, refToSave, relatedReferences, refCounter);
|
|
292 |
success &= makeSingleReferenceRecord(rs, state, partitioner, refToSave, refCounter); |
|
290 | 293 |
} // end resultSet |
291 | 294 |
|
292 | 295 |
//for the concept reference a fixed uuid may be needed -> change uuid |
... | ... | |
299 | 302 |
} |
300 | 303 |
|
301 | 304 |
//save and store in map |
302 |
logger.info("Save references (" + refCounter.refCount + ")"); |
|
305 |
logger.warn("Save references (" + refCounter.toString() + ")"); //set preliminary to warn for printing dedup count |
|
306 |
|
|
303 | 307 |
getReferenceService().saveOrUpdate(refToSave.values()); |
304 | 308 |
|
305 | 309 |
// logger.info("end makeReferences ..." + getSuccessString(success));; |
... | ... | |
354 | 358 |
thisRef.setTitleCache(null); |
355 | 359 |
thisRef.getTitleCache(); |
356 | 360 |
} |
361 |
}else{ |
|
362 |
logger.warn("Reference which has an inReference not found in DB. RefId: " + refId); |
|
357 | 363 |
} |
358 | 364 |
if(inRefFk.equals(0)){ |
359 | 365 |
logger.warn("InRefFk is 0 for refId "+ refId); |
... | ... | |
363 | 369 |
} // end resultSet |
364 | 370 |
|
365 | 371 |
//save and store in map |
366 |
logger.info("Save references (" + refCounter.refCount + ")");
|
|
372 |
logger.info("Save in references (" + refCounter.toString() + ")");
|
|
367 | 373 |
getReferenceService().saveOrUpdate(refToSave.values()); |
368 | 374 |
|
369 | 375 |
// }//end resultSetList |
... | ... | |
449 | 455 |
BerlinModelImportState state, |
450 | 456 |
ResultSetPartitioner<BerlinModelImportState> partitioner, |
451 | 457 |
Map<Integer, Reference> refToSave, |
452 |
Map<String, Reference> relatedReferences, |
|
453 | 458 |
RefCounter refCounter){ |
454 | 459 |
|
455 | 460 |
boolean success = true; |
... | ... | |
503 | 508 |
//created, updated, notes |
504 | 509 |
doCreatedUpdatedNotes(state, reference, rs); |
505 | 510 |
|
506 |
//idInSource |
|
511 |
//idInSource (import from older source to berlin model) |
|
512 |
//TODO do we want this to be imported? Maybe as an alternative identifier? |
|
507 | 513 |
String idInSource = (String)valueMap.get("IdInSource".toLowerCase()); |
508 | 514 |
if (isNotBlank(idInSource)){ |
509 | 515 |
IdentifiableSource source = IdentifiableSource.NewDataImportInstance(idInSource); |
... | ... | |
545 | 551 |
Reference ref, |
546 | 552 |
RefCounter refCounter, |
547 | 553 |
Map<Integer, Reference> refToSave |
548 |
) throws SQLException{
|
|
554 |
) throws SQLException{ |
|
549 | 555 |
|
550 | 556 |
@SuppressWarnings("unchecked") |
551 | 557 |
Map<String, Team> teamMap = partitioner.getObjectMap(BerlinModelAuthorTeamImport.NAMESPACE); |
552 | 558 |
|
553 |
String refCache = rs.getString("refCache");
|
|
554 |
String nomRefCache = rs.getString("nomRefCache");
|
|
555 |
String title = rs.getString("title");
|
|
556 |
String nomTitleAbbrev = rs.getString("nomTitleAbbrev");
|
|
559 |
String refCache = trim(rs.getString("refCache"));
|
|
560 |
String nomRefCache = trim(rs.getString("nomRefCache"));
|
|
561 |
String title = trim(rs.getString("title"));
|
|
562 |
String nomTitleAbbrev = trim(rs.getString("nomTitleAbbrev"));
|
|
557 | 563 |
boolean isPreliminary = rs.getBoolean("PreliminaryFlag"); |
558 |
String refAuthorString = rs.getString("refAuthorString");
|
|
564 |
String refAuthorString = trim(rs.getString("refAuthorString"));
|
|
559 | 565 |
Integer nomAuthorTeamFk = nullSafeInt(rs, "NomAuthorTeamFk"); |
566 |
Integer inRefFk = nullSafeInt(rs, "inRefFk"); |
|
567 |
|
|
560 | 568 |
|
561 | 569 |
TeamOrPersonBase<?> nomAuthor = null; |
562 | 570 |
if (nomAuthorTeamFk != null){ |
563 | 571 |
String strNomAuthorTeamFk = String.valueOf(nomAuthorTeamFk); |
564 | 572 |
nomAuthor = teamMap.get(strNomAuthorTeamFk); |
565 | 573 |
if (nomAuthor == null){ |
566 |
logger.warn("NomAuthor ("+strNomAuthorTeamFk+") not found in teamMap for " + refId); |
|
574 |
logger.warn("NomAuthor ("+strNomAuthorTeamFk+") not found in teamMap (but it should exist) for " + refId);
|
|
567 | 575 |
} |
568 | 576 |
} |
569 | 577 |
|
... | ... | |
587 | 595 |
TeamOrPersonBase<?> author = getAuthorship(state, refAuthorString, nomAuthor, refId); |
588 | 596 |
ref.setAuthorship(author); |
589 | 597 |
|
598 |
//inRef |
|
599 |
Reference inRef = null; |
|
600 |
if (inRefFk != null){ |
|
601 |
@SuppressWarnings({"unchecked" }) |
|
602 |
Map<String, Reference> relatedReferences = partitioner.getObjectMap(REFERENCE_NAMESPACE); |
|
603 |
inRef = relatedReferences.get(String.valueOf(inRefFk)); |
|
604 |
if (inRef == null){ |
|
605 |
inRef = refToSave.get(inRefFk); |
|
606 |
} |
|
607 |
if (inRef == null){ |
|
608 |
logger.warn("InRef not (yet) found. RefId: " + refId + "; InRef: "+ inRefFk); |
|
609 |
}else{ |
|
610 |
ref.setInReference(inRef); |
|
611 |
} |
|
612 |
} |
|
613 |
|
|
614 |
Reference result = deduplicateReference(state, ref); |
|
615 |
if(ref != result){ |
|
616 |
//dedup not possible at this point because inRef exists but is not yet defined |
|
617 |
if (inRefFk != null && inRef == null){ |
|
618 |
result = ref; |
|
619 |
logger.warn("Ref has deduplication candidate but inRef is still missing. " + inRef); |
|
620 |
}else{ |
|
621 |
logger.debug("Reference was deduplicated. RefId: " + refId); |
|
622 |
//FIXME also check annotations etc. for deduplication |
|
623 |
refCounter.dedupCount++; |
|
624 |
} |
|
625 |
}else{ |
|
626 |
refCounter.refCount++; |
|
627 |
} |
|
628 |
|
|
590 | 629 |
//save |
591 | 630 |
if (! refToSave.containsKey(refId)){ |
592 |
refToSave.put(refId, ref);
|
|
631 |
refToSave.put(refId, result);
|
|
593 | 632 |
}else{ |
633 |
//should not happen |
|
594 | 634 |
logger.warn("Duplicate refId in Berlin Model database. Second reference was not imported !!"); |
595 | 635 |
} |
596 |
refCounter.refCount++; |
|
636 |
|
|
597 | 637 |
|
598 | 638 |
//refId |
599 |
ImportHelper.setOriginalSource(ref, sourceReference, refId, REFERENCE_NAMESPACE);
|
|
639 |
ImportHelper.setOriginalSource(result, sourceReference, refId, REFERENCE_NAMESPACE);
|
|
600 | 640 |
|
601 | 641 |
if (commonNameRefSet != null && commonNameRefSet.contains(refId)){ |
602 |
ref.addMarker(Marker.NewInstance(MarkerType.COMMON_NAME_REFERENCE(), true));
|
|
642 |
result.addMarker(Marker.NewInstance(MarkerType.COMMON_NAME_REFERENCE(), true));
|
|
603 | 643 |
} |
604 | 644 |
|
605 | 645 |
return true; |
606 | 646 |
} |
607 | 647 |
|
608 | 648 |
/** |
649 |
* @param string |
|
650 |
* @return |
|
651 |
*/ |
|
652 |
private String trim(String string) { |
|
653 |
if (string == null){ |
|
654 |
return null; |
|
655 |
}else{ |
|
656 |
return string.trim(); |
|
657 |
} |
|
658 |
} |
|
659 |
|
|
660 |
/** |
|
609 | 661 |
* Copies the created and updated information from the nomReference to the cloned bibliographic reference |
610 | 662 |
* @param referenceBase |
611 | 663 |
* @param nomReference |
... | ... | |
912 | 964 |
} |
913 | 965 |
|
914 | 966 |
|
915 |
private static TeamOrPersonBase<?> getAuthorship(BerlinModelImportState state, String refAuthorString,
|
|
967 |
private TeamOrPersonBase<?> getAuthorship(BerlinModelImportState state, String refAuthorString, |
|
916 | 968 |
TeamOrPersonBase<?> nomAuthor, Integer refId){ |
917 | 969 |
|
918 | 970 |
TeamOrPersonBase<?> result; |
919 | 971 |
if (nomAuthor != null){ |
920 | 972 |
result = nomAuthor; |
921 | 973 |
if (isNotBlank(refAuthorString) && !nomAuthor.getTitleCache().equals(refAuthorString)){ |
922 |
boolean isSimilar = handleSimilarAuthors(state, refAuthorString, nomAuthor); |
|
974 |
boolean isSimilar = handleSimilarAuthors(state, refAuthorString, nomAuthor, refId);
|
|
923 | 975 |
if (! isSimilar){ |
924 |
logger.warn("refAuthorString differs from nomAuthor.titleCache: " + refAuthorString |
|
925 |
+ " <-> " + nomAuthor.getTitleCache() + "; RefId: " + refId); |
|
976 |
String message = "refAuthorString differs from nomAuthor.titleCache: " + refAuthorString |
|
977 |
+ " <-> " + nomAuthor.getTitleCache() + "; RefId: " + refId; |
|
978 |
logger.warn(message); |
|
926 | 979 |
} |
927 | 980 |
} |
928 |
|
|
929 |
} else if (isNotBlank(refAuthorString)){ |
|
981 |
} else if (isNotBlank(refAuthorString)){//only RefAuthorString exists |
|
930 | 982 |
refAuthorString = refAuthorString.trim(); |
931 | 983 |
//TODO match with existing Persons/Teams |
932 | 984 |
TeamOrPersonBase<?> author = state.getRelatedObject(REF_AUTHOR_NAMESPACE, refAuthorString, TeamOrPersonBase.class); |
933 | 985 |
if (author == null){ |
934 | 986 |
if (!BerlinModelAuthorTeamImport.hasTeamSeparator(refAuthorString)){ |
935 |
author = makePerson(refAuthorString, refId); |
|
987 |
author = makePerson(refAuthorString, false, refId);
|
|
936 | 988 |
}else{ |
937 | 989 |
author = makeTeam(state, refAuthorString, refId); |
938 | 990 |
} |
939 | 991 |
state.addRelatedObject(REF_AUTHOR_NAMESPACE, refAuthorString, author); |
940 |
author.addImportSource(refAuthorString, REF_AUTHOR_NAMESPACE, state.getTransactionalSourceReference(), null); |
|
992 |
result = deduplicatePersonOrTeam(state, author); |
|
993 |
|
|
994 |
if (result != author){ |
|
995 |
logger.debug("RefAuthorString author deduplicated " + author); |
|
996 |
}else{ |
|
997 |
if (!importSourceExists(author, refAuthorString, REF_AUTHOR_NAMESPACE, state.getTransactionalSourceReference() )){ |
|
998 |
author.addImportSource(refAuthorString, REF_AUTHOR_NAMESPACE, state.getTransactionalSourceReference(), null); |
|
999 |
} |
|
1000 |
} |
|
1001 |
}else{ |
|
1002 |
logger.debug("RefAuthor loaded from map"); |
|
941 | 1003 |
} |
942 | 1004 |
result = author; |
943 | 1005 |
}else{ |
... | ... | |
947 | 1009 |
return result; |
948 | 1010 |
} |
949 | 1011 |
|
1012 |
|
|
950 | 1013 |
/** |
951 | 1014 |
* @param state |
952 | 1015 |
* @param refAuthorString |
953 | 1016 |
* @param refId |
954 | 1017 |
* @return |
955 | 1018 |
*/ |
956 |
private static Team makeTeam(BerlinModelImportState state, String refAuthorString, Integer refId) {
|
|
1019 |
private TeamOrPersonBase<?> makeTeam(BerlinModelImportState state, String refAuthorString, Integer refId) {
|
|
957 | 1020 |
Team team = Team.NewInstance(); |
1021 |
boolean hasDedupMember = false; |
|
958 | 1022 |
if (containsEdOrColon(refAuthorString)){ |
959 | 1023 |
team.setTitleCache(refAuthorString, true); |
960 | 1024 |
}else{ |
961 |
String[] fullTeams = BerlinModelAuthorTeamImport.splitTeam(refAuthorString);
|
|
1025 |
String[] refAuthorTeams = BerlinModelAuthorTeamImport.splitTeam(refAuthorString);
|
|
962 | 1026 |
boolean lastWasInitials = false; |
963 |
for (int i = 0; i< fullTeams.length ;i++){
|
|
1027 |
for (int i = 0; i< refAuthorTeams.length ;i++){
|
|
964 | 1028 |
if (lastWasInitials){ |
965 | 1029 |
lastWasInitials = false; |
966 | 1030 |
continue; |
967 | 1031 |
} |
968 |
String fullTeam = fullTeams[i].trim();
|
|
1032 |
String fullTeam = refAuthorTeams[i].trim();
|
|
969 | 1033 |
String initials = null; |
970 |
if (fullTeams.length > i+1){
|
|
971 |
String nextSplit = fullTeams[i+1].trim();
|
|
1034 |
if (refAuthorTeams.length > i+1){
|
|
1035 |
String nextSplit = refAuthorTeams[i+1].trim();
|
|
972 | 1036 |
if (isInitial(nextSplit)){ |
973 | 1037 |
lastWasInitials = true; |
974 | 1038 |
initials = nextSplit; |
975 | 1039 |
} |
976 | 1040 |
} |
977 |
Person member = makePerson(fullTeam, refId); |
|
978 |
|
|
979 |
if (initials != null && !member.isProtectedTitleCache()){ |
|
980 |
member.setInitials(initials); |
|
981 |
}else if (initials != null){ |
|
982 |
member.setTitleCache(member.getTitleCache() + ", " + initials, true); |
|
1041 |
Person member = makePerson(fullTeam, isNotBlank(initials), refId); |
|
1042 |
|
|
1043 |
if (initials != null){ |
|
1044 |
if (member.getInitials() != null){ |
|
1045 |
logger.warn("Initials already set: " + refId); |
|
1046 |
}else if (!member.isProtectedTitleCache()){ |
|
1047 |
member.setInitials(initials); |
|
1048 |
}else { |
|
1049 |
member.setTitleCache(member.getTitleCache() + ", " + initials, true); |
|
1050 |
} |
|
983 | 1051 |
} |
984 | 1052 |
|
985 |
if (i == fullTeams.length -1 && BerlinModelAuthorTeamImport.isEtAl(member)){
|
|
1053 |
if (i == refAuthorTeams.length -1 && BerlinModelAuthorTeamImport.isEtAl(member)){
|
|
986 | 1054 |
team.setHasMoreMembers(true); |
987 | 1055 |
}else{ |
988 |
Person dedupMember = deduplicatePerson(state, member); |
|
1056 |
Person dedupMember = deduplicatePersonOrTeam(state, member);
|
|
989 | 1057 |
if (dedupMember != member){ |
990 |
logger.debug("Member deduplicated: " + refId);
|
|
1058 |
hasDedupMember = true;
|
|
991 | 1059 |
}else{ |
992 |
member.addImportSource(refAuthorString, REF_AUTHOR_NAMESPACE, state.getTransactionalSourceReference(), null); |
|
1060 |
if (!importSourceExists(member, refAuthorString, REF_AUTHOR_NAMESPACE, state.getTransactionalSourceReference())){ |
|
1061 |
member.addImportSource(refAuthorString, REF_AUTHOR_NAMESPACE, state.getTransactionalSourceReference(), null); |
|
1062 |
} |
|
993 | 1063 |
} |
994 |
//TODO add idInBM |
|
1064 |
|
|
995 | 1065 |
team.addTeamMember(dedupMember); |
996 | 1066 |
} |
997 | 1067 |
} |
998 | 1068 |
} |
999 | 1069 |
|
1070 |
TeamOrPersonBase<?> result = team; |
|
1071 |
if (team.getTeamMembers().size() == 1 && !team.isHasMoreMembers()){ |
|
1072 |
Person person = team.getTeamMembers().get(0); |
|
1073 |
checkPerson(person, refAuthorString, hasDedupMember, refId); |
|
1074 |
result = person; |
|
1075 |
}else{ |
|
1076 |
checkTeam(team, refAuthorString, refId); |
|
1077 |
result = team; |
|
1078 |
} |
|
1079 |
|
|
1080 |
return result; |
|
1081 |
} |
|
1082 |
|
|
1083 |
/** |
|
1084 |
* @param team |
|
1085 |
* @param refAuthorString |
|
1086 |
* @param refId |
|
1087 |
*/ |
|
1088 |
private static void checkTeam(Team team, String refAuthorString, Integer refId) { |
|
1000 | 1089 |
TeamDefaultCacheStrategy formatter = (TeamDefaultCacheStrategy) team.getCacheStrategy(); |
1001 | 1090 |
formatter.setEtAlPosition(100); |
1002 | 1091 |
if (formatter.getTitleCache(team).equals(refAuthorString)){ |
... | ... | |
1009 | 1098 |
team.setProtectedTitleCache(false); |
1010 | 1099 |
}else if (containsEdOrColon(refAuthorString)){ |
1011 | 1100 |
//nothing to do, it is expected to be protected |
1101 |
|
|
1012 | 1102 |
}else{ |
1013 | 1103 |
team.setTitleCache(refAuthorString, true); |
1014 | 1104 |
logger.warn("Creation of titleCache for team with members did not (fully) work: " + refAuthorString + " <-> " + formatter.getTitleCache(team)+ " : " + refId); |
1015 | 1105 |
} |
1016 |
return team; |
|
1106 |
|
|
1107 |
} |
|
1108 |
|
|
1109 |
/** |
|
1110 |
* @param hasDedupMember |
|
1111 |
* @param result |
|
1112 |
* @return |
|
1113 |
*/ |
|
1114 |
private static void checkPerson(Person person, String refAuthorString, boolean hasDedupMember, Integer refId) { |
|
1115 |
PersonDefaultCacheStrategy formatter = (PersonDefaultCacheStrategy) person.getCacheStrategy(); |
|
1116 |
|
|
1117 |
String oldTitleCache = person.getTitleCache(); |
|
1118 |
boolean oldTitleCacheProtected = person.isProtectedTitleCache(); |
|
1119 |
|
|
1120 |
if (! oldTitleCache.equals(refAuthorString)){ |
|
1121 |
logger.error("Old titleCache does not equal refAuthorString this should not happen. "+ oldTitleCache + " <-> " + refAuthorString + "; refId = " + refId); |
|
1122 |
} |
|
1123 |
|
|
1124 |
boolean protect = true; |
|
1125 |
person.setProtectedTitleCache(false); |
|
1126 |
if (refAuthorString.equals(formatter.getTitleCache(person))){ |
|
1127 |
protect = false; |
|
1128 |
}else if(formatter.getFullTitle(person).equals(refAuthorString)){ |
|
1129 |
//.. or teams with initials first |
|
1130 |
protect = false; |
|
1131 |
}else{ |
|
1132 |
//keep protected, see below |
|
1133 |
} |
|
1134 |
|
|
1135 |
if (hasDedupMember){ |
|
1136 |
//restore |
|
1137 |
//TODO maybe even do not use dedup for testing |
|
1138 |
person.setTitleCache(oldTitleCache, oldTitleCacheProtected); |
|
1139 |
if (protect != oldTitleCacheProtected){ |
|
1140 |
logger.warn("Deduplicated person protection requirement unclear for "+refAuthorString+". New:"+protect+"/Old:"+oldTitleCacheProtected+"; RefId: " + refId); |
|
1141 |
} |
|
1142 |
}else{ |
|
1143 |
if (protect){ |
|
1144 |
logger.warn("Creation of titleCache for person (converted from team) with members did not (fully) work: " + refAuthorString + " <-> " + formatter.getTitleCache(person)+ " : " + refId); |
|
1145 |
person.setTitleCache(refAuthorString, protect); |
|
1146 |
}else{ |
|
1147 |
//keep unprotected |
|
1148 |
} |
|
1149 |
} |
|
1017 | 1150 |
} |
1018 | 1151 |
|
1019 | 1152 |
/** |
... | ... | |
1023 | 1156 |
private static boolean containsEdOrColon(String str) { |
1024 | 1157 |
if (str.contains(" ed.") || str.contains(" Ed.") || str.contains("(ed.") |
1025 | 1158 |
|| str.contains("[ed.") || str.contains("(Eds)") || str.contains("(Eds.)") || |
1026 |
str.contains("(eds.)") || str.contains(":")|| str.contains(";")){ |
|
1159 |
str.contains("(eds.)") || str.contains(":")|| str.contains(";") || str.contains("Publ. & Inform. Directorate") |
|
1160 |
|| str.contains("Anonymous [Department of Botany, Faculty of Science, FER-ZPR, University of Zagreb]") |
|
1161 |
|| str.contains("Davis, P. H. (Güner, A. & al.)")){ |
|
1027 | 1162 |
return true; |
1028 | 1163 |
}else{ |
1029 | 1164 |
return false; |
... | ... | |
1038 | 1173 |
if (str == null){ |
1039 | 1174 |
return false; |
1040 | 1175 |
} |
1041 |
boolean matches = str.trim().matches("(\\p{javaUpperCase}|Yu|Th|Ch|Lj|Sz|Dz|Sh)\\.?(\\s*[-\\s]\\s*(\\p{javaUpperCase}|Yu)\\.?)*(\\s+(van|von))?"); |
|
1176 |
boolean matches = str.trim().matches("(\\p{javaUpperCase}|Yu|Ya|Th|Ch|Lj|Sz|Dz|Sh|Ju|R. M. da S)\\.?" |
|
1177 |
+ "(\\s*[-\\s]\\s*(\\p{javaUpperCase}|Yu|Ja|Kh|Tz|Ya|Th|Ju)\\.?)*(\\s+(van|von|de|de la|del|da|van der))?"); |
|
1042 | 1178 |
return matches; |
1043 | 1179 |
} |
1044 | 1180 |
|
1045 |
private static Person deduplicatePerson(BerlinModelImportState state, Person person) {
|
|
1046 |
Person result = deduplicationHelper.getExistingAuthor(state, person);
|
|
1181 |
private <T extends TeamOrPersonBase<?>> T deduplicatePersonOrTeam(BerlinModelImportState state,T author) {
|
|
1182 |
T result = deduplicationHelper.getExistingAuthor(state, author);
|
|
1047 | 1183 |
return result; |
1048 | 1184 |
} |
1049 | 1185 |
|
1050 |
private static Person makePerson(String full, Integer refId) { |
|
1186 |
private Reference deduplicateReference(BerlinModelImportState state,Reference ref) { |
|
1187 |
Reference result = deduplicationHelper.getExistingReference(state, ref); |
|
1188 |
return result; |
|
1189 |
} |
|
1190 |
|
|
1191 |
private static Person makePerson(String full, boolean followedByInitial, Integer refId) { |
|
1051 | 1192 |
Person person = Person.NewInstance(); |
1052 | 1193 |
person.setTitleCache(full, true); |
1053 | 1194 |
if (!full.matches(".*[\\s\\.].*")){ |
1054 | 1195 |
person.setFamilyName(full); |
1055 | 1196 |
person.setProtectedTitleCache(false); |
1056 |
}else if (full.matches("(\\p{javaUpperCase}|Kh)\\.(\\s\\p{javaUpperCase}\\.)*\\s\\p{javaUpperCase}\\p{javaLowerCase}{2,}")){ |
|
1057 |
String[] splits = full.split("\\s"); |
|
1058 |
person.setFamilyName(splits[splits.length-1]); |
|
1059 |
String initials = splits[0]; |
|
1060 |
for (int i = 1; i < splits.length -1; i++ ){ |
|
1061 |
initials += " " + splits[i]; |
|
1062 |
} |
|
1063 |
person.setInitials(initials); |
|
1064 |
person.setProtectedTitleCache(false); |
|
1197 |
}else{ |
|
1198 |
parsePerson(person, full, true, followedByInitial); |
|
1065 | 1199 |
} |
1200 |
|
|
1066 | 1201 |
if ((full.length() <= 2 && !full.matches("(Li|Bo|Em|Ay|Ma)")) || (full.length() == 3 && full.endsWith(".") && !full.equals("al.")) ){ |
1067 |
// if (!full.matches("((L|Sm|DC|al|Sw|Qz|Fr|Ib)\\.|Hu|Ma|Hy|Wu)")){ |
|
1068 |
logger.warn("Unexpected short nom author name part: " + full + "; " + refId); |
|
1069 |
// } |
|
1202 |
logger.warn("Unexpected short nom author name part: " + full + "; " + refId); |
|
1070 | 1203 |
} |
1071 | 1204 |
|
1072 | 1205 |
return person; |
1073 | 1206 |
} |
1074 | 1207 |
|
1208 |
private static void parsePerson(Person person, String str, boolean preliminary, boolean followedByInitial) { |
|
1209 |
String capWord = "\\p{javaUpperCase}\\p{javaLowerCase}{2,}"; |
|
1210 |
String famStart = "(Le |D'|'t |Mc|Mac|Des |d'|Du |De |Al-)"; |
|
1211 |
String regEx = "((\\p{javaUpperCase}|Ya|Th|Ju|Kh|An)\\.([\\s-]\\p{javaUpperCase}\\.)*(\\s(de|del|da|von|van|van der|v.|af|zu|von M. Und L.))?\\s)(" |
|
1212 |
+ famStart + "?" + capWord + "((-| y | i | é | de | de la )" + capWord + ")?)"; |
|
1213 |
Matcher matcher = Pattern.compile(regEx).matcher(str); |
|
1214 |
if (matcher.matches()){ |
|
1215 |
person.setProtectedTitleCache(false); |
|
1216 |
String familyName = matcher.group(6).trim(); |
|
1217 |
person.setFamilyName(familyName); |
|
1218 |
person.setInitials(matcher.group(1).trim()); |
|
1219 |
}else{ |
|
1220 |
String regEx2 = "("+ capWord + "\\s" + capWord + "|Le Sueur|Beck von Mannagetta|Di Martino|Galán de Mera|Van Der Maesen|Farga i Arquimbau|Perez de Paz|Borzatti de Loewenstern|Lo Giudice|Perez de Paz)"; |
|
1221 |
Matcher matcher2 = Pattern.compile(regEx2).matcher(str); |
|
1222 |
if (followedByInitial && matcher2.matches()){ |
|
1223 |
person.setFamilyName(str); |
|
1224 |
person.setProtectedTitleCache(false); |
|
1225 |
}else{ |
|
1226 |
person.setTitleCache(str, preliminary); |
|
1227 |
} |
|
1228 |
} |
|
1229 |
} |
|
1230 |
|
|
1075 | 1231 |
/** |
1076 | 1232 |
* @param state |
1077 | 1233 |
* @param refAuthorString |
... | ... | |
1079 | 1235 |
* @return |
1080 | 1236 |
*/ |
1081 | 1237 |
private static boolean handleSimilarAuthors(BerlinModelImportState state, String refAuthorString, |
1082 |
TeamOrPersonBase<?> nomAuthor) { |
|
1238 |
TeamOrPersonBase<?> nomAuthor, int refId) { |
|
1239 |
String nomTitle = nomAuthor.getTitleCache(); |
|
1240 |
|
|
1083 | 1241 |
if (refAuthorString.equals(nomAuthor.getNomenclaturalTitle())){ |
1084 | 1242 |
//nomTitle equal |
1085 | 1243 |
return true; |
1086 | 1244 |
}else{ |
1087 |
String nomTitle = nomAuthor.getTitleCache(); |
|
1088 | 1245 |
if (refAuthorString.replace(" & ", ", ").equals(nomTitle.replace(" & ", ", "))){ |
1089 | 1246 |
//nomTitle equal except for "&" |
1090 | 1247 |
return true; |
1091 | 1248 |
} |
1092 |
|
|
1093 |
if (refAuthorString.replace(" & ", ", ").equals(nomAuthor.getFullTitle().replace(" & ", ", "))){
|
|
1249 |
String nomFullTitle = nomAuthor.getFullTitle(); |
|
1250 |
if (refAuthorString.replace(" & ", ", ").equals(nomFullTitle.replace(" & ", ", "))){
|
|
1094 | 1251 |
return true; |
1095 | 1252 |
} |
1096 | 1253 |
|
1097 |
if (refAuthorString.contains(",") && !nomTitle.contains(",") && nomAuthor.isInstanceOf(Person.class)){ |
|
1098 |
String[] splits = refAuthorString.split(","); |
|
1254 |
if (nomAuthor.isInstanceOf(Person.class)){ |
|
1099 | 1255 |
Person person = CdmBase.deproxy(nomAuthor, Person.class); |
1100 |
if (splits.length == 2){ |
|
1101 |
String newMatch = splits[1].trim() + " " + splits[0].trim(); |
|
1102 |
if (newMatch.equals(nomTitle)){ |
|
1103 |
if (isBlank(person.getFamilyName())){ |
|
1104 |
person.setFamilyName(splits[0].trim()); |
|
1105 |
} |
|
1106 |
if (isBlank(person.getInitials())){ |
|
1107 |
person.setInitials(splits[1].trim()); |
|
1256 |
|
|
1257 |
//refAuthor has the initials after the family name ("Family, Initials"), nomAuthor has them in front ("Initials Family"); the other way round is handled in firstIsFullNameOfInitialName |
|
1258 |
if (refAuthorString.contains(",") && !nomTitle.contains(",") ){ |
|
1259 |
String[] splits = refAuthorString.split(","); |
|
1260 |
if (splits.length == 2){ |
|
1261 |
String newMatch = splits[1].trim() + " " + splits[0].trim(); |
|
1262 |
if (newMatch.equals(nomTitle)){ |
|
1263 |
if (isBlank(person.getFamilyName())){ |
|
1264 |
person.setFamilyName(splits[0].trim()); |
|
1265 |
} |
|
1266 |
if (isBlank(person.getInitials())){ |
|
1267 |
person.setInitials(splits[1].trim()); |
|
1268 |
} |
|
1269 |
return true; |
|
1108 | 1270 |
} |
1109 |
return true; |
|
1110 | 1271 |
} |
1111 | 1272 |
} |
1273 |
|
|
1274 |
if (refAuthorIsFamilyAuthorOfNomAuthor(state, refAuthorString, person)){ |
|
1275 |
return true; |
|
1276 |
} |
|
1277 |
|
|
1278 |
if (firstIsFullNameOfInitialName(state, refAuthorString, person, refId)){ |
|
1279 |
return true; |
|
1280 |
} |
|
1281 |
} |
|
1282 |
|
|
1283 |
} |
|
1284 |
return false; |
|
1285 |
} |
|
1286 |
|
|
1287 |
/** |
|
1288 |
* @param state |
|
1289 |
* @param refAuthorString |
|
1290 |
* @param person |
|
1291 |
* @return |
|
1292 |
*/ |
|
1293 |
private static boolean refAuthorIsFamilyAuthorOfNomAuthor(BerlinModelImportState state, String refAuthorString, |
|
1294 |
Person person) { |
|
1295 |
if (refAuthorString.equals(person.getFamilyName())){ |
|
1296 |
return true; |
|
1297 |
}else{ |
|
1298 |
return false; |
|
1299 |
} |
|
1300 |
} |
|
1301 |
|
|
1302 |
/** |
|
1303 |
* @param state |
|
1304 |
* @param refAuthorString |
|
1305 |
* @param nomAuthor |
|
1306 |
* @return |
|
1307 |
*/ |
|
1308 |
private static boolean firstIsFullNameOfInitialName(BerlinModelImportState state, String fullName, |
|
1309 |
Person initialAuthor, int refId) { |
|
1310 |
String initialName = initialAuthor.getTitleCache(); |
|
1311 |
|
|
1312 |
String[] fullSplits = fullName.split(","); |
|
1313 |
String[] initialSplits = initialName.split(","); |
|
1314 |
|
|
1315 |
if (fullSplits.length == 2 && initialSplits.length == 2){ |
|
1316 |
String[] fullGivenName = fullSplits[1].trim().split(" "); |
|
1317 |
String[] initialsGivenName = initialSplits[1].trim().split(" "); |
|
1318 |
boolean result = compareFamilyAndInitials(fullSplits[0], initialSplits[0], fullGivenName, initialsGivenName); |
|
1319 |
if (result){ |
|
1320 |
setGivenName(state, fullSplits[1], initialAuthor, refId); |
|
1321 |
} |
|
1322 |
return result; |
|
1323 |
}else if (fullSplits.length == 1 && initialSplits.length == 2){ |
|
1324 |
String[] fullSingleSplits = fullName.split(" "); |
|
1325 |
String fullFamily = fullSingleSplits[fullSingleSplits.length-1]; |
|
1326 |
String[] fullGivenName = Arrays.copyOfRange(fullSingleSplits, 0, fullSingleSplits.length-1); |
|
1327 |
String[] initialsGivenName = initialSplits[1].trim().split(" "); |
|
1328 |
boolean result = compareFamilyAndInitials(fullFamily, initialSplits[0], fullGivenName, initialsGivenName); |
|
1329 |
if (result){ |
|
1330 |
if(hasAtLeastOneFullName(fullGivenName)){ |
|
1331 |
setGivenName(state, CdmUtils.concat(" ", fullGivenName), initialAuthor, refId); |
|
1332 |
} |
|
1333 |
} |
|
1334 |
return result; |
|
1335 |
}else if (fullSplits.length == 1 && initialAuthor.getInitials() == null){ |
|
1336 |
//don't know if this will be implemented, initialAuthors with only nomencl.Author set |
|
1337 |
} |
|
1338 |
|
|
1339 |
return false; |
|
1340 |
} |
|
1341 |
|
|
1342 |
/** |
|
1343 |
* @param fullGivenName |
|
1344 |
* @return |
|
1345 |
*/ |
|
1346 |
private static boolean hasAtLeastOneFullName(String[] fullGivenName) { |
|
1347 |
for (String singleName : fullGivenName){ |
|
1348 |
if (!singleName.endsWith(".") && singleName.length() > 2 && !singleName.matches("(von|van)") ){ |
|
1349 |
return true; |
|
1112 | 1350 |
} |
1113 | 1351 |
} |
1114 | 1352 |
return false; |
1115 | 1353 |
} |
1116 | 1354 |
|
1117 | 1355 |
/** |
1356 |
* @param state |
|
1357 |
* @param string |
|
1358 |
* @param initialAuthor |
|
1359 |
*/ |
|
1360 |
private static void setGivenName(BerlinModelImportState state, String givenName, Person person, int refId) { |
|
1361 |
givenName = givenName.trim(); |
|
1362 |
if(person.getGivenName() == null || person.getGivenName().equals(givenName)){ |
|
1363 |
person.setGivenName(givenName); |
|
1364 |
}else{ |
|
1365 |
logger.warn("RefAuthor given name and existing given name differ: " + givenName + " <-> " + person.getGivenName() + "; RefId + " + refId); |
|
1366 |
} |
|
1367 |
} |
|
1368 |
|
|
1369 |
/** |
|
1370 |
* @param fullGivenName |
|
1371 |
* @param initialsGivenName |
|
1372 |
*/ |
|
1373 |
protected static boolean compareFamilyAndInitials(String fullFamilyName, String initialsFamilyName, |
|
1374 |
String[] fullGivenName, String[] initialsGivenName) { |
|
1375 |
if (!fullFamilyName.equals(initialsFamilyName)){ |
|
1376 |
return false; |
|
1377 |
} |
|
1378 |
if (fullGivenName.length == initialsGivenName.length){ |
|
1379 |
for (int i =0; i< fullGivenName.length ; i++){ |
|
1380 |
if (fullGivenName[i].length() == 0 //comma ending not allowed |
|
1381 |
|| initialsGivenName[i].length() != 2 //only K. or similar allowed |
|
1382 |
|| fullGivenName[i].length() < initialsGivenName[i].length() //fullFirstName must be longer than abbrev Name |
|
1383 |
|| !initialsGivenName[i].endsWith(".") //initials must end with "." |
|
1384 |
|| !fullGivenName[i].startsWith(initialsGivenName[i].replace(".", ""))){ //start with same letter |
|
1385 |
if (fullGivenName[i].matches("(von|van|de|zu)") && fullGivenName[i].equals(initialsGivenName[i])){ |
|
1386 |
continue; |
|
1387 |
}else{ |
|
1388 |
return false; |
|
1389 |
} |
|
1390 |
} |
|
1391 |
} |
|
1392 |
return true; |
|
1393 |
}else{ |
|
1394 |
return false; |
|
1395 |
} |
|
1396 |
} |
|
1397 |
|
|
1398 |
/** |
|
1118 | 1399 |
* @param lowerCase |
1119 | 1400 |
* @param config |
1120 | 1401 |
* @return |
Also available in: Unified diff
ref #7801 and ref #3787 deduplicate reference.authorstring and reference itself