Revision f4b9ac06
Added by Andreas Müller over 5 years ago
app-import/src/main/java/eu/etaxonomy/cdm/io/berlinModel/in/BerlinModelReferenceImport.java | ||
---|---|---|
57 | 57 |
import eu.etaxonomy.cdm.io.common.mapping.berlinModel.CdmOneToManyMapper; |
58 | 58 |
import eu.etaxonomy.cdm.io.common.mapping.berlinModel.CdmStringMapper; |
59 | 59 |
import eu.etaxonomy.cdm.io.common.mapping.berlinModel.CdmUriMapper; |
60 |
import eu.etaxonomy.cdm.io.common.utils.ImportDeduplicationHelper; |
|
61 |
import eu.etaxonomy.cdm.model.agent.Person; |
|
60 | 62 |
import eu.etaxonomy.cdm.model.agent.Team; |
61 | 63 |
import eu.etaxonomy.cdm.model.agent.TeamOrPersonBase; |
62 | 64 |
import eu.etaxonomy.cdm.model.common.CdmBase; |
... | ... | |
69 | 71 |
import eu.etaxonomy.cdm.model.reference.IPrintSeries; |
70 | 72 |
import eu.etaxonomy.cdm.model.reference.Reference; |
71 | 73 |
import eu.etaxonomy.cdm.model.reference.ReferenceFactory; |
74 |
import eu.etaxonomy.cdm.strategy.cache.agent.TeamDefaultCacheStrategy; |
|
72 | 75 |
|
73 | 76 |
/** |
74 | 77 |
* @author a.mueller |
... | ... | |
89 | 92 |
public static final UUID DATE_STRING_UUID = UUID.fromString("e4130eae-606e-4b0c-be4f-e93dc161be7d"); |
90 | 93 |
public static final UUID IS_PAPER_UUID = UUID.fromString("8a326129-d0d0-4f9d-bbdf-8d86b037c65e"); |
91 | 94 |
|
95 |
private static ImportDeduplicationHelper<BerlinModelImportState> deduplicationHelper; |
|
92 | 96 |
|
93 | 97 |
private final int modCount = 1000; |
94 | 98 |
private static final String pluralString = "references"; |
... | ... | |
179 | 183 |
@Override |
180 | 184 |
protected void doInvoke(BerlinModelImportState state){ |
181 | 185 |
logger.info("start make " + getPluralString() + " ..."); |
186 |
deduplicationHelper = ImportDeduplicationHelper.NewInstance(this, state); |
|
182 | 187 |
|
183 | 188 |
boolean success = true; |
184 | 189 |
initializeMappers(state); |
... | ... | |
251 | 256 |
if (! success){ |
252 | 257 |
state.setUnsuccessfull(); |
253 | 258 |
} |
259 |
deduplicationHelper = null; |
|
254 | 260 |
return; |
255 | 261 |
} |
256 | 262 |
|
... | ... | |
399 | 405 |
|
400 | 406 |
//team map |
401 | 407 |
nameSpace = BerlinModelAuthorTeamImport.NAMESPACE; |
402 |
cdmClass = Team.class; |
|
408 |
cdmClass = TeamOrPersonBase.class;
|
|
403 | 409 |
idSet = teamIdSet; |
404 | 410 |
@SuppressWarnings("unchecked") |
405 | 411 |
Map<String, Team> teamMap = (Map<String, Team>)getCommonService().getSourcedObjectsByIdInSource(cdmClass, idSet, nameSpace); |
... | ... | |
407 | 413 |
|
408 | 414 |
//refAuthor map |
409 | 415 |
nameSpace = REF_AUTHOR_NAMESPACE; |
410 |
cdmClass = Team.class; |
|
416 |
cdmClass = TeamOrPersonBase.class;
|
|
411 | 417 |
idSet = teamStringSet2; |
412 | 418 |
@SuppressWarnings("unchecked") |
413 | 419 |
Map<String, Team> refAuthorMap = (Map<String, Team>)getCommonService().getSourcedObjectsByIdInSource(cdmClass, idSet, nameSpace); |
... | ... | |
550 | 556 |
String nomTitleAbbrev = rs.getString("nomTitleAbbrev"); |
551 | 557 |
boolean isPreliminary = rs.getBoolean("PreliminaryFlag"); |
552 | 558 |
String refAuthorString = rs.getString("refAuthorString"); |
553 |
Integer nomAuthorTeamFk = rs.getInt("NomAuthorTeamFk"); |
|
554 |
String strNomAuthorTeamFk = String.valueOf(nomAuthorTeamFk); |
|
555 |
TeamOrPersonBase<?> nomAuthor = teamMap.get(strNomAuthorTeamFk); |
|
559 |
Integer nomAuthorTeamFk = nullSafeInt(rs, "NomAuthorTeamFk"); |
|
560 |
|
|
561 |
TeamOrPersonBase<?> nomAuthor = null; |
|
562 |
if (nomAuthorTeamFk != null){ |
|
563 |
String strNomAuthorTeamFk = String.valueOf(nomAuthorTeamFk); |
|
564 |
nomAuthor = teamMap.get(strNomAuthorTeamFk); |
|
565 |
if (nomAuthor == null){ |
|
566 |
logger.warn("NomAuthor ("+strNomAuthorTeamFk+") not found in teamMap for " + refId); |
|
567 |
} |
|
568 |
} |
|
556 | 569 |
|
557 | 570 |
Reference sourceReference = state.getTransactionalSourceReference(); |
558 | 571 |
|
... | ... | |
571 | 584 |
} |
572 | 585 |
|
573 | 586 |
//author |
574 |
TeamOrPersonBase<?> author = getAuthorship(state, refAuthorString, nomAuthor); |
|
587 |
TeamOrPersonBase<?> author = getAuthorship(state, refAuthorString, nomAuthor, refId);
|
|
575 | 588 |
ref.setAuthorship(author); |
576 | 589 |
|
577 | 590 |
//save |
... | ... | |
899 | 912 |
} |
900 | 913 |
|
901 | 914 |
|
902 |
private static TeamOrPersonBase<?> getAuthorship(BerlinModelImportState state, String authorString, TeamOrPersonBase<?> nomAuthor){ |
|
915 |
private static TeamOrPersonBase<?> getAuthorship(BerlinModelImportState state, String refAuthorString, |
|
916 |
TeamOrPersonBase<?> nomAuthor, Integer refId){ |
|
903 | 917 |
|
904 | 918 |
TeamOrPersonBase<?> result; |
905 | 919 |
if (nomAuthor != null){ |
906 | 920 |
result = nomAuthor; |
907 |
} else if (StringUtils.isNotBlank(authorString)){ |
|
921 |
if (isNotBlank(refAuthorString) && !nomAuthor.getTitleCache().equals(refAuthorString)){ |
|
922 |
boolean isSimilar = handleSimilarAuthors(state, refAuthorString, nomAuthor); |
|
923 |
if (! isSimilar){ |
|
924 |
logger.warn("refAuthorString differs from nomAuthor.titleCache: " + refAuthorString |
|
925 |
+ " <-> " + nomAuthor.getTitleCache() + "; RefId: " + refId); |
|
926 |
} |
|
927 |
} |
|
928 |
|
|
929 |
} else if (isNotBlank(refAuthorString)){ |
|
930 |
refAuthorString = refAuthorString.trim(); |
|
908 | 931 |
//TODO match with existing Persons/Teams |
909 |
Team team = state.getRelatedObject(REF_AUTHOR_NAMESPACE, authorString, Team.class); |
|
910 |
if (team == null){ |
|
911 |
team = Team.NewInstance(); |
|
912 |
team.setNomenclaturalTitle(authorString); |
|
913 |
team.setTitleCache(authorString, true); |
|
914 |
state.addRelatedObject(REF_AUTHOR_NAMESPACE, authorString, team); |
|
915 |
team.addImportSource(authorString, REF_AUTHOR_NAMESPACE, state.getTransactionalSourceReference(), null); |
|
932 |
TeamOrPersonBase<?> author = state.getRelatedObject(REF_AUTHOR_NAMESPACE, refAuthorString, TeamOrPersonBase.class); |
|
933 |
if (author == null){ |
|
934 |
if (!BerlinModelAuthorTeamImport.hasTeamSeparator(refAuthorString)){ |
|
935 |
author = makePerson(refAuthorString, refId); |
|
936 |
}else{ |
|
937 |
author = makeTeam(state, refAuthorString, refId); |
|
938 |
} |
|
939 |
state.addRelatedObject(REF_AUTHOR_NAMESPACE, refAuthorString, author); |
|
940 |
author.addImportSource(refAuthorString, REF_AUTHOR_NAMESPACE, state.getTransactionalSourceReference(), null); |
|
916 | 941 |
} |
917 |
result = team;
|
|
942 |
result = author;
|
|
918 | 943 |
}else{ |
919 | 944 |
result = null; |
920 | 945 |
} |
... | ... | |
922 | 947 |
return result; |
923 | 948 |
} |
924 | 949 |
|
950 |
/** |
|
951 |
* @param state |
|
952 |
* @param refAuthorString |
|
953 |
* @param refId |
|
954 |
* @return |
|
955 |
*/ |
|
956 |
private static Team makeTeam(BerlinModelImportState state, String refAuthorString, Integer refId) { |
|
957 |
Team team = Team.NewInstance(); |
|
958 |
if (containsEdOrColon(refAuthorString)){ |
|
959 |
team.setTitleCache(refAuthorString, true); |
|
960 |
}else{ |
|
961 |
String[] fullTeams = BerlinModelAuthorTeamImport.splitTeam(refAuthorString); |
|
962 |
boolean lastWasInitials = false; |
|
963 |
for (int i = 0; i< fullTeams.length ;i++){ |
|
964 |
if (lastWasInitials){ |
|
965 |
lastWasInitials = false; |
|
966 |
continue; |
|
967 |
} |
|
968 |
String fullTeam = fullTeams[i].trim(); |
|
969 |
String initials = null; |
|
970 |
if (fullTeams.length > i+1){ |
|
971 |
String nextSplit = fullTeams[i+1].trim(); |
|
972 |
if (isInitial(nextSplit)){ |
|
973 |
lastWasInitials = true; |
|
974 |
initials = nextSplit; |
|
975 |
} |
|
976 |
} |
|
977 |
Person member = makePerson(fullTeam, refId); |
|
978 |
|
|
979 |
if (initials != null && !member.isProtectedTitleCache()){ |
|
980 |
member.setInitials(initials); |
|
981 |
}else if (initials != null){ |
|
982 |
member.setTitleCache(member.getTitleCache() + ", " + initials, true); |
|
983 |
} |
|
984 |
|
|
985 |
if (i == fullTeams.length -1 && BerlinModelAuthorTeamImport.isEtAl(member)){ |
|
986 |
team.setHasMoreMembers(true); |
|
987 |
}else{ |
|
988 |
Person dedupMember = deduplicatePerson(state, member); |
|
989 |
if (dedupMember != member){ |
|
990 |
logger.debug("Member deduplicated: " + refId); |
|
991 |
}else{ |
|
992 |
member.addImportSource(refAuthorString, REF_AUTHOR_NAMESPACE, state.getTransactionalSourceReference(), null); |
|
993 |
} |
|
994 |
//TODO add idInBM |
|
995 |
team.addTeamMember(dedupMember); |
|
996 |
} |
|
997 |
} |
|
998 |
} |
|
925 | 999 |
|
926 |
/** |
|
1000 |
TeamDefaultCacheStrategy formatter = (TeamDefaultCacheStrategy) team.getCacheStrategy(); |
|
1001 |
formatter.setEtAlPosition(100); |
|
1002 |
if (formatter.getTitleCache(team).equals(refAuthorString)){ |
|
1003 |
team.setProtectedTitleCache(false); |
|
1004 |
}else if(formatter.getTitleCache(team).replace(" & ", ", ").equals(refAuthorString.replace(" & ", ", ").replace(" ,", ","))){ |
|
1005 |
//also accept teams with ', ' as final member separator as not protected |
|
1006 |
team.setProtectedTitleCache(false); |
|
1007 |
}else if(formatter.getFullTitle(team).replace(" & ", ", ").equals(refAuthorString.replace(" & ", ", "))){ |
|
1008 |
//.. or teams with initials first |
|
1009 |
team.setProtectedTitleCache(false); |
|
1010 |
}else if (containsEdOrColon(refAuthorString)){ |
|
1011 |
//nothing to do, it is expected to be protected |
|
1012 |
}else{ |
|
1013 |
team.setTitleCache(refAuthorString, true); |
|
1014 |
logger.warn("Creation of titleCache for team with members did not (fully) work: " + refAuthorString + " <-> " + formatter.getTitleCache(team)+ " : " + refId); |
|
1015 |
} |
|
1016 |
return team; |
|
1017 |
} |
|
1018 |
|
|
1019 |
/** |
|
1020 |
* @param refAuthorString |
|
1021 |
* @return |
|
1022 |
*/ |
|
1023 |
private static boolean containsEdOrColon(String str) { |
|
1024 |
if (str.contains(" ed.") || str.contains(" Ed.") || str.contains("(ed.") |
|
1025 |
|| str.contains("[ed.") || str.contains("(Eds)") || str.contains("(Eds.)") || |
|
1026 |
str.contains("(eds.)") || str.contains(":")|| str.contains(";")){ |
|
1027 |
return true; |
|
1028 |
}else{ |
|
1029 |
return false; |
|
1030 |
} |
|
1031 |
} |
|
1032 |
|
|
1033 |
/** |
|
1034 |
* @param nextSplit |
|
1035 |
* @return |
|
1036 |
*/ |
|
1037 |
private static boolean isInitial(String str) { |
|
1038 |
if (str == null){ |
|
1039 |
return false; |
|
1040 |
} |
|
1041 |
boolean matches = str.trim().matches("(\\p{javaUpperCase}|Yu|Th|Ch|Lj|Sz|Dz|Sh)\\.?(\\s*[-\\s]\\s*(\\p{javaUpperCase}|Yu)\\.?)*(\\s+(van|von))?"); |
|
1042 |
return matches; |
|
1043 |
} |
|
1044 |
|
|
1045 |
private static Person deduplicatePerson(BerlinModelImportState state, Person person) { |
|
1046 |
Person result = deduplicationHelper.getExistingAuthor(state, person); |
|
1047 |
return result; |
|
1048 |
} |
|
1049 |
|
|
1050 |
private static Person makePerson(String full, Integer refId) { |
|
1051 |
Person person = Person.NewInstance(); |
|
1052 |
person.setTitleCache(full, true); |
|
1053 |
if (!full.matches(".*[\\s\\.].*")){ |
|
1054 |
person.setFamilyName(full); |
|
1055 |
person.setProtectedTitleCache(false); |
|
1056 |
}else if (full.matches("(\\p{javaUpperCase}|Kh)\\.(\\s\\p{javaUpperCase}\\.)*\\s\\p{javaUpperCase}\\p{javaLowerCase}{2,}")){ |
|
1057 |
String[] splits = full.split("\\s"); |
|
1058 |
person.setFamilyName(splits[splits.length-1]); |
|
1059 |
String initials = splits[0]; |
|
1060 |
for (int i = 1; i < splits.length -1; i++ ){ |
|
1061 |
initials += " " + splits[i]; |
|
1062 |
} |
|
1063 |
person.setInitials(initials); |
|
1064 |
person.setProtectedTitleCache(false); |
|
1065 |
} |
|
1066 |
if ((full.length() <= 2 && !full.matches("(Li|Bo|Em|Ay|Ma)")) || (full.length() == 3 && full.endsWith(".") && !full.equals("al.")) ){ |
|
1067 |
// if (!full.matches("((L|Sm|DC|al|Sw|Qz|Fr|Ib)\\.|Hu|Ma|Hy|Wu)")){ |
|
1068 |
logger.warn("Unexpected short nom author name part: " + full + "; " + refId); |
|
1069 |
// } |
|
1070 |
} |
|
1071 |
|
|
1072 |
return person; |
|
1073 |
} |
|
1074 |
|
|
1075 |
/** |
|
1076 |
* @param state |
|
1077 |
* @param refAuthorString |
|
1078 |
* @param nomAuthor |
|
1079 |
* @return |
|
1080 |
*/ |
|
1081 |
private static boolean handleSimilarAuthors(BerlinModelImportState state, String refAuthorString, |
|
1082 |
TeamOrPersonBase<?> nomAuthor) { |
|
1083 |
if (refAuthorString.equals(nomAuthor.getNomenclaturalTitle())){ |
|
1084 |
//nomTitle equal |
|
1085 |
return true; |
|
1086 |
}else{ |
|
1087 |
String nomTitle = nomAuthor.getTitleCache(); |
|
1088 |
if (refAuthorString.replace(" & ", ", ").equals(nomTitle.replace(" & ", ", "))){ |
|
1089 |
//nomTitle equal except for "&" |
|
1090 |
return true; |
|
1091 |
} |
|
1092 |
|
|
1093 |
if (refAuthorString.replace(" & ", ", ").equals(nomAuthor.getFullTitle().replace(" & ", ", "))){ |
|
1094 |
return true; |
|
1095 |
} |
|
1096 |
|
|
1097 |
if (refAuthorString.contains(",") && !nomTitle.contains(",") && nomAuthor.isInstanceOf(Person.class)){ |
|
1098 |
String[] splits = refAuthorString.split(","); |
|
1099 |
Person person = CdmBase.deproxy(nomAuthor, Person.class); |
|
1100 |
if (splits.length == 2){ |
|
1101 |
String newMatch = splits[1].trim() + " " + splits[0].trim(); |
|
1102 |
if (newMatch.equals(nomTitle)){ |
|
1103 |
if (isBlank(person.getFamilyName())){ |
|
1104 |
person.setFamilyName(splits[0].trim()); |
|
1105 |
} |
|
1106 |
if (isBlank(person.getInitials())){ |
|
1107 |
person.setInitials(splits[1].trim()); |
|
1108 |
} |
|
1109 |
return true; |
|
1110 |
} |
|
1111 |
} |
|
1112 |
} |
|
1113 |
} |
|
1114 |
return false; |
|
1115 |
} |
|
1116 |
|
|
1117 |
/** |
|
927 | 1118 |
* @param lowerCase |
928 | 1119 |
* @param config |
929 | 1120 |
* @return |
Also available in: Unified diff
ref #7799 implement author parsing for AuthorTeam and Reference.RefAuthorString