14 |
14 |
import eu.etaxonomy.cdm.io.mexico.SimpleExcelTaxonImportState;
|
15 |
15 |
import eu.etaxonomy.cdm.model.common.*;
|
16 |
16 |
import eu.etaxonomy.cdm.model.name.*;
|
17 |
|
import eu.etaxonomy.cdm.model.reference.INomenclaturalReference;
|
18 |
17 |
import eu.etaxonomy.cdm.model.reference.Reference;
|
19 |
18 |
import eu.etaxonomy.cdm.model.taxon.*;
|
20 |
|
import eu.etaxonomy.cdm.strategy.exceptions.StringNotParsableException;
|
21 |
|
import eu.etaxonomy.cdm.strategy.parser.INonViralNameParser;
|
22 |
19 |
import eu.etaxonomy.cdm.strategy.parser.NonViralNameParserImpl;
|
|
20 |
import org.apache.commons.lang.StringEscapeUtils;
|
23 |
21 |
import org.apache.commons.lang.StringUtils;
|
|
22 |
import org.apache.log4j.Level;
|
24 |
23 |
import org.apache.log4j.Logger;
|
25 |
24 |
import org.springframework.stereotype.Component;
|
26 |
25 |
|
27 |
26 |
import java.util.*;
|
|
27 |
import java.util.regex.Matcher;
|
|
28 |
import java.util.regex.Pattern;
|
28 |
29 |
|
29 |
30 |
/**
|
30 |
31 |
* @author a.mueller
|
31 |
32 |
* @created 05.01.2016
|
32 |
33 |
*/
|
33 |
34 |
|
34 |
|
@Component
|
|
35 |
@Component("iAPTExcelImport")
|
35 |
36 |
public class IAPTExcelImport<CONFIG extends IAPTImportConfigurator> extends SimpleExcelTaxonImport<CONFIG> {
|
36 |
37 |
private static final long serialVersionUID = -747486709409732371L;
|
37 |
38 |
private static final Logger logger = Logger.getLogger(IAPTExcelImport.class);
|
|
39 |
public static final String ANNOTATION_MARKER_STRING = "[*]";
|
38 |
40 |
|
39 |
41 |
|
40 |
42 |
private static UUID ROOT_UUID = UUID.fromString("4137fd2a-20f6-4e70-80b9-f296daf51d82");
|
41 |
43 |
|
42 |
|
private static INonViralNameParser<?> nameParser = NonViralNameParserImpl.NewInstance();
|
|
44 |
private static NonViralNameParserImpl nameParser = NonViralNameParserImpl.NewInstance();
|
43 |
45 |
|
44 |
46 |
private final static String REGISTRATIONNO_PK= "RegistrationNo_Pk";
|
45 |
47 |
private final static String HIGHERTAXON= "HigherTaxon";
|
... | ... | |
61 |
63 |
private static List<String> expectedKeys= Arrays.asList(new String[]{
|
62 |
64 |
REGISTRATIONNO_PK, HIGHERTAXON, FULLNAME, AUTHORSSPELLING, LITSTRING, REGISTRATION, TYPE, CAVEATS, FULLBASIONYM, FULLSYNSUBST, NOTESTXT, REGDATE, NAMESTRING, BASIONYMSTRING, SYNSUBSTSTR, AUTHORSTRING});
|
63 |
65 |
|
|
66 |
private static final Pattern nomRefTokenizeP = Pattern.compile("^(.*):\\s([^\\.:]+)\\.(.*)$");
|
|
67 |
private static final Pattern nomRefPubYearExtractP = Pattern.compile("(.*?)(1[7,8,9][0-9]{2}).*$|^.*?[0-9]{1,2}([\\./])[0-1]?[0-9]\\3([0-9]{2})\\.$"); // 1700 - 1999
|
64 |
68 |
|
65 |
69 |
private Taxon makeTaxon(HashMap<String, String> record, SimpleExcelTaxonImportState<CONFIG> state,
|
66 |
70 |
TaxonNode higherTaxonNode, boolean isSynonym) {
|
67 |
71 |
|
68 |
72 |
String line = state.getCurrentLine() + ": ";
|
69 |
73 |
|
70 |
|
String fullNameStr = getValue(record, FULLNAME);
|
71 |
|
String nameStr = getValue(record, NAMESTRING);
|
72 |
|
String authorStr = getValue(record, AUTHORSTRING);
|
|
74 |
String titleCacheStr = getValue(record, FULLNAME, true);
|
|
75 |
String nameStr = getValue(record, NAMESTRING, true);
|
|
76 |
String authorStr = getValue(record, AUTHORSTRING, true);
|
|
77 |
String nomRefStr = getValue(record, LITSTRING, true);
|
|
78 |
|
|
79 |
String nomRefTitle = null;
|
|
80 |
String nomRefDetail = null;
|
|
81 |
String nomRefPupDate = null;
|
|
82 |
String nomRefPupYear = null;
|
|
83 |
|
|
84 |
// preprocess nomRef: separate citation, reference detail, publishing date
|
|
85 |
if(!StringUtils.isEmpty(nomRefStr)){
|
|
86 |
nomRefStr = nomRefStr.trim();
|
|
87 |
Matcher m = nomRefTokenizeP.matcher(nomRefStr);
|
|
88 |
if(m.matches()){
|
|
89 |
nomRefTitle = m.group(1);
|
|
90 |
nomRefDetail = m.group(2);
|
|
91 |
nomRefPupDate = m.group(3);
|
|
92 |
|
|
93 |
// nomRefDetail.replaceAll("[\\:\\.\\s]", ""); // TODO integrate into nomRefTokenizeP
|
|
94 |
Matcher m2 = nomRefPubYearExtractP.matcher(nomRefPupDate);
|
|
95 |
if(m2.matches()){
|
|
96 |
nomRefPupYear = m2.group(2);
|
|
97 |
if(nomRefPupYear.length() == 2 ){
|
|
98 |
// it is an abbreviated year from the 19** years
|
|
99 |
nomRefPupYear = "19" + nomRefPupYear;
|
|
100 |
}
|
|
101 |
nomRefTitle = nomRefTitle + ": " + nomRefDetail + ". " + nomRefPupYear + ".";
|
|
102 |
} else {
|
|
103 |
logger.warn("Pub year not found in " + nomRefStr );
|
|
104 |
}
|
|
105 |
|
|
106 |
} else {
|
|
107 |
nomRefTitle = nomRefStr;
|
|
108 |
}
|
|
109 |
}
|
|
110 |
|
|
111 |
|
|
112 |
BotanicalName taxonName;
|
|
113 |
Map<String, AnnotationType> nameAnnotations = new HashMap<>();
|
73 |
114 |
|
74 |
|
String sourceReference = getValue(record, LITSTRING);
|
|
115 |
if(titleCacheStr.endsWith(ANNOTATION_MARKER_STRING) && authorStr.endsWith(ANNOTATION_MARKER_STRING)){
|
|
116 |
nameAnnotations.put("Author abbreviation not checked.", AnnotationType.EDITORIAL());
|
|
117 |
titleCacheStr = titleCacheStr.replace(ANNOTATION_MARKER_STRING, "").trim();
|
|
118 |
authorStr = authorStr.replace(ANNOTATION_MARKER_STRING, "").trim();
|
|
119 |
}
|
|
120 |
|
|
121 |
if(!StringUtils.isEmpty(nomRefTitle)){
|
|
122 |
String referenceSeparator = nomRefTitle.startsWith("in ") ? " " : ", ";
|
|
123 |
String taxonFullNameStr = titleCacheStr + referenceSeparator + nomRefTitle;
|
|
124 |
logger.debug(":::::" + taxonFullNameStr);
|
|
125 |
taxonName = (BotanicalName) nameParser.parseReferencedName(taxonFullNameStr, NomenclaturalCode.ICNAFP, null);
|
|
126 |
} else {
|
|
127 |
taxonName = (BotanicalName) nameParser.parseFullName(titleCacheStr, NomenclaturalCode.ICNAFP, null);
|
|
128 |
}
|
75 |
129 |
|
76 |
|
BotanicalName taxonName = (BotanicalName) nameParser.parseFullName(fullNameStr, NomenclaturalCode.ICNAFP, null);
|
77 |
130 |
if (taxonName.isProtectedTitleCache()) {
|
78 |
|
logger.warn(line + "Name could not be parsed: " + fullNameStr);
|
|
131 |
logger.warn(line + "Name could not be parsed: " + titleCacheStr);
|
79 |
132 |
} else {
|
|
133 |
|
|
134 |
boolean doRestoreTitleCacheStr = false;
|
|
135 |
// Check titleCache
|
|
136 |
String generatedTitleCache = taxonName.getTitleCache();
|
|
137 |
if (!generatedTitleCache.trim().equals(titleCacheStr)) {
|
|
138 |
logger.warn(line + "The generated titleCache differs from the imported string : " + generatedTitleCache + " <> " + titleCacheStr + " will restore original titleCacheStr");
|
|
139 |
doRestoreTitleCacheStr = true;
|
|
140 |
}
|
80 |
141 |
// Check Name
|
81 |
|
if (!taxonName.getNameCache().equals(nameStr)) {
|
|
142 |
if (!taxonName.getNameCache().trim().equals(nameStr)) {
|
82 |
143 |
logger.warn(line + "parsed nameCache differs from " + NAMESTRING + " : " + taxonName.getNameCache() + " <> " + nameStr);
|
83 |
144 |
}
|
84 |
|
// Check Author
|
85 |
|
INomenclaturalReference nomRef = taxonName.getNomenclaturalReference();
|
86 |
|
if (!nomRef.getAuthorship().getTitleCache().equals(authorStr)) {
|
87 |
|
logger.warn(line + "parsed nomRef.authorship differs from " + AUTHORSTRING + " : " + nomRef.getAuthorship().getTitleCache() + " <> " + authorStr);
|
88 |
|
// preserve current titleCache
|
89 |
|
taxonName.setProtectedTitleCache(true);
|
90 |
|
try {
|
91 |
|
nameParser.parseAuthors(taxonName, authorStr);
|
92 |
|
} catch (StringNotParsableException e) {
|
93 |
|
logger.error(" " + authorStr + " can not be parsed");
|
94 |
|
}
|
|
145 |
|
|
146 |
// Author
|
|
147 |
//nameParser.handleAuthors(taxonName, titleCacheStr, authorStr);
|
|
148 |
//if (!titleCacheStr.equals(taxonName.getTitleCache())) {
|
|
149 |
// logger.warn(line + "titleCache has changed after setting authors, will restore original titleCacheStr");
|
|
150 |
// doRestoreTitleCacheStr = true;
|
|
151 |
//}
|
|
152 |
|
|
153 |
if(doRestoreTitleCacheStr){
|
|
154 |
taxonName.setTitleCache(titleCacheStr, true);
|
95 |
155 |
}
|
96 |
156 |
|
97 |
157 |
// deduplicate
|
98 |
158 |
replaceAuthorNamesAndNomRef(state, taxonName);
|
|
159 |
|
|
160 |
// Annotations
|
|
161 |
if(!nameAnnotations.isEmpty()){
|
|
162 |
for(String text : nameAnnotations.keySet()){
|
|
163 |
taxonName.addAnnotation(Annotation.NewInstance(text, nameAnnotations.get(text), Language.DEFAULT()));
|
|
164 |
}
|
|
165 |
getNameService().save(taxonName);
|
|
166 |
}
|
99 |
167 |
}
|
100 |
168 |
|
101 |
169 |
Reference sec = state.getConfig().getSecReference();
|
... | ... | |
150 |
218 |
/**
|
151 |
219 |
* @param record
|
152 |
220 |
* @param originalKey
|
|
221 |
* @param doUnescapeHtmlEntities if true, HTML entities in the value are unescaped with StringEscapeUtils.unescapeHtml
|
153 |
222 |
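* @return the trimmed value with duplicate whitespace removed, or null if the record holds no non-blank value for the key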
|
154 |
223 |
*/
|
155 |
|
private String getValue(HashMap<String, String> record, String originalKey) {
|
|
224 |
private String getValue(HashMap<String, String> record, String originalKey, boolean doUnescapeHtmlEntities) {
|
156 |
225 |
String value = record.get(originalKey);
|
157 |
226 |
if (! StringUtils.isBlank(value)) {
|
158 |
|
if (logger.isDebugEnabled()) { logger.debug(originalKey + ": " + value); }
|
|
227 |
if (logger.isDebugEnabled()) {
|
|
228 |
logger.debug(originalKey + ": " + value);
|
|
229 |
}
|
159 |
230 |
value = CdmUtils.removeDuplicateWhitespace(value.trim()).toString();
|
160 |
|
return value;
|
|
231 |
if(doUnescapeHtmlEntities){
|
|
232 |
value = StringEscapeUtils.unescapeHtml(value);
|
|
233 |
}
|
|
234 |
return value.trim();
|
161 |
235 |
}else{
|
162 |
236 |
return null;
|
163 |
237 |
}
|
... | ... | |
174 |
248 |
boolean isSynonymOnly = false;
|
175 |
249 |
|
176 |
250 |
String line = state.getCurrentLine() + ": ";
|
|
251 |
logger.setLevel(Level.DEBUG);
|
177 |
252 |
HashMap<String, String> record = state.getOriginalRecord();
|
|
253 |
logger.debug(record.toString());
|
178 |
254 |
|
179 |
255 |
Set<String> keys = record.keySet();
|
180 |
256 |
for (String key: keys) {
|
... | ... | |
183 |
259 |
}
|
184 |
260 |
}
|
185 |
261 |
|
|
262 |
String reg_id = record.get(REGISTRATIONNO_PK);
|
186 |
263 |
//higherTaxon
|
187 |
264 |
TaxonNode higherTaxon = getHigherTaxon(record, (IAPTImportState)state);
|
188 |
265 |
|
... | ... | |
219 |
296 |
|
220 |
297 |
ITaxonTreeNode rootNode = getClassification(state);
|
221 |
298 |
for (String htn : higherTaxaNames) {
|
222 |
|
htn = htn.trim();
|
|
299 |
htn = StringUtils.capitalize(htn.trim());
|
223 |
300 |
Taxon higherTaxon = state.getHigherTaxon(htn);
|
224 |
301 |
if (higherTaxon != null){
|
225 |
302 |
higherTaxonNode = higherTaxon.getTaxonNodes().iterator().next();
|
... | ... | |
227 |
304 |
BotanicalName name = makeHigherTaxonName(state, htn);
|
228 |
305 |
Reference sec = state.getSecReference();
|
229 |
306 |
higherTaxon = Taxon.NewInstance(name, sec);
|
|
307 |
getTaxonService().save(higherTaxon);
|
230 |
308 |
higherTaxonNode = rootNode.addChildTaxon(higherTaxon, sec, null);
|
231 |
309 |
state.putHigherTaxon(htn, higherTaxon);
|
232 |
|
rootNode = higherTaxonNode;
|
|
310 |
getClassificationService().saveTreeNode(higherTaxonNode);
|
233 |
311 |
}
|
|
312 |
rootNode = higherTaxonNode;
|
234 |
313 |
}
|
235 |
314 |
return higherTaxonNode;
|
236 |
315 |
}
|
237 |
316 |
|
238 |
317 |
private BotanicalName makeHigherTaxonName(IAPTImportState state, String name) {
|
239 |
|
// Abteilung: -phyta (bei Pflanzen), -mycota (bei Pilzen)
|
240 |
|
// Unterabteilung: -phytina (bei Pflanzen), -mycotina (bei Pilzen)
|
241 |
|
// Klasse: -opsida (bei Pflanzen), -phyceae (bei Algen), -mycetes (bei Pilzen)
|
242 |
|
// Unterklasse: -idae (bei Pflanzen), -phycidae (bei Algen), -mycetidae (bei Pilzen)
|
243 |
|
// Ordnung: -ales
|
244 |
|
// Unterordnung: -ineae
|
245 |
|
// Familie: -aceae
|
246 |
|
// Unterfamilie: -oideae
|
247 |
|
// Tribus: -eae
|
248 |
|
// Subtribus: -inae
|
249 |
|
Rank rank = Rank.UNKNOWN_RANK();
|
250 |
|
if(name.matches("phyta$|mycota$")){
|
251 |
|
rank = Rank.SECTION_BOTANY();
|
252 |
|
} else if(name.matches("phytina$|mycotina$")){
|
253 |
|
rank = Rank.SUBSECTION_BOTANY();
|
254 |
|
} else if(name.matches("opsida$|phyceae$|mycetes$")){
|
255 |
|
rank = Rank.CLASS();
|
256 |
|
} else if(name.matches("idae$|phycidae$|mycetidae$")){
|
257 |
|
rank = Rank.SUBCLASS();
|
258 |
|
} else if(name.matches("ales$")){
|
259 |
|
rank = Rank.ORDER();
|
260 |
|
} else if(name.matches("ineae$")){
|
261 |
|
rank = Rank.SUBORDER();
|
262 |
|
} else if(name.matches("aceae$")){
|
263 |
|
rank = Rank.FAMILY();
|
264 |
|
} else if(name.matches("oideae$")){
|
265 |
|
rank = Rank.SUBFAMILY();
|
266 |
|
} else if(name.matches("eae$")){
|
267 |
|
rank = Rank.TRIBE();
|
268 |
|
} else if(name.matches("inae$")){
|
269 |
|
rank = Rank.SUBTRIBE();
|
270 |
|
}
|
|
318 |
|
|
319 |
Rank rank = guessRank(name);
|
271 |
320 |
|
272 |
321 |
BotanicalName taxonName = BotanicalName.NewInstance(rank);
|
273 |
322 |
taxonName.addSource(makeOriginalSource(state));
|
... | ... | |
275 |
324 |
return taxonName;
|
276 |
325 |
}
|
277 |
326 |
|
|
327 |
private Rank guessRank(String name) {
|
|
328 |
|
|
329 |
// normalize
|
|
330 |
name = name.replaceAll("\\(.*\\)", "").trim();
|
|
331 |
|
|
332 |
if(name.matches("^Plantae$|^Fungi$|^Musci$")){
|
|
333 |
return Rank.KINGDOM();
|
|
334 |
} else if(name.matches(".*incertae sedis$|^Fossil no group assigned$")){
|
|
335 |
return Rank.FAMILY();
|
|
336 |
} else if(name.matches(".*phyta$|.*mycota$")){
|
|
337 |
return Rank.SECTION_BOTANY();
|
|
338 |
} else if(name.matches(".*phytina$|.*mycotina$")){
|
|
339 |
return Rank.SUBSECTION_BOTANY();
|
|
340 |
} else if(name.matches(".*opsida$|.*phyceae$|.*mycetes$|.*ones$")){
|
|
341 |
return Rank.CLASS();
|
|
342 |
} else if(name.matches(".*idae$|.*phycidae$|.*mycetidae$")){
|
|
343 |
return Rank.SUBCLASS();
|
|
344 |
} else if(name.matches(".*ales$")){
|
|
345 |
return Rank.ORDER();
|
|
346 |
} else if(name.matches(".*ineae$")){
|
|
347 |
return Rank.SUBORDER();
|
|
348 |
} else if(name.matches(".*oideae$")){
|
|
349 |
return Rank.SUBFAMILY();
|
|
350 |
} else if(name.matches(".*eae$")){
|
|
351 |
return Rank.TRIBE();
|
|
352 |
} else if(name.matches(".*inae$")){
|
|
353 |
return Rank.SUBTRIBE();
|
|
354 |
} else if(name.matches(".*ae$")){
|
|
355 |
return Rank.FAMILY();
|
|
356 |
}
|
|
357 |
return Rank.UNKNOWN_RANK();
|
|
358 |
}
|
|
359 |
|
278 |
360 |
|
279 |
361 |
/**
|
280 |
362 |
* @param state
|
#6026 import of taxa and names working