minor
[cdmlib.git] / cdmlib-io / src / main / java / eu / etaxonomy / cdm / io / markup / MarkupKeyImport.java
1 /**
2 * Copyright (C) 2009 EDIT
3 * European Distributed Institute of Taxonomy
4 * http://www.e-taxonomy.eu
5 *
6 * The contents of this file are subject to the Mozilla Public License Version 1.1
7 * See LICENSE.TXT at the top of this package for the full license terms.
8 */
9
10 package eu.etaxonomy.cdm.io.markup;
11
12 import java.util.ArrayList;
13 import java.util.List;
14 import java.util.Map;
15 import java.util.Set;
16
17 import javax.xml.stream.Location;
18 import javax.xml.stream.XMLEventReader;
19 import javax.xml.stream.XMLStreamException;
20 import javax.xml.stream.events.Attribute;
21 import javax.xml.stream.events.XMLEvent;
22
23 import org.apache.log4j.Logger;
24
25 import eu.etaxonomy.cdm.common.CdmUtils;
26 import eu.etaxonomy.cdm.io.markup.UnmatchedLeads.UnmatchedLeadsKey;
27 import eu.etaxonomy.cdm.model.common.CdmBase;
28 import eu.etaxonomy.cdm.model.common.Language;
29 import eu.etaxonomy.cdm.model.description.KeyStatement;
30 import eu.etaxonomy.cdm.model.description.PolytomousKey;
31 import eu.etaxonomy.cdm.model.description.PolytomousKeyNode;
32 import eu.etaxonomy.cdm.model.name.NonViralName;
33 import eu.etaxonomy.cdm.model.name.Rank;
34 import eu.etaxonomy.cdm.model.taxon.Taxon;
35 import eu.etaxonomy.cdm.strategy.exceptions.UnknownCdmTypeException;
36
37 /**
38 * @author a.mueller
39 * @created 26.04.2013
40 */
41 public class MarkupKeyImport extends MarkupImportBase {
42 @SuppressWarnings("unused")
43 private static final Logger logger = Logger.getLogger(MarkupKeyImport.class);
44
45
/**
 * Creates the key import handler for the given document import.
 *
 * @param docImport the document import that is handed on to the {@link MarkupImportBase} constructor
 */
public MarkupKeyImport(final MarkupDocumentImport docImport) {
    super(docImport);
}
49
50 public void handleKey(MarkupImportState state, XMLEventReader reader, XMLEvent parentEvent) throws XMLStreamException {
51 // attributes
52 Map<String, Attribute> attributes = getAttributes(parentEvent);
53 String isSpotcharacters = getAndRemoveAttributeValue(attributes, IS_SPOTCHARACTERS);
54 if (isNotBlank(isSpotcharacters) ) {
55 //TODO isSpotcharacters
56 String message = "Attribute isSpotcharacters not yet implemented for <key>";
57 fireWarningEvent(message, parentEvent, 4);
58 }
59 boolean onlyNumberedTaxaExist = checkAndRemoveAttributeValue(attributes, ONLY_NUMBERED_TAXA_EXIST, "true");
60 state.setOnlyNumberedTaxaExist(onlyNumberedTaxaExist);
61
62 PolytomousKey key = PolytomousKey.NewInstance();
63 key.addTaxonomicScope(state.getCurrentTaxon());
64 state.setCurrentKey(key);
65
66 boolean isFirstCouplet = true;
67 while (reader.hasNext()) {
68 XMLEvent next = readNoWhitespace(reader);
69 if (isMyEndingElement(next, parentEvent)) {
70 save(key, state);
71 //reset state
72 state.setCurrentKey(null);
73 state.setOnlyNumberedTaxaExist(false);
74 return;
75 } else if (isEndingElement(next, KEYNOTES)){
76 popUnimplemented(next.asEndElement());
77 } else if (isStartingElement(next, KEY_TITLE)) {
78 handleKeyTitle(state, reader, next);
79 } else if (isStartingElement(next, KEYNOTES)) {
80 //TODO
81 handleNotYetImplementedElement(next);
82 } else if (isStartingElement(next, COUPLET)) {
83 PolytomousKeyNode node = null;
84 if (isFirstCouplet){
85 node = key.getRoot();
86 isFirstCouplet = false;
87 }
88 handleCouplet(state, reader, next, node);
89 } else {
90 handleUnexpectedElement(next);
91 }
92 }
93 throw new IllegalStateException("<key> has no closing tag");
94 }
95
96
97 /**
98 * @param state
99 * @param reader
100 * @param key
101 * @param next
102 * @throws XMLStreamException
103 */
104 private void handleKeyTitle(MarkupImportState state, XMLEventReader reader, XMLEvent parentEvent) throws XMLStreamException {
105 PolytomousKey key = state.getCurrentKey();
106 String keyTitle = getCData(state, reader, parentEvent);
107 String standardTitlesEngl = "(?i)(Key\\sto\\sthe\\s(genera|species|varieties|forms))";
108 String standardTitlesFrench = "(?i)(Cl\u00e9\\sdes\\s(genres|esp\u00e8ces))";
109 String standardTitles = standardTitlesEngl;
110 if (state.getDefaultLanguage() != null && state.getDefaultLanguage().equals(Language.FRENCH())){
111 standardTitles = standardTitlesFrench;
112 }
113
114 if (isNotBlank(keyTitle) ){
115 if (!state.getConfig().isReplaceStandardKeyTitles() || ! keyTitle.matches(standardTitles)){
116 key.setTitleCache(keyTitle, true);
117 }
118 }
119 }
120
121
122 private void handleCouplet(MarkupImportState state, XMLEventReader reader, XMLEvent parentEvent, PolytomousKeyNode parentNode) throws XMLStreamException {
123 String num = getOnlyAttribute(parentEvent, NUM, true);
124 List<PolytomousKeyNode> childList = new ArrayList<PolytomousKeyNode>();
125
126 while (reader.hasNext()) {
127 XMLEvent next = readNoWhitespace(reader);
128 if (isMyEndingElement(next, parentEvent)) {
129 completeCouplet(state, parentEvent, parentNode, num, childList);
130 return;
131 } else if (next.isCharacters()){
132 handleNotYetImplementedCharacters(next);
133 //work in progress from pesiimport2, not sure if this works
134 // String mainQuestion = next.asCharacters().getData();
135 // mainQuestion = mainQuestion.replaceAll("\\s+", " ").trim();
136 // KeyStatement question = KeyStatement.NewInstance(mainQuestion);
137 // if (parentNode != null){ parentNode.setStatement(question);} //work in progress
138 } else if (isStartingElement(next, QUESTION)) {
139 handleQuestion(state, reader, next, childList);
140 } else if (isStartingElement(next, KEYNOTES)) {
141 //TODO
142 handleNotYetImplementedElement(next);
143 } else if (isEndingElement(next, KEYNOTES)) {
144 //TODO
145 popUnimplemented(next.asEndElement());
146 } else {
147 handleUnexpectedElement(next);
148 }
149 }
150 throw new IllegalStateException("<couplet> has no closing tag");
151 }
152
153
154 /**
155 * @param state
156 * @param parentEvent
157 * @param parentNode
158 * @param num
159 * @param childList
160 */
161 private void completeCouplet(MarkupImportState state, XMLEvent parentEvent,
162 PolytomousKeyNode parentNode, String num, List<PolytomousKeyNode> childList) {
163 if (parentNode != null){
164 for (PolytomousKeyNode childNode : childList){
165 parentNode.addChild(childNode);
166 //just to be on the save side
167 parentNode.refreshNodeNumbering();
168 }
169 }else if (isNotBlank(num)){
170 UnmatchedLeadsKey unmatchedKey = UnmatchedLeadsKey.NewInstance(state.getCurrentKey(), num);
171 Set<PolytomousKeyNode> nodes = state.getUnmatchedLeads().getNodes(unmatchedKey);
172 for(PolytomousKeyNode nodeToMatch: nodes){
173 for (PolytomousKeyNode childNode : childList){
174 nodeToMatch.addChild(childNode);
175 //just to be on the save side
176 nodeToMatch.refreshNodeNumbering();
177 }
178 state.getUnmatchedLeads().removeNode(unmatchedKey, nodeToMatch);
179 }
180 }else{
181 String message = "Parent num could not be matched. Please check if num (%s) is correct";
182 message = String.format(message, num);
183 fireWarningEvent(message, parentEvent, 6);
184 }
185 }
186
187 private void handleQuestion(MarkupImportState state, XMLEventReader reader, XMLEvent parentEvent, List<PolytomousKeyNode> nodesList) throws XMLStreamException {
188 // attributes
189 Map<String, Attribute> attributes = getAttributes(parentEvent);
190 //TODO needed only for data lineage
191 String questionNum = getAndRemoveRequiredAttributeValue(parentEvent, attributes, NUM);
192
193 PolytomousKeyNode myNode = PolytomousKeyNode.NewInstance();
194 myNode.setKey(state.getCurrentKey()); //to avoid NPE while computing num in PolytomousKeyNode in case this node is not matched correctly with a parent
195 nodesList.add(myNode);
196
197 while (reader.hasNext()) {
198 XMLEvent next = readNoWhitespace(reader);
199 if (isMyEndingElement(next, parentEvent)) {
200 return;
201 } else if (isStartingElement(next, TEXT)) {
202 String text = getCData(state, reader, next);
203 KeyStatement statement = KeyStatement.NewInstance(getDefaultLanguage(state), text);
204 myNode.setStatement(statement);
205 } else if (isStartingElement(next, COUPLET)) {
206 //TODO test
207 handleCouplet(state, reader, next, myNode);
208 } else if (isStartingElement(next, TO_COUPLET)) {
209 handleToCouplet(state, reader, next, myNode);
210 } else if (isStartingElement(next, TO_TAXON)) {
211 handleToTaxon(state, reader, next, myNode);
212 } else if (isStartingElement(next, TO_KEY)) {
213 //TODO
214 handleNotYetImplementedElement(next);
215 } else if (isEndingElement(next, TO_KEY)){
216 //TODO
217 popUnimplemented(next.asEndElement());
218 } else if (isStartingElement(next, KEYNOTES)) {
219 //TODO
220 handleNotYetImplementedElement(next);
221 } else if (isEndingElement(next, KEYNOTES)){
222 //TODO
223 popUnimplemented(next.asEndElement());
224 } else {
225 handleUnexpectedElement(next);
226 }
227 }
228 throw new IllegalStateException("<question> has no closing tag");
229 }
230
231 private void handleToCouplet(MarkupImportState state, XMLEventReader reader, XMLEvent next, PolytomousKeyNode node) throws XMLStreamException {
232 String num = getOnlyAttribute(next, NUM, true);
233 String cData = getCData(state, reader, next, false);
234 if (isNotBlank(cData) && ! cData.equals(num)){
235 String message = "CData ('%s') not handled in <toCouplet>";
236 message = String.format(message, cData);
237 fireWarningEvent(message, next, 4);
238 }
239 UnmatchedLeadsKey unmatched = UnmatchedLeadsKey.NewInstance(state.getCurrentKey(), num);
240 state.getUnmatchedLeads().addKey(unmatched, node);
241 }
242
243 private void handleToTaxon(MarkupImportState state, XMLEventReader reader, XMLEvent parentEvent, PolytomousKeyNode node) throws XMLStreamException {
244 Map<String, Attribute> attributes = getAttributes(parentEvent);
245 String num = getAndRemoveAttributeValue(attributes, NUM);
246 boolean taxonNotExists = checkAndRemoveAttributeValue(attributes, EXISTS, "false");
247
248 String taxonCData = handleInnerToTaxon(state, reader, parentEvent, node).trim();
249
250 String taxonKeyStr = makeTaxonKey(taxonCData, state.getCurrentTaxon(), parentEvent.getLocation());
251 taxonNotExists = taxonNotExists || (isBlank(num) && state.isOnlyNumberedTaxaExist());
252 if (taxonNotExists){
253 NonViralName<?> name = createNameByCode(state, Rank.UNKNOWN_RANK());
254 Taxon taxon = Taxon.NewInstance(name, null);
255 taxon.getName().setTitleCache(taxonKeyStr, true);
256 node.setTaxon(taxon);
257 }else{
258 UnmatchedLeadsKey unmatched = UnmatchedLeadsKey.NewInstance(num, taxonKeyStr);
259 state.getUnmatchedLeads().addKey(unmatched, node);
260 // String message = "The following key leads are unmatched: %s";
261 // message = String.format(message, state.getUnmatchedLeads().toString());
262 // fireWarningEvent(message, parentEvent, 6);
263 }
264 return;
265 }
266
267 /**
268 * Returns the taxon text of the toTaxon element and handles all annotations as ';'-concatenated modifying text.
269 * Footnote refs are not yet handled.
270 * @param state
271 * @param reader
272 * @param parentEvent
273 * @param node
274 * @return
275 * @throws XMLStreamException
276 */
277 private String handleInnerToTaxon(MarkupImportState state, XMLEventReader reader, XMLEvent parentEvent, PolytomousKeyNode node) throws XMLStreamException {
278 String taxonText = "";
279 String modifyingText = null;
280 while (reader.hasNext()) {
281 XMLEvent next = readNoWhitespace(reader);
282 if (isMyEndingElement(next, parentEvent)) {
283 if (isNotBlank(modifyingText)){
284 node.putModifyingText(getDefaultLanguage(state), modifyingText);
285 }
286 return taxonText;
287 } else if (next.isCharacters()) {
288 taxonText += next.asCharacters().getData();
289 } else if (isStartingElement(next, ANNOTATION)) {
290 String annotation = handleSimpleAnnotation(state, reader, next);
291 modifyingText = CdmUtils.concat("; ", modifyingText, annotation);
292 } else if (isStartingElement(next, FOOTNOTE_REF)) {
293 handleNotYetImplementedElement(next);
294 } else {
295 handleUnexpectedElement(next);
296 }
297 }
298 throw new IllegalStateException("Event has no closing tag");
299
300 }
301
302 /**
303 * Creates a string that represents the given taxon. The string will try to replace e.g.
304 * abbreviated genus epithets by its full name etc.
305 * @param strGoto
306 * @param taxon
307 * @param location
308 * @return
309 */
310 private String makeTaxonKey(String strGoto, Taxon taxon, Location location) {
311 String result = "";
312 if (strGoto == null){
313 return "";
314 }
315
316 NonViralName<?> name = CdmBase.deproxy(taxon.getName(), NonViralName.class);
317 String strGenusName = name.getGenusOrUninomial();
318
319 final String bracketPattern = "\\([^\\(\\)]*\\)";
320 final String bracketPatternSomewhere = String.format(".*%s.*", bracketPattern);
321 if (strGoto.matches(bracketPatternSomewhere)){
322 fireWarningEvent("toTaxon has bracket: " + strGoto, makeLocationStr(location), 4);
323 strGoto = strGoto.replaceAll(bracketPattern, ""); //replace all brackets
324 }
325 strGoto = strGoto.replaceAll("\\s+", " "); //replace multiple whitespaces by exactly one whitespace
326
327 strGoto = strGoto.trim();
328 strGoto = strGoto.replaceAll("\\s+\\.", "\\."); // " ." may be created by bracket replacement
329 strGoto = strGoto.replaceAll("\\.\\.", "\\."); //replace
330
331 String[] split = strGoto.split("\\s");
332 //handle single epithets and markers
333 for (int i = 0; i<split.length; i++){
334 String single = split[i];
335 if (isGenusAbbrev(single, strGenusName)){
336 split[i] = strGenusName;
337 }
338 if (isInfraSpecificMarker(single)){
339 String strSpeciesEpi = name.getSpecificEpithet();
340 if (isBlank(result) && isNotBlank(strSpeciesEpi)){
341 result += strGenusName + " " + strSpeciesEpi;
342 }
343 }
344 result = (result + " " + split[i]).trim();
345 }
346 //remove trailing "." except for "sp."
347 while (result.matches(".*(?<!sp)\\.$")){
348 result = result.substring(0, result.length()-1).trim();
349 }
350 return result;
351 }
352
353
354 private boolean isInfraSpecificMarker(String single) {
355 try {
356 if (Rank.getRankByIdInVoc(single).isInfraSpecific()){
357 return true;
358 }else{
359 return false;
360 }
361 } catch (UnknownCdmTypeException e) {
362 return false;
363 }
364 }
365
366 //******************************** recognize nodes ***********/
367
368 public void makeKeyNodes(MarkupImportState state, XMLEvent event, String taxonTitle) {
369 Taxon taxon = state.getCurrentTaxon();
370 String num = state.getCurrentTaxonNum();
371
372 String nameString = CdmBase.deproxy(taxon.getName(), NonViralName.class).getNameCache();
373 // String nameString = taxonTitle;
374
375 //try to find matching lead nodes
376 UnmatchedLeadsKey leadsKey = UnmatchedLeadsKey.NewInstance(num, nameString);
377 Set<PolytomousKeyNode> matchingNodes = handleMatchingNodes(state, taxon, leadsKey);
378
379 if (num != null){//same without using the num
380 UnmatchedLeadsKey noNumLeadsKey = UnmatchedLeadsKey.NewInstance("", nameString);
381 Set<PolytomousKeyNode> noNumMatchingNodes = handleMatchingNodes(state, taxon, noNumLeadsKey);
382 if(noNumMatchingNodes.size() > 0){
383 String message ="Taxon matches additional key node when not considering <num> attribute in taxontitle. This may be correct but may also indicate an error.";
384 fireWarningEvent(message, event, 1);
385 }
386 }
387 //report missing match, if num exists
388 if (matchingNodes.isEmpty() /* TODO redo comment && num != null */){
389 String message = "Taxon has <num> attribute in taxontitle but no matching key nodes exist: %s, Key: %s";
390 message = String.format(message, num, leadsKey.toString());
391 fireWarningEvent(message, event, 1);
392 }
393
394 }
395
396 private Set<PolytomousKeyNode> handleMatchingNodes(MarkupImportState state, Taxon taxon, UnmatchedLeadsKey leadsKey) {
397 Set<PolytomousKeyNode> matchingNodes = state.getUnmatchedLeads().getNodes(leadsKey);
398 for (PolytomousKeyNode matchingNode : matchingNodes){
399 state.getUnmatchedLeads().removeNode(leadsKey, matchingNode);
400 matchingNode.setTaxon(taxon);
401 //just to be on the save side
402 matchingNode.refreshNodeNumbering();
403 state.getPolytomousKeyNodesToSave().add(matchingNode);
404 }
405 return matchingNodes;
406 }
407 }