ref #9918 distinguish default and parsedEntity deduplication in ImportDeduplicationHe...
[cdmlib-apps.git] / app-import / src / main / java / eu / etaxonomy / cdm / io / greece / FloraHellenicaSynonymImport.java
1 /**
2 * Copyright (C) 2016 EDIT
3 * European Distributed Institute of Taxonomy
4 * http://www.e-taxonomy.eu
5 *
6 * The contents of this file are subject to the Mozilla Public License Version 1.1
7 * See LICENSE.TXT at the top of this package for the full license terms.
8 */
9 package eu.etaxonomy.cdm.io.greece;
10
11 import java.util.Arrays;
12 import java.util.List;
13 import java.util.Map;
14 import java.util.Set;
15 import java.util.regex.Matcher;
16 import java.util.regex.Pattern;
17
18 import org.apache.log4j.Logger;
19 import org.springframework.stereotype.Component;
20
21 import eu.etaxonomy.cdm.io.mexico.SimpleExcelTaxonImportState;
22 import eu.etaxonomy.cdm.model.name.INonViralName;
23 import eu.etaxonomy.cdm.model.name.NameRelationshipType;
24 import eu.etaxonomy.cdm.model.name.NomenclaturalCode;
25 import eu.etaxonomy.cdm.model.name.NomenclaturalStatusType;
26 import eu.etaxonomy.cdm.model.name.TaxonName;
27 import eu.etaxonomy.cdm.model.reference.Reference;
28 import eu.etaxonomy.cdm.model.reference.ReferenceFactory;
29 import eu.etaxonomy.cdm.model.taxon.SynonymType;
30 import eu.etaxonomy.cdm.model.taxon.Taxon;
31 import eu.etaxonomy.cdm.model.taxon.TaxonBase;
32 import eu.etaxonomy.cdm.strategy.exceptions.UnknownCdmTypeException;
33 import eu.etaxonomy.cdm.strategy.parser.NonViralNameParserImpl;
34
35 /**
36 * @author a.mueller
37 * @since 14.12.2016
38 */
39
40 @Component
41 public class FloraHellenicaSynonymImport<CONFIG extends FloraHellenicaImportConfigurator>
42 extends FloraHellenicaImportBase<CONFIG>{
43
44 private static final long serialVersionUID = -3565782012921316901L;
45 private static final Logger logger = Logger.getLogger(FloraHellenicaSynonymImport.class);
46
47 private static final String ACCEPTED_NAME = "Accepted name";
48 private static final String SYNONYM = "synonym";
49 private static final String UNIQUE_ID_OF_ACCEPTED_NAME = "Unique ID of accepted name";
50
51 private static List<String> expectedKeys= Arrays.asList(new String[]{
52 SYNONYM, UNIQUE_ID_OF_ACCEPTED_NAME, ACCEPTED_NAME
53 });
54
55 private NonViralNameParserImpl parser = NonViralNameParserImpl.NewInstance();
56
57 @Override
58 protected String getWorksheetName(CONFIG config) {
59 return "synonyms";
60 }
61
62 boolean isFirst = true;
63 /**
64 * {@inheritDoc}
65 */
66 @Override
67 protected void firstPass(SimpleExcelTaxonImportState<CONFIG> state) {
68
69 String line = state.getCurrentLine() + ": ";
70 Map<String, String> record = state.getOriginalRecord();
71
72 Set<String> keys = record.keySet();
73 for (String key: keys) {
74 if (! expectedKeys.contains(key)){
75 logger.warn(line + "Unexpected Key: " + key);
76 }
77 }
78 if (isFirst){
79 System.out.println("Start synonyms");
80 isFirst = false;
81 }
82
83 String row = "row" + state.getCurrentLine();
84 TaxonBase<?> relatedTaxon = makeSynonym(state, line, record, row);
85 if (relatedTaxon != null){
86 getTaxonService().saveOrUpdate(relatedTaxon);
87 }
88 }
89
90
91 /**
92 * @param state
93 * @param line
94 * @param record
95 * @param noStr
96 * @return
97 */
98 private TaxonBase<?> makeSynonym(SimpleExcelTaxonImportState<CONFIG> state, String line,
99 Map<String, String> record,
100 String lineId) {
101
102 Taxon acceptedTaxon = getAcceptedTaxon(record, state, UNIQUE_ID_OF_ACCEPTED_NAME);
103 if (acceptedTaxon == null){
104 logger.warn(line + "Accepted not found: " + record.get(UNIQUE_ID_OF_ACCEPTED_NAME));
105 return null;
106 // acceptedTaxon = Taxon.NewInstance(null, null);
107 }
108
109 String synonymStr = getValue(record, SYNONYM);
110
111 String[] parsedSynStr = parseAuct(synonymStr, line);
112
113 boolean isMisapplied = parsedSynStr[1] != null;
114 boolean hasNonAuthor = parsedSynStr[2] != null;
115 boolean hasStatus = parsedSynStr[3] != null;
116 boolean isNec = hasNonAuthor && parsedSynStr[2].contains(" nec ");
117
118
119 String misappliedNecAuthor = null;
120 if (isMisapplied && hasNonAuthor && !isNec){
121 parsedSynStr[0] = parsedSynStr[0] + " " + parsedSynStr[2];
122 }else if (isMisapplied && hasNonAuthor && isNec){
123 misappliedNecAuthor = parsedSynStr[2];
124 }
125
126 INonViralName nvn = parser.parseFullName(parsedSynStr[0], NomenclaturalCode.ICNAFP, null);
127 if (nvn.isProtectedTitleCache()){
128 logger.warn(line + "Name could not be parsed: " + parsedSynStr[0] + " (full:" + synonymStr + ")");
129 }
130 if (misappliedNecAuthor != null){
131 nvn.setAuthorshipCache(misappliedNecAuthor);
132 }
133 TaxonName name = TaxonName.castAndDeproxy(nvn);
134
135 if (hasStatus){
136 try {
137 NomenclaturalStatusType status = NomenclaturalStatusType.getNomenclaturalStatusTypeByAbbreviation(parsedSynStr[3], name);
138 name.addStatus(status, null, null);
139 } catch (UnknownCdmTypeException e) {
140 logger.warn(line + "Nom. status not recognized: " + parsedSynStr[3]);
141 }
142 }
143 name = replaceNameAuthorsAndReferences(state, name, true);
144
145
146 TaxonBase<?> result;
147 if (isMisapplied){
148 Reference sec = null;// getMisappliedRef(state, parsedSynStr[1]);
149 result = Taxon.NewInstance(name, sec);
150 result.setAppendedPhrase(getMisappliedRef(state, parsedSynStr[1]));
151 acceptedTaxon.addMisappliedName((Taxon)result, getSecReference(state), null);
152 if (isNec){
153 logger.warn(line + "nec for misapplied names still needs to be checked: " + synonymStr);
154 }
155 }else{
156 SynonymType synType = null;
157 result = acceptedTaxon.addSynonymName(name, getSecReference(state), null, synType);
158 if (hasNonAuthor){
159 handleSynonymNon(state, name, parsedSynStr[2], line);
160 }
161 }
162 result.addImportSource(lineId, getWorksheetName(state.getConfig()), getSourceCitation(state), null);
163
164 return result;
165
166 }
167
168
169
170 /**
171 * @param state
172 * @param name
173 * @param parsedSynStr
174 */
175 private void handleSynonymNon(SimpleExcelTaxonImportState<CONFIG> state,
176 TaxonName name, String nonPart, String line) {
177 String[] splits = nonPart.split(" nec ");
178
179 TaxonName lastHomonym = null;
180 for (String split : splits){
181 split = split.trim();
182 // Saponaria illyrica Ard.
183 // Crepis nemausensis Gouan
184 // S. columnae Aurnier
185 // S. columnae Aurnier nec (Rchb. f.) H. Fleischm.
186 // T. glaucescens Rchb.
187 TaxonName nonName;
188 if (split.matches("(Saponaria illyrica Ard.|Crepis nemausensis Gouan|S. columnae Aurnier|T. glaucescens Rchb.|Linaria stricta Guss.)"
189 + "")){
190 if (split.startsWith("S.")){
191 split = split.replace("S.", "Serapias");
192 }else if (split.startsWith("T.")){
193 split = split.replace("T.", "Taraxacum");
194 }
195 nonName = TaxonName.castAndDeproxy(this.parser.parseFullName(split));
196 nonName = replaceNameAuthorsAndReferences(state, nonName, true);
197 name.addRelationshipFromName(nonName, NameRelationshipType.BLOCKING_NAME_FOR(), null, null);
198 }else{
199 String nameStr = name.getNameCache().replace(" hort.", "") + " " + split;
200 nonName = TaxonName.castAndDeproxy(this.parser.parseFullName(nameStr));
201 nonName = replaceNameAuthorsAndReferences(state, nonName, true);
202 name.addRelationshipToName(nonName, NameRelationshipType.LATER_HOMONYM(), null, null);
203 if (lastHomonym != null){
204 nonName.addRelationshipToName(lastHomonym, NameRelationshipType.LATER_HOMONYM(), null, null);
205 }
206 lastHomonym = nonName;
207 }
208 getNameService().saveOrUpdate(nonName);
209 if (nonName.isProtectedTitleCache()){
210 logger.warn(line + "Non-Name could not be parsed: " + nonName.getTitleCache());
211 }
212 }
213 //seems to work correctly
214 // if (splits.length>1){
215 // logger.warn(line + "nec synonyms maybe not yet correctly implemented: " + name.getTitleCache() + "; " + nonPart);
216 // }
217 }
218
219 private Reference flGraecReference;
220 private Reference balkanReference;
221 {
222 flGraecReference = ReferenceFactory.newBook();
223 flGraecReference.setTitle("fl. graec.");
224 balkanReference = ReferenceFactory.newBook();
225 balkanReference.setTitle("balc.");
226 }
227 /**
228 * @param state
229 * @param string
230 * @return
231 */
232 private String getMisappliedRef(SimpleExcelTaxonImportState<CONFIG> state, String refString) {
233 // if ("fl. graec.".equals(refString)){
234 // return flGraecReference;
235 // }else if ("balc.".equals(refString)){
236 // return balkanReference;
237 if ("fl. graec.".equals(refString)){
238 return "auct. fl. graec.";
239 }else if ("balc.".equals(refString)){
240 return "auct. balc.";
241 }else{
242 logger.warn("Auct. reference not recognized: " + refString);
243 return null;
244 }
245 }
246
247 private String regExMisapplied = "(.+) auct\\. (fl\\. graec\\.|balc\\.), non (.+)";
248 private Pattern patternMisapplied = Pattern.compile(regExMisapplied);
249
250 private String regExNon = "(.+), non (.+)";
251 private Pattern patternNon = Pattern.compile(regExNon);
252
253 private String regExStatus = "(.+),\\s+((?:nom.|comb.|orth.)\\s+(.+))";
254 private Pattern patternStat = Pattern.compile(regExStatus);
255
256 /**
257 * @param synonymStr
258 */
259 private String[] parseAuct(String synonymStr, String line) {
260 String[] result = new String[4];
261 if (synonymStr != null){
262 result[0] = synonymStr;
263 Matcher matcher = patternMisapplied.matcher(synonymStr);
264 if (matcher.matches()){
265 result[0] = matcher.group(1);
266 result[1] = matcher.group(2);
267 if (! result[1].equals("fl. graec.") && ! result[1].equals("balc.")){
268 logger.warn(line + "Misapplied sensu not recognized: " + result[1]);
269 }
270 result[2] = matcher.group(3);
271 }else{
272 matcher = patternNon.matcher(synonymStr);
273 if (matcher.matches()){
274 result[0] = matcher.group(1);
275 result[2] = matcher.group(2);
276 }else{
277 matcher = patternStat.matcher(synonymStr);
278 if (matcher.matches()){
279 result[0] = matcher.group(1);
280 result[3] = matcher.group(2);
281 }
282 }
283 }
284 }
285 return result;
286 }
287
288 }