ref #6636 handle Books correctly in RIS import
[cdmlib.git] / cdmlib-io / src / main / java / eu / etaxonomy / cdm / io / reference / ris / in / RisReferenceImport.java
1 /**
2 * Copyright (C) 2017 EDIT
3 * European Distributed Institute of Taxonomy
4 * http://www.e-taxonomy.eu
5 *
6 * The contents of this file are subject to the Mozilla Public License Version 1.1
7 * See LICENSE.TXT at the top of this package for the full license terms.
8 */
9 package eu.etaxonomy.cdm.io.reference.ris.in;
10
11 import java.io.InputStreamReader;
12 import java.net.URI;
13 import java.util.ArrayList;
14 import java.util.Arrays;
15 import java.util.HashSet;
16 import java.util.List;
17 import java.util.Map;
18 import java.util.Set;
19
20 import org.apache.log4j.Logger;
21 import org.springframework.stereotype.Component;
22
23 import eu.etaxonomy.cdm.common.CdmUtils;
24 import eu.etaxonomy.cdm.common.DOI;
25 import eu.etaxonomy.cdm.io.common.CdmImportBase;
26 import eu.etaxonomy.cdm.io.reference.ris.in.RisRecordReader.RisValue;
27 import eu.etaxonomy.cdm.model.agent.Person;
28 import eu.etaxonomy.cdm.model.agent.Team;
29 import eu.etaxonomy.cdm.model.agent.TeamOrPersonBase;
30 import eu.etaxonomy.cdm.model.common.Annotation;
31 import eu.etaxonomy.cdm.model.common.AnnotationType;
32 import eu.etaxonomy.cdm.model.common.Language;
33 import eu.etaxonomy.cdm.model.common.TimePeriod;
34 import eu.etaxonomy.cdm.model.reference.Reference;
35 import eu.etaxonomy.cdm.model.reference.ReferenceFactory;
36 import eu.etaxonomy.cdm.model.reference.ReferenceType;
37
38 /**
39 * @author a.mueller
40 * @date 11.05.2017
41 *
42 */
43 @Component
44 public class RisReferenceImport
45 extends CdmImportBase<RisReferenceImportConfigurator, RisReferenceImportState>{
46
47 private static final long serialVersionUID = 7022034669942979722L;
48 @SuppressWarnings("unused")
49 private static final Logger logger = Logger.getLogger(RisReferenceImport.class);
50
51 /**
52 * {@inheritDoc}
53 */
54 @Override
55 protected void doInvoke(RisReferenceImportState state) {
56 RisReferenceImportConfigurator config = state.getConfig();
57 try {
58 // new FileReader(file)
59 InputStreamReader inputReader = config.getSource();
60 RisRecordReader risReader = new RisRecordReader(state, inputReader);
61
62 Set<Reference> referencesToSave = new HashSet<>();
63
64
65 Map<RisReferenceTag, List<RisValue>> next = risReader.readRecord();
66 while (next != RisRecordReader.EOF){
67 Reference ref;
68 String location = "";
69 try {
70 location = recordLocation(state, next);
71 ref = makeReference(state, next);
72 referencesToSave.add(ref);
73 if (ref.getInReference() != null){
74 referencesToSave.add(ref.getInReference());
75 }
76 } catch (Exception e) {
77 String message = "Unexpected exception during RIS Reference Import";
78 state.getResult().addException(e, message, location);
79 }
80
81 next = risReader.readRecord();
82 }
83
84 getReferenceService().saveOrUpdate(referencesToSave);
85 state.getResult().addNewRecords(Reference.class.getSimpleName(), referencesToSave.size());
86
87 } catch (Exception e) {
88 String message = "Unexpected exception during RIS Reference Import";
89 state.getResult().addException(e, message);
90 }
91
92 //unhandled
93 Map<RisReferenceTag, Integer> unhandled = state.getUnhandled();
94 for (RisReferenceTag tag : unhandled.keySet()){
95 String message = "RIS tag %s (%s) not yet handled. n = %d";
96 message = String .format(message, tag.name(), tag.getDescription(), unhandled.get(tag));
97 state.getResult().addWarning(message);
98 }
99 }
100
101 /**
102 * @param state
103 * @param next
104 * @return
105 */
106 private Reference makeReference(RisReferenceImportState state,
107 Map<RisReferenceTag, List<RisValue>> record) {
108
109 //type
110 ReferenceType type = makeReferenceType(state, record);
111 Reference ref = ReferenceFactory.newReference(type);
112 Reference inRef = null;
113 if (hasInRef(ref)){
114 ReferenceType inRefType =
115 type == ReferenceType.Article ? ReferenceType.Journal:
116 type == ReferenceType.BookSection ? ReferenceType.Book :
117 ReferenceType.Generic;
118 inRef = ReferenceFactory.newReference(inRefType);
119 ref.setInReference(inRef);
120 }
121 Reference higherRef = inRef == null ? ref : inRef;
122
123
124 //Title
125 RisValue t1 = getSingleValue(state, record, RisReferenceTag.T1);
126 RisValue ti = getSingleValue(state, record, RisReferenceTag.TI);
127 RisValue value = assertEqual(state, "title", t1, ti);
128 if (value != null){
129 ref.setTitle(value.value);
130 }
131
132 //Journal title
133 RisValue t2 = getSingleValue(state, record, RisReferenceTag.T2); //Secondary Title (journal title, if applicable)
134
135 if (higherRef.getType() == ReferenceType.Journal){
136 RisValue jf = getSingleValue(state, record, RisReferenceTag.JF); //Journal/Periodical name: full format. This is an alphanumeric field of up to 255 characters.
137 RisValue jo = getSingleValue(state, record, RisReferenceTag.JO); //Journal/Periodical name: full format. This is an alphanumeric field of up to 255 characters.
138 RisValue x = assertEqual(state, "Journal/Periodical name: full format", jf, jo);
139 x = assertEqual(state, "Journal title", t2, x);
140 if (x != null){
141 higherRef.setTitle(x.value);
142 }
143 }else{
144 //TODO
145 }
146
147 //ST (remove as same as TI or T1), not handled otherwise
148 RisValue st = getSingleValue(state, record, RisReferenceTag.ST, false); //Short title
149 if (st != null && st.value.equals(ref.getTitle())){
150 record.remove(RisReferenceTag.ST);
151 }
152
153 //Author
154 List<RisValue> list = getListValue(record, RisReferenceTag.AU);
155 if (!list.isEmpty()){
156 TeamOrPersonBase<?> author = makeAuthor(state, list);
157 ref.setAuthorship(author);
158 }
159
160 //Date
161 // RisValue y1 = getSingleValue(state, record, RisReferenceTag.Y1); //Primary Date
162 RisValue py = getSingleValue(state, record, RisReferenceTag.PY);
163 RisValue da = getSingleValue(state, record, RisReferenceTag.DA);
164 Integer year = makeYear(state, py);
165 TimePeriod date = makeDate(state, da);
166 assertDateYear(state, year, date, py);
167 ref.setDatePublished(date);
168 //TODO y1 not yet handled
169
170 //Note
171 RisValue n1 = getSingleValue(state, record, RisReferenceTag.N1); //Note
172 if (n1 != null){
173 Annotation annotation = Annotation.NewInstance(n1.value, AnnotationType.EDITORIAL(), Language.DEFAULT());
174 ref.addAnnotation(annotation);
175 }
176
177 //DOI
178 RisValue doiVal = getSingleValue(state, record, RisReferenceTag.DO); //Doi
179 if (doiVal != null){
180 DOI doi;
181 try {
182 String doiStr = doiVal.value;
183 if (doiStr.toLowerCase().startsWith("doi ")){
184 doiStr = doiStr.substring(4).trim();
185 }
186 doi = DOI.fromString(doiStr);
187 ref.setDoi(doi);
188 } catch (IllegalArgumentException e) {
189 String message = "DOI could not be recognized: " + doiVal.value;
190 state.getResult().addWarning(message, doiVal.location);
191 }
192 }
193
194 //UR
195 RisValue ur = getSingleValue(state, record, RisReferenceTag.UR); //URL
196 if (ur != null){
197 URI uri;
198 try {
199 String urStr = ur.value;
200 uri = URI.create(urStr);
201 ref.setUri(uri);
202 } catch (Exception e) {
203 String message = "URL could not be recognized: " + ur.value;
204 state.getResult().addWarning(message, ur.location);
205 }
206 }
207
208 //Pages
209 RisValue sp = getSingleValue(state, record, RisReferenceTag.SP);
210 RisValue ep = getSingleValue(state, record, RisReferenceTag.EP);
211 String pages = CdmUtils.concat("-", sp != null ? sp.value : null, ep != null ? ep.value : null);
212 ref.setPages(pages);
213
214 //Volume
215 RisValue vl = getSingleValue(state, record, RisReferenceTag.VL);
216 RisValue is = getSingleValue(state, record, RisReferenceTag.IS);
217 String vol = vl == null? "": vl.value + (is != null ? "("+ is.value + ")": "");
218 ref.setVolume(vol);
219
220 //Publisher
221 RisValue pb = getSingleValue(state, record, RisReferenceTag.PB);
222 if (pb != null){
223 higherRef.setPublisher(pb.value);
224 }
225
226 //Abstract
227 RisValue ab = getSingleValue(state, record, RisReferenceTag.AB);
228 RisValue n2 = getSingleValue(state, record, RisReferenceTag.N2);
229 RisValue abst = assertEqual(state, "Abstract", ab, n2);
230 if (abst != null){
231 ref.setReferenceAbstract(abst.value);
232 }
233
234 //ISSN/ISBN
235 RisValue sn = getSingleValue(state, record, RisReferenceTag.SN);
236 if (sn != null){
237 if (higherRef.getType() == ReferenceType.Journal){
238 higherRef.setIssn(sn.value);
239 }else{
240 higherRef.setIsbn(sn.value);
241 }
242 }
243
244 //ID
245 RisValue id = getSingleValue(state, record, RisReferenceTag.ID);
246 String idStr = id != null? id.value: null;
247 String recLoc = recordLocation(state, record);
248 ref.addImportSource(idStr, null, state.getConfig().getSourceReference(), recLoc);
249 if (inRef != null){
250 ref.addImportSource(idStr, null, state.getConfig().getSourceReference(), recLoc);
251
252 }
253
254 //remove
255 record.remove(RisReferenceTag.ER);
256 record.remove(RisReferenceTag.TY);
257
258 for (RisReferenceTag tag : record.keySet()){
259 // String message = "RIS Tag " + tag.name() + " not yet handled";
260 // state.getResult().addWarning(message, record.get(tag).get(0).location);
261 state.addUnhandled(tag);
262
263 //TODO add as annotation or extension
264 }
265
266 return ref;
267 }
268
269 /**
270 * @param ref
271 * @return
272 */
273 private boolean hasInRef(Reference ref) {
274 return ref.getType() == ReferenceType.BookSection || ref.getType() == ReferenceType.Article ;
275 }
276
277
278 /**
279 * @param state
280 * @param record
281 * @return
282 */
283 private String recordLocation(RisReferenceImportState state,
284 Map<RisReferenceTag, List<RisValue>> record) {
285 RisValue typeTag = this.getSingleValue(state, record, RisReferenceTag.TY, false);
286 RisValue erTag = this.getSingleValue(state, record, RisReferenceTag.ER, false);
287
288 String start = typeTag == null ? "??" : typeTag.location;
289 String end = erTag == null ? "??" : erTag.location;
290
291 String result = "line " + CdmUtils.concat("-", start, end);
292
293 return result;
294 }
295
296 /**
297 * @param state
298 * @param year
299 * @param date
300 */
301 private void assertDateYear(RisReferenceImportState state, Integer year, TimePeriod date, RisValue py) {
302 if (year != null && date != null && !year.equals(date.getStartYear())){
303 String message = "Year 'PY' and date 'DA' are not consistent. PY is neglected.";
304 state.getResult().addWarning(message, py.location);
305 }
306 }
307
308 private RisValue assertEqual(RisReferenceImportState state, String meaning, RisValue val1, RisValue val2) {
309 if (val1 != null && val2 != null && !val1.value.equals(val2.value)){
310 String message = "The tags '%s' and '%s' are not equal but have a similar meaning ('%s'). "
311 + "%s was used and %s neglected";
312 message = String.format(message, val1.tag.name(), val2.tag.name(), meaning , val1.tag.name(), val2.tag.name());
313 state.getResult().addWarning(message, val1.location);
314 }
315 return val1 != null ? val1 : val2;
316 }
317
318 /**
319 * @param state
320 * @param da
321 * @return
322 */
323 private TimePeriod makeDate(RisReferenceImportState state, RisValue da) {
324 if (da == null){
325 return null;
326 }
327 if (! da.value.matches("([0-9]{4})?(\\/([0-9]{2})?(\\/([0-9]{2})?(\\/.*)?)?)?")){
328 String message = "Tag '%s' has incorrect format. Only exactly 'dddd/dd/dd/any text' is allowed (where d is a digit), but was '%s'";
329 message = String.format(message, da.tag.name(), da.value);
330 state.getResult().addWarning(message, da.location);
331 return null;
332 }
333 String[] split = da.value.split("/");
334 TimePeriod tp = TimePeriod.NewInstance();
335 if (split.length > 0 && isNotBlank(split[0])){
336 tp.setStartYear(Integer.valueOf(split[0]));
337 }
338 if (split.length > 1 && isNotBlank(split[1])){
339 tp.setStartMonth(Integer.valueOf(split[1]));
340 }
341 if (split.length > 2 && isNotBlank(split[2])){
342 tp.setStartDay(Integer.valueOf(split[2]));
343 }
344 if (split.length > 3 && isNotBlank(split[3])){
345 List<String> other = Arrays.asList(split).subList(3, split.length);
346 String otherStr = CdmUtils.concat("/", other.toArray(new String[other.size()]));
347 tp.setFreeText(tp.toString() + " " + otherStr);
348 }
349 return tp;
350 }
351
352 /**
353 * @param state
354 * @param py
355 * @return
356 */
357 private Integer makeYear(RisReferenceImportState state, RisValue py) {
358 if (py == null){
359 return null;
360 }
361 if (py.value.matches("[0-9]{4}")){
362 return Integer.valueOf(py.value);
363 }else{
364 String message = "Tag '%s' has incorrect format. Only exactly 4 digits are allowed, but was '%s'";
365 message = String.format(message, py.tag.name(), py.value);
366 state.getResult().addWarning(message, py.location);
367 return null;
368 }
369 }
370
371 /**
372 * @param state
373 * @param list
374 * @return
375 */
376 private TeamOrPersonBase<?> makeAuthor(RisReferenceImportState state, List<RisValue> list) {
377 if (list.size() == 1){
378 return makePerson(state, list.get(0));
379 }else{
380 Team team = Team.NewInstance();
381 for (RisValue value : list){
382 team.addTeamMember(makePerson(state, value));
383 }
384 return team;
385 }
386 }
387
388 /**
389 * @param state
390 * @param risValue
391 * @return
392 */
393 private Person makePerson(RisReferenceImportState state, RisValue risValue) {
394 Person person = Person.NewInstance();
395 String[] split = risValue.value.split(",");
396 if (split.length >= 1){
397 person.setLastname(split[0].trim());
398 }
399 if (split.length >= 2){
400 person.setFirstname(split[1].trim());
401 }
402 if (split.length >= 3){
403 person.setSuffix(split[2].trim());
404 }
405
406 return person;
407 }
408
409 /**
410 * Returns the single value for the given tag
411 * and removes the tag from the record.
412 * If more than 1 value exists this is logged
413 * as a warning.
414 */
415 private RisValue getSingleValue(RisReferenceImportState state,
416 Map<RisReferenceTag, List<RisValue>> record,
417 RisReferenceTag tag) {
418 return getSingleValue(state, record, tag, true);
419 }
420
421 /**
422 * Returns the single value for the given tag
423 * and removes the tag from the record.
424 * If more than 1 value exists this is logged
425 * as a warning.
426 */
427 private RisValue getSingleValue(RisReferenceImportState state,
428 Map<RisReferenceTag, List<RisValue>> record,
429 RisReferenceTag tag, boolean remove) {
430 List<RisValue> list = record.get(tag);
431 if (list == null){
432 return null;
433 }
434 assertSingle(state, list, tag);
435 if (remove){
436 record.remove(tag);
437 }
438 return list.get(0);
439 }
440
441 private List<RisValue> getListValue(Map<RisReferenceTag, List<RisValue>> record,
442 RisReferenceTag tag) {
443 List<RisValue> list = record.get(tag);
444 record.remove(tag);
445 if (list == null){
446 list = new ArrayList<>();
447 }
448 return list;
449 }
450
451 /**
452 * @param state
453 * @param list
454 * @param tag
455 */
456 private void assertSingle(RisReferenceImportState state, List<RisValue> list, RisReferenceTag tag) {
457 if (list.size() > 1){
458 String message = "There is more than 1 tag '%s' but only 1 tag is supported by RIS format or"
459 + " by the current import implementation.";
460 message = String.format(message, tag.name());
461 state.getResult().addWarning(message, list.get(0).location + "ff");
462 }else if (list.isEmpty()){
463 state.getResult().addError("A tag list was empty. This should not happen and is a programming code error");
464 }
465 }
466
467 /**
468 * @param state
469 * @param next
470 * @return
471 */
472 private ReferenceType makeReferenceType(RisReferenceImportState state,
473 Map<RisReferenceTag, List<RisValue>> record) {
474 RisReferenceTag tyTag = RisReferenceTag.TY;
475 RisValue value = this.getSingleValue(state, record, tyTag, false);
476 String typeStr = value.value;
477 RisRecordType type = RisRecordType.valueOf(typeStr);
478 ReferenceType cdmType = type.getCdmReferenceType();
479 return cdmType;
480 }
481
482 /**
483 * {@inheritDoc}
484 */
485 @Override
486 protected boolean doCheck(RisReferenceImportState state) {
487 return true;
488 }
489
490 /**
491 * {@inheritDoc}
492 */
493 @Override
494 protected boolean isIgnore(RisReferenceImportState state) {
495 return false;
496 }
497 }