1
|
/**
|
2
|
* Copyright (C) 2017 EDIT
|
3
|
* European Distributed Institute of Taxonomy
|
4
|
* http://www.e-taxonomy.eu
|
5
|
*
|
6
|
* The contents of this file are subject to the Mozilla Public License Version 1.1
|
7
|
* See LICENSE.TXT at the top of this package for the full license terms.
|
8
|
*/
|
9
|
package eu.etaxonomy.cdm.io.reference.ris.in;
|
10
|
|
11
|
import java.io.ByteArrayInputStream;
|
12
|
import java.io.InputStreamReader;
|
13
|
import java.util.ArrayList;
|
14
|
import java.util.Arrays;
|
15
|
import java.util.HashSet;
|
16
|
import java.util.List;
|
17
|
import java.util.Map;
|
18
|
import java.util.Set;
|
19
|
|
20
|
import org.apache.log4j.Logger;
|
21
|
import org.springframework.stereotype.Component;
|
22
|
|
23
|
import eu.etaxonomy.cdm.common.CdmUtils;
|
24
|
import eu.etaxonomy.cdm.common.DOI;
|
25
|
import eu.etaxonomy.cdm.common.URI;
|
26
|
import eu.etaxonomy.cdm.io.common.CdmImportBase;
|
27
|
import eu.etaxonomy.cdm.io.reference.ris.in.RisRecordReader.RisValue;
|
28
|
import eu.etaxonomy.cdm.model.agent.Person;
|
29
|
import eu.etaxonomy.cdm.model.agent.Team;
|
30
|
import eu.etaxonomy.cdm.model.agent.TeamOrPersonBase;
|
31
|
import eu.etaxonomy.cdm.model.common.Annotation;
|
32
|
import eu.etaxonomy.cdm.model.common.AnnotationType;
|
33
|
import eu.etaxonomy.cdm.model.common.Language;
|
34
|
import eu.etaxonomy.cdm.model.common.VerbatimTimePeriod;
|
35
|
import eu.etaxonomy.cdm.model.reference.Reference;
|
36
|
import eu.etaxonomy.cdm.model.reference.ReferenceFactory;
|
37
|
import eu.etaxonomy.cdm.model.reference.ReferenceType;
|
38
|
import eu.etaxonomy.cdm.strategy.parser.TimePeriodParser;
|
39
|
|
40
|
/**
|
41
|
* @author a.mueller
|
42
|
* @since 11.05.2017
|
43
|
*/
|
44
|
@Component
|
45
|
public class RisReferenceImport
|
46
|
extends CdmImportBase<RisReferenceImportConfigurator, RisReferenceImportState>{
|
47
|
|
48
|
private static final long serialVersionUID = 7022034669942979722L;
|
49
|
@SuppressWarnings("unused")
|
50
|
private static final Logger logger = Logger.getLogger(RisReferenceImport.class);
|
51
|
|
52
|
@Override
|
53
|
protected void doInvoke(RisReferenceImportState state) {
|
54
|
RisReferenceImportConfigurator config = state.getConfig();
|
55
|
try {
|
56
|
// new FileReader(file)
|
57
|
byte[] data = config.getStream();
|
58
|
|
59
|
ByteArrayInputStream stream = new ByteArrayInputStream(data);
|
60
|
InputStreamReader reader = new InputStreamReader(stream, "UTF-8");
|
61
|
RisRecordReader risReader = new RisRecordReader(state, reader);
|
62
|
|
63
|
Set<Reference> referencesToSave = new HashSet<>();
|
64
|
|
65
|
Map<RisReferenceTag, List<RisValue>> next = risReader.readRecord();
|
66
|
while (next != RisRecordReader.EOF){
|
67
|
Reference ref;
|
68
|
String location = "";
|
69
|
try {
|
70
|
location = recordLocation(state, next);
|
71
|
ref = makeReference(state, next);
|
72
|
referencesToSave.add(ref);
|
73
|
if (ref.getInReference() != null){
|
74
|
referencesToSave.add(ref.getInReference());
|
75
|
}
|
76
|
} catch (Exception e) {
|
77
|
String message = "Unexpected exception during RIS Reference Import";
|
78
|
state.getResult().addException(e, message, location);
|
79
|
}
|
80
|
|
81
|
next = risReader.readRecord();
|
82
|
}
|
83
|
|
84
|
getReferenceService().saveOrUpdate(referencesToSave);
|
85
|
state.getResult().addNewRecords(Reference.class.getSimpleName(), referencesToSave.size());
|
86
|
|
87
|
} catch (Exception e) {
|
88
|
String message = "Unexpected exception during RIS Reference Import";
|
89
|
state.getResult().addException(e, message);
|
90
|
}
|
91
|
|
92
|
//unhandled
|
93
|
Map<RisReferenceTag, Integer> unhandled = state.getUnhandled();
|
94
|
for (RisReferenceTag tag : unhandled.keySet()){
|
95
|
String message = "RIS tag %s (%s) not yet handled. n = %d";
|
96
|
message = String .format(message, tag.name(), tag.getDescription(), unhandled.get(tag));
|
97
|
state.getResult().addWarning(message);
|
98
|
}
|
99
|
}
|
100
|
|
101
|
private Reference makeReference(RisReferenceImportState state,
|
102
|
Map<RisReferenceTag, List<RisValue>> record) {
|
103
|
|
104
|
//type
|
105
|
ReferenceType type = makeReferenceType(state, record);
|
106
|
Reference ref = ReferenceFactory.newReference(type);
|
107
|
Reference inRef = null;
|
108
|
if (hasInRef(ref)){
|
109
|
ReferenceType inRefType =
|
110
|
type == ReferenceType.Article ? ReferenceType.Journal:
|
111
|
type == ReferenceType.BookSection ? ReferenceType.Book :
|
112
|
ReferenceType.Generic;
|
113
|
inRef = ReferenceFactory.newReference(inRefType);
|
114
|
ref.setInReference(inRef);
|
115
|
}
|
116
|
Reference higherRef = inRef == null ? ref : inRef;
|
117
|
|
118
|
//Title
|
119
|
RisValue t1 = getSingleValue(state, record, RisReferenceTag.T1);
|
120
|
RisValue ti = getSingleValue(state, record, RisReferenceTag.TI);
|
121
|
RisValue value = assertEqual(state, "title", t1, ti);
|
122
|
if (value != null){
|
123
|
ref.setTitle(value.value);
|
124
|
}
|
125
|
|
126
|
//Journal title
|
127
|
RisValue t2 = getSingleValue(state, record, RisReferenceTag.T2); //Secondary Title (journal title, if applicable)
|
128
|
|
129
|
if (higherRef.getType() == ReferenceType.Journal){
|
130
|
RisValue jf = getSingleValue(state, record, RisReferenceTag.JF); //Journal/Periodical name: full format. This is an alphanumeric field of up to 255 characters.
|
131
|
RisValue jo = getSingleValue(state, record, RisReferenceTag.JO); //Journal/Periodical name: full format. This is an alphanumeric field of up to 255 characters.
|
132
|
RisValue x = assertEqual(state, "Journal/Periodical name: full format", jf, jo);
|
133
|
x = assertEqual(state, "Journal title", t2, x);
|
134
|
if (x != null){
|
135
|
higherRef.setTitle(x.value);
|
136
|
}
|
137
|
}else{
|
138
|
//TODO
|
139
|
}
|
140
|
|
141
|
//ST (remove as same as TI or T1), not handled otherwise
|
142
|
RisValue st = getSingleValue(state, record, RisReferenceTag.ST, false); //Short title
|
143
|
if (st != null && st.value.equals(ref.getTitle())){
|
144
|
record.remove(RisReferenceTag.ST);
|
145
|
}
|
146
|
|
147
|
//Author
|
148
|
List<RisValue> list = getListValue(record, RisReferenceTag.AU);
|
149
|
if (!list.isEmpty()){
|
150
|
TeamOrPersonBase<?> author = makeAuthor(state, list);
|
151
|
ref.setAuthorship(author);
|
152
|
}
|
153
|
|
154
|
//Date
|
155
|
// RisValue y1 = getSingleValue(state, record, RisReferenceTag.Y1); //Primary Date
|
156
|
RisValue py = getSingleValue(state, record, RisReferenceTag.PY);
|
157
|
RisValue da = getSingleValue(state, record, RisReferenceTag.DA);
|
158
|
Integer year = makeYear(state, py);
|
159
|
VerbatimTimePeriod date = makeDate(state, da);
|
160
|
date = assertDateYear(state, year, date, py);
|
161
|
ref.setDatePublished(date);
|
162
|
//TODO y1 not yet handled
|
163
|
|
164
|
//Note
|
165
|
RisValue n1 = getSingleValue(state, record, RisReferenceTag.N1); //Note
|
166
|
if (n1 != null){
|
167
|
Annotation annotation = Annotation.NewInstance(n1.value, AnnotationType.EDITORIAL(), Language.DEFAULT());
|
168
|
ref.addAnnotation(annotation);
|
169
|
}
|
170
|
|
171
|
//DOI
|
172
|
RisValue doiVal = getSingleValue(state, record, RisReferenceTag.DO); //Doi
|
173
|
if (doiVal != null){
|
174
|
DOI doi;
|
175
|
try {
|
176
|
String doiStr = doiVal.value;
|
177
|
if (doiStr.toLowerCase().startsWith("doi ")){
|
178
|
doiStr = doiStr.substring(4).trim();
|
179
|
}
|
180
|
doi = DOI.fromString(doiStr);
|
181
|
ref.setDoi(doi);
|
182
|
} catch (IllegalArgumentException e) {
|
183
|
String message = "DOI could not be recognized: " + doiVal.value;
|
184
|
state.getResult().addWarning(message, null, doiVal.location);
|
185
|
}
|
186
|
}
|
187
|
|
188
|
//UR
|
189
|
RisValue ur = getSingleValue(state, record, RisReferenceTag.UR); //URL
|
190
|
if (ur != null){
|
191
|
URI uri;
|
192
|
try {
|
193
|
String urStr = ur.value;
|
194
|
uri = URI.create(urStr);
|
195
|
ref.setUri(uri);
|
196
|
} catch (Exception e) {
|
197
|
String message = "URL could not be recognized: " + ur.value;
|
198
|
state.getResult().addWarning(message, null, ur.location);
|
199
|
}
|
200
|
}
|
201
|
|
202
|
//Pages
|
203
|
RisValue sp = getSingleValue(state, record, RisReferenceTag.SP);
|
204
|
RisValue ep = getSingleValue(state, record, RisReferenceTag.EP);
|
205
|
String pages = CdmUtils.concat("-", sp != null ? sp.value : null, ep != null ? ep.value : null);
|
206
|
ref.setPages(pages);
|
207
|
|
208
|
//Volume
|
209
|
RisValue vl = getSingleValue(state, record, RisReferenceTag.VL);
|
210
|
RisValue is = getSingleValue(state, record, RisReferenceTag.IS);
|
211
|
String vol = vl == null? "": vl.value + (is != null ? "("+ is.value + ")": "");
|
212
|
ref.setVolume(vol);
|
213
|
|
214
|
//Publisher
|
215
|
RisValue pb = getSingleValue(state, record, RisReferenceTag.PB);
|
216
|
if (pb != null){
|
217
|
higherRef.setPublisher(pb.value);
|
218
|
}
|
219
|
|
220
|
//CY - Place published
|
221
|
RisValue cy = getSingleValue(state, record, RisReferenceTag.CY);
|
222
|
if (cy != null){
|
223
|
higherRef.setPlacePublished(cy.value);
|
224
|
}
|
225
|
|
226
|
//Abstract
|
227
|
RisValue ab = getSingleValue(state, record, RisReferenceTag.AB);
|
228
|
RisValue n2 = getSingleValue(state, record, RisReferenceTag.N2);
|
229
|
RisValue abst = assertEqual(state, "Abstract", ab, n2);
|
230
|
if (abst != null){
|
231
|
ref.setReferenceAbstract(abst.value);
|
232
|
}
|
233
|
|
234
|
//ISSN/ISBN
|
235
|
RisValue sn = getSingleValue(state, record, RisReferenceTag.SN);
|
236
|
if (sn != null){
|
237
|
if (higherRef.getType() == ReferenceType.Journal){
|
238
|
higherRef.setIssn(sn.value);
|
239
|
}else{
|
240
|
higherRef.setIsbn(sn.value);
|
241
|
}
|
242
|
}
|
243
|
|
244
|
//ID
|
245
|
RisValue id = getSingleValue(state, record, RisReferenceTag.ID);
|
246
|
String idStr = id != null? id.value: null;
|
247
|
String recLoc = recordLocation(state, record);
|
248
|
ref.addImportSource(idStr, null, state.getConfig().getSourceReference(), recLoc);
|
249
|
if (inRef != null){
|
250
|
inRef.addImportSource(idStr, null, state.getConfig().getSourceReference(), recLoc);
|
251
|
}
|
252
|
|
253
|
//remove
|
254
|
record.remove(RisReferenceTag.ER);
|
255
|
record.remove(RisReferenceTag.TY);
|
256
|
|
257
|
for (RisReferenceTag tag : record.keySet()){
|
258
|
// String message = "RIS Tag " + tag.name() + " not yet handled";
|
259
|
// state.getResult().addWarning(message, record.get(tag).get(0).location);
|
260
|
state.addUnhandled(tag);
|
261
|
|
262
|
//TODO add as annotation or extension
|
263
|
}
|
264
|
|
265
|
return ref;
|
266
|
}
|
267
|
|
268
|
private boolean hasInRef(Reference ref) {
|
269
|
return ref.getType() == ReferenceType.BookSection || ref.getType() == ReferenceType.Article ;
|
270
|
}
|
271
|
|
272
|
private String recordLocation(RisReferenceImportState state,
|
273
|
Map<RisReferenceTag, List<RisValue>> record) {
|
274
|
RisValue typeTag = this.getSingleValue(state, record, RisReferenceTag.TY, false);
|
275
|
RisValue erTag = this.getSingleValue(state, record, RisReferenceTag.ER, false);
|
276
|
|
277
|
String start = typeTag == null ? "??" : typeTag.location;
|
278
|
String end = erTag == null ? "??" : erTag.location;
|
279
|
|
280
|
String result = "line " + CdmUtils.concat(" - ", start, end);
|
281
|
|
282
|
return result;
|
283
|
}
|
284
|
|
285
|
private VerbatimTimePeriod assertDateYear(RisReferenceImportState state, Integer year, VerbatimTimePeriod date, RisValue py) {
|
286
|
if (year == null && date == null){
|
287
|
return null;
|
288
|
}else if (year == null){
|
289
|
return date;
|
290
|
}else if (date == null){
|
291
|
return TimePeriodParser.parseStringVerbatim(String.valueOf(year));
|
292
|
}else{
|
293
|
if (!year.equals(date.getStartYear())){
|
294
|
if (date.getStartYear() == null){
|
295
|
date.setStartYear(year);
|
296
|
}else if (isNotBlank(date.getFreeText())){
|
297
|
date.setStartYear(year); //does this happen at all?
|
298
|
String message = "Year 'PY' and date 'DA' are not consistent. PY is neglected.";
|
299
|
state.getResult().addWarning(message, null, py.location);
|
300
|
return date;
|
301
|
}else{
|
302
|
String message = "Year 'PY' and date 'DA' are not consistent. DA is used for freetext and PY is used for (start) year.";
|
303
|
state.getResult().addWarning(message, null, py.location);
|
304
|
return date;
|
305
|
}
|
306
|
}
|
307
|
return date;
|
308
|
}
|
309
|
}
|
310
|
|
311
|
private RisValue assertEqual(RisReferenceImportState state, String meaning, RisValue val1, RisValue val2) {
|
312
|
if (val1 != null && val2 != null && !val1.value.equals(val2.value)){
|
313
|
String message = "The tags '%s' and '%s' are not equal but have a similar meaning ('%s'). "
|
314
|
+ "%s was used and %s neglected";
|
315
|
message = String.format(message, val1.tag.name(), val2.tag.name(), meaning , val1.tag.name(), val2.tag.name());
|
316
|
state.getResult().addWarning(message, null, val1.location);
|
317
|
}
|
318
|
return val1 != null ? val1 : val2;
|
319
|
}
|
320
|
|
321
|
private VerbatimTimePeriod makeDate(RisReferenceImportState state, RisValue da) {
|
322
|
if (da == null){
|
323
|
return null;
|
324
|
}
|
325
|
if (! da.value.matches("([0-9]{4})?(\\/([0-9]{2})?(\\/([0-9]{2})?(\\/.*)?)?)?")){
|
326
|
String message = "Tag '%s' has incorrect format. Only exactly 'dddd/dd/dd/any text' is allowed (where d is a digit), but was '%s'";
|
327
|
message = String.format(message, da.tag.name(), da.value);
|
328
|
state.getResult().addWarning(message, null, da.location);
|
329
|
return null;
|
330
|
}
|
331
|
String[] split = da.value.split("/");
|
332
|
VerbatimTimePeriod tp = VerbatimTimePeriod.NewVerbatimInstance();
|
333
|
if (split.length > 0 && isNotBlank(split[0])){
|
334
|
tp.setStartYear(Integer.valueOf(split[0]));
|
335
|
}
|
336
|
if (split.length > 1 && isNotBlank(split[1])){
|
337
|
tp.setStartMonth(Integer.valueOf(split[1]));
|
338
|
}
|
339
|
if (split.length > 2 && isNotBlank(split[2])){
|
340
|
tp.setStartDay(Integer.valueOf(split[2]));
|
341
|
}
|
342
|
if (split.length > 3 && isNotBlank(split[3])){
|
343
|
List<String> other = Arrays.asList(split).subList(3, split.length);
|
344
|
String otherStr = CdmUtils.concat("/", other.toArray(new String[other.size()]));
|
345
|
tp.setFreeText(tp.toString() + " " + otherStr);
|
346
|
}
|
347
|
return tp;
|
348
|
}
|
349
|
|
350
|
private Integer makeYear(RisReferenceImportState state, RisValue py) {
|
351
|
if (py == null){
|
352
|
return null;
|
353
|
}
|
354
|
if (py.value.matches("[0-9]{4}")){
|
355
|
return Integer.valueOf(py.value);
|
356
|
}else{
|
357
|
String message = "Tag '%s' has incorrect format. Only exactly 4 digits are allowed, but was '%s'";
|
358
|
message = String.format(message, py.tag.name(), py.value);
|
359
|
state.getResult().addWarning(message, null, py.location);
|
360
|
return null;
|
361
|
}
|
362
|
}
|
363
|
|
364
|
private TeamOrPersonBase<?> makeAuthor(RisReferenceImportState state, List<RisValue> list) {
|
365
|
if (list.size() == 1){
|
366
|
return makePerson(state, list.get(0));
|
367
|
}else{
|
368
|
Team team = Team.NewInstance();
|
369
|
for (RisValue value : list){
|
370
|
team.addTeamMember(makePerson(state, value));
|
371
|
}
|
372
|
return team;
|
373
|
}
|
374
|
}
|
375
|
|
376
|
private Person makePerson(RisReferenceImportState state, RisValue risValue) {
|
377
|
Person person = Person.NewInstance();
|
378
|
String[] split = risValue.value.split(",");
|
379
|
if (split.length >= 1){
|
380
|
person.setFamilyName(split[0].trim());
|
381
|
}
|
382
|
if (split.length >= 2){
|
383
|
person.setGivenName(split[1].trim());
|
384
|
}
|
385
|
if (split.length >= 3){
|
386
|
person.setSuffix(split[2].trim());
|
387
|
}
|
388
|
|
389
|
return person;
|
390
|
}
|
391
|
|
392
|
/**
|
393
|
* Returns the single value for the given tag
|
394
|
* and removes the tag from the record.
|
395
|
* If more than 1 value exists this is logged
|
396
|
* as a warning.
|
397
|
*/
|
398
|
private RisValue getSingleValue(RisReferenceImportState state,
|
399
|
Map<RisReferenceTag, List<RisValue>> record,
|
400
|
RisReferenceTag tag) {
|
401
|
return getSingleValue(state, record, tag, true);
|
402
|
}
|
403
|
|
404
|
/**
|
405
|
* Returns the single value for the given tag
|
406
|
* and removes the tag from the record.
|
407
|
* If more than 1 value exists this is logged
|
408
|
* as a warning.
|
409
|
*/
|
410
|
private RisValue getSingleValue(RisReferenceImportState state,
|
411
|
Map<RisReferenceTag, List<RisValue>> record,
|
412
|
RisReferenceTag tag, boolean remove) {
|
413
|
List<RisValue> list = record.get(tag);
|
414
|
if (list == null){
|
415
|
return null;
|
416
|
}
|
417
|
assertSingle(state, list, tag);
|
418
|
if (remove){
|
419
|
record.remove(tag);
|
420
|
}
|
421
|
return list.get(0);
|
422
|
}
|
423
|
|
424
|
private List<RisValue> getListValue(Map<RisReferenceTag, List<RisValue>> record,
|
425
|
RisReferenceTag tag) {
|
426
|
List<RisValue> list = record.get(tag);
|
427
|
record.remove(tag);
|
428
|
if (list == null){
|
429
|
list = new ArrayList<>();
|
430
|
}
|
431
|
return list;
|
432
|
}
|
433
|
|
434
|
private void assertSingle(RisReferenceImportState state, List<RisValue> list, RisReferenceTag tag) {
|
435
|
if (list.size() > 1){
|
436
|
String message = "There is more than 1 tag '%s' but only 1 tag is supported by RIS format or"
|
437
|
+ " by the current import implementation.";
|
438
|
message = String.format(message, tag.name());
|
439
|
state.getResult().addWarning(message, null, list.get(0).location + "ff");
|
440
|
}else if (list.isEmpty()){
|
441
|
state.getResult().addError("A tag list was empty. This should not happen and is a programming code error");
|
442
|
}
|
443
|
}
|
444
|
|
445
|
private ReferenceType makeReferenceType(RisReferenceImportState state,
|
446
|
Map<RisReferenceTag, List<RisValue>> record) {
|
447
|
RisReferenceTag tyTag = RisReferenceTag.TY;
|
448
|
RisValue value = this.getSingleValue(state, record, tyTag, false);
|
449
|
String typeStr = value.value;
|
450
|
RisRecordType type = RisRecordType.valueOf(typeStr);
|
451
|
ReferenceType cdmType = type.getCdmReferenceType();
|
452
|
return cdmType;
|
453
|
}
|
454
|
|
455
|
@Override
|
456
|
protected boolean doCheck(RisReferenceImportState state) {
|
457
|
return true;
|
458
|
}
|
459
|
|
460
|
@Override
|
461
|
protected boolean isIgnore(RisReferenceImportState state) {
|
462
|
return false;
|
463
|
}
|
464
|
}
|