Project

General

Profile

Download (22 KB) Statistics
| Branch: | Tag: | Revision:
1
/**
2
* Copyright (C) 2017 EDIT
3
* European Distributed Institute of Taxonomy
4
* http://www.e-taxonomy.eu
5
*
6
* The contents of this file are subject to the Mozilla Public License Version 1.1
7
* See LICENSE.TXT at the top of this package for the full license terms.
8
*/
9
package eu.etaxonomy.cdm.io.reference.ris.in;
10

    
11
import java.io.ByteArrayInputStream;
12
import java.io.InputStreamReader;
13
import java.util.ArrayList;
14
import java.util.Arrays;
15
import java.util.HashSet;
16
import java.util.List;
17
import java.util.Map;
18
import java.util.Set;
19

    
20
import org.apache.log4j.Logger;
21
import org.springframework.stereotype.Component;
22

    
23
import eu.etaxonomy.cdm.common.CdmUtils;
24
import eu.etaxonomy.cdm.common.DOI;
25
import eu.etaxonomy.cdm.common.URI;
26
import eu.etaxonomy.cdm.io.common.CdmImportBase;
27
import eu.etaxonomy.cdm.io.common.utils.ImportDeduplicationHelper;
28
import eu.etaxonomy.cdm.io.reference.ris.in.RisRecordReader.RisValue;
29
import eu.etaxonomy.cdm.model.agent.Person;
30
import eu.etaxonomy.cdm.model.agent.Team;
31
import eu.etaxonomy.cdm.model.agent.TeamOrPersonBase;
32
import eu.etaxonomy.cdm.model.common.Annotation;
33
import eu.etaxonomy.cdm.model.common.AnnotationType;
34
import eu.etaxonomy.cdm.model.common.Language;
35
import eu.etaxonomy.cdm.model.common.VerbatimTimePeriod;
36
import eu.etaxonomy.cdm.model.reference.Reference;
37
import eu.etaxonomy.cdm.model.reference.ReferenceFactory;
38
import eu.etaxonomy.cdm.model.reference.ReferenceType;
39
import eu.etaxonomy.cdm.strategy.parser.TimePeriodParser;
40

    
41
/**
42
 * @author a.mueller
43
 * @since 11.05.2017
44
 */
45
@Component
46
public class RisReferenceImport
47
        extends CdmImportBase<RisReferenceImportConfigurator, RisReferenceImportState>{
48

    
49
    private static final long serialVersionUID = 7022034669942979722L;
50
    @SuppressWarnings("unused")
51
    private static final Logger logger = Logger.getLogger(RisReferenceImport.class);
52

    
53
    @Override
54
    protected void doInvoke(RisReferenceImportState state) {
55
        RisReferenceImportConfigurator config = state.getConfig();
56
        try {
57
//            new FileReader(file)
58
            byte[] data = config.getStream();
59

    
60
            ByteArrayInputStream stream = new ByteArrayInputStream(data);
61
            InputStreamReader reader = new InputStreamReader(stream, "UTF-8");
62
            RisRecordReader risReader = new RisRecordReader(state, reader);
63

    
64
            Set<Reference> referencesToSave = new HashSet<>();
65

    
66
            Map<RisReferenceTag, List<RisValue>> next = risReader.readRecord();
67
            while (next != RisRecordReader.EOF){
68
                Reference ref;
69
                String location = "";
70
                try {
71
                    location = recordLocation(state, next);
72
                    ref = handleSingleReference(state, next);
73

    
74
                    Reference existingRef = state.getDeduplicationHelper().getExistingReference(ref);
75
                    if (existingRef == ref){ //reference does not yet exist so the identical reference has been returned
76
                        state.getDeduplicationHelper().replaceReferenceRelatedData(ref);
77
                        referencesToSave.add(ref);
78
                        if (ref.getInReference() != null){
79
                            referencesToSave.add(ref.getInReference());
80
                        }
81
                    }else{
82
                        //merge ?
83
                    }
84
                } catch (Exception e) {
85
                    String message = "Unexpected exception during RIS Reference Import";
86
                    state.getResult().addException(e, message, location);
87
                }
88

    
89
                next = risReader.readRecord();
90
            }
91

    
92

    
93
            getReferenceService().saveOrUpdate(referencesToSave);
94
            state.getResult().addNewRecords(Reference.class.getSimpleName(), referencesToSave.size());
95

    
96
        } catch (Exception e) {
97
            String message = "Unexpected exception during RIS Reference Import";
98
            state.getResult().addException(e, message);
99
        }
100

    
101
        //unhandled
102
        Map<RisReferenceTag, Integer> unhandled = state.getUnhandled();
103
        for (RisReferenceTag tag : unhandled.keySet()){
104
            String message = "RIS tag %s (%s) not yet handled. n = %d";
105
            message = String .format(message, tag.name(), tag.getDescription(), unhandled.get(tag));
106
            state.getResult().addWarning(message);
107
        }
108
    }
109

    
110
    private Reference handleSingleReference(RisReferenceImportState state,
111
            Map<RisReferenceTag, List<RisValue>> record) {
112

    
113
        //type
114
        ReferenceType type = makeReferenceType(state, record);
115
        Reference ref = ReferenceFactory.newReference(type);
116
        Reference inRef = null;
117
        if (hasInRef(ref)){
118
            ReferenceType inRefType =
119
                    type == ReferenceType.Article ? ReferenceType.Journal:
120
                    type == ReferenceType.BookSection ? ReferenceType.Book :
121
                        ReferenceType.Generic;
122
            inRef = ReferenceFactory.newReference(inRefType);
123
            ref.setInReference(inRef);
124
        }
125
        Reference higherRef = (inRef == null) ? ref : inRef;
126

    
127
        //titles
128
        handleTitle(state, record, ref, inRef, higherRef);
129

    
130
        //authors
131
        handleAuthors(state, record, ref, inRef);
132

    
133
        //Date
134
//        RisValue y1 = getSingleValue(state, record, RisReferenceTag.Y1); //Primary Date
135
        RisValue py = getSingleValue(state, record, RisReferenceTag.PY);
136
        RisValue da = getSingleValue(state, record, RisReferenceTag.DA);
137
        Integer year = makeYear(state, py);
138
        VerbatimTimePeriod date = makeDate(state, da);
139
        date = assertDateYear(state, year, date, py);
140
        ref.setDatePublished(date);
141
        //TODO y1 not yet handled
142

    
143
        //Note
144
        RisValue n1 = getSingleValue(state, record, RisReferenceTag.N1); //Note
145
        if (n1 != null){
146
            Annotation annotation = Annotation.NewInstance(n1.value, AnnotationType.EDITORIAL(), Language.DEFAULT());
147
            ref.addAnnotation(annotation);
148
        }
149

    
150
        //DOI
151
        handleDoi(state, record, ref);
152

    
153
        //UR
154
        RisValue ur = getSingleValue(state, record, RisReferenceTag.UR); //URL
155
        if (ur != null){
156
            URI uri;
157
            try {
158
                String urStr = ur.value;
159
                uri = URI.create(urStr);
160
                ref.setUri(uri);
161
            } catch (Exception e) {
162
                String message = "URL could not be recognized: " + ur.value;
163
                state.getResult().addWarning(message, null, ur.location);
164
            }
165
        }
166

    
167
        //Pages
168
        RisValue sp = getSingleValue(state, record, RisReferenceTag.SP);
169
        RisValue ep = getSingleValue(state, record, RisReferenceTag.EP);
170
        String pages = CdmUtils.concat("-", sp != null ? sp.value : null, ep != null ? ep.value : null);
171
        ref.setPages(pages);
172

    
173
        //Volume
174
        RisValue vl = getSingleValue(state, record, RisReferenceTag.VL);
175
        RisValue is = getSingleValue(state, record, RisReferenceTag.IS);
176
        String vol = (vl == null)? "": vl.value + (is != null ? "("+ is.value + ")": "");
177
        if (inRef != null && inRef.getType() == ReferenceType.Book){
178
            inRef.setVolume(vol);
179
        }else{
180
            ref.setVolume(vol);
181
        }
182

    
183
        //Publisher
184
        RisValue pb = getSingleValue(state, record, RisReferenceTag.PB);
185
        if (pb != null){
186
            higherRef.setPublisher(pb.value);
187
        }
188

    
189
        //CY - Place published
190
        RisValue cy = getSingleValue(state, record, RisReferenceTag.CY);
191
        if (cy != null){
192
            higherRef.setPlacePublished(cy.value);
193
        }
194

    
195
        //Abstract
196
        RisValue ab = getSingleValue(state, record, RisReferenceTag.AB);
197
        RisValue n2 = getSingleValue(state, record, RisReferenceTag.N2);
198
        RisValue abst = assertEqual(state, "Abstract", ab, n2);
199
        if (abst != null){
200
            ref.setReferenceAbstract(abst.value);
201
        }
202

    
203
        //ISSN/ISBN
204
        RisValue sn = getSingleValue(state, record, RisReferenceTag.SN);
205
        if (sn != null){
206
            if (higherRef.getType() == ReferenceType.Journal){
207
                higherRef.setIssn(sn.value);
208
            }else{
209
                higherRef.setIsbn(sn.value);
210
            }
211
        }
212

    
213
        //ID
214
        RisValue id = getSingleValue(state, record, RisReferenceTag.ID);
215
        String idStr = id != null? id.value: null;
216
        String recLoc = recordLocation(state, record);
217
        ref.addImportSource(idStr, null, state.getConfig().getSourceReference(), recLoc);
218
        if (inRef != null){
219
            inRef.addImportSource(idStr, null, state.getConfig().getSourceReference(), recLoc);
220
        }
221

    
222
        //remove
223
        record.remove(RisReferenceTag.ER);
224
        record.remove(RisReferenceTag.TY);
225

    
226
        for (RisReferenceTag tag : record.keySet()){
227
//            String message = "RIS Tag " + tag.name() +  " not yet handled";
228
//            state.getResult().addWarning(message, record.get(tag).get(0).location);
229
            state.addUnhandled(tag);
230

    
231
            //TODO add as annotation or extension
232
        }
233

    
234
        return ref;
235
    }
236

    
237
    private void handleDoi(RisReferenceImportState state, Map<RisReferenceTag, List<RisValue>> record, Reference ref) {
238
        RisValue doiVal = getSingleValue(state, record, RisReferenceTag.DO); //Doi
239
        if (doiVal != null){
240
            DOI doi;
241
            try {
242
                String doiStr = doiVal.value;
243
                if (doiStr.toLowerCase().startsWith("doi ")){
244
                    doiStr = doiStr.substring(4).trim();
245
                }
246
                doi = DOI.fromString(doiStr);
247
                ref.setDoi(doi);
248
            } catch (IllegalArgumentException e) {
249
                String message = "DOI could not be recognized: " + doiVal.value;
250
                state.getResult().addWarning(message, null, doiVal.location);
251
            }
252
        }
253
    }
254

    
255
    private void handleTitle(RisReferenceImportState state, Map<RisReferenceTag, List<RisValue>> record, Reference ref,
256
            Reference inRef, Reference higherRef) {
257
        //Title
258
        RisValue t1 = getSingleValue(state, record, RisReferenceTag.T1);
259
        RisValue ti = getSingleValue(state, record, RisReferenceTag.TI);
260
        RisValue title = assertEqual(state, "title", t1, ti);
261
        if (title != null){
262
            ref.setTitle(title.value);
263
        }
264

    
265
        //Journal title
266
        RisValue t2 = getSingleValue(state, record, RisReferenceTag.T2); //Secondary Title (journal title, if applicable)
267

    
268
        if (higherRef.getType() == ReferenceType.Journal){
269
            RisValue jf = getSingleValue(state, record, RisReferenceTag.JF); //Journal/Periodical name: full format. This is an alphanumeric field of up to 255 characters.
270
            RisValue jo = getSingleValue(state, record, RisReferenceTag.JO); //Journal/Periodical name: full format. This is an alphanumeric field of up to 255 characters.
271
            RisValue jf_jo = assertEqual(state, "Journal/Periodical name: full format", jf, jo);
272
            RisValue journalTitle = assertEqual(state, "Journal title", t2, jf_jo);
273
            if (journalTitle != null){
274
                higherRef.setTitle(journalTitle.value);
275
            }
276
        }else if (t2 != null && inRef != null){
277
            inRef.setTitle(t2.value);
278
        }else if (t2 != null){
279
            String message = "The tag %s ('%s') exists but the reference type usually has no in-reference."
280
                    + "This part of the title was neglected: %s";
281
            message = String.format(message, t2.tag.name(), t2.tag.getDescription(), t2.value);
282
            state.getResult().addWarning(message, null, t2.location);
283
        }else if (inRef != null){
284
            String message = "The reference type typically has an inreference but no secondary title (tag T2) was given.";
285
            state.getResult().addWarning(message, null, (title != null)? title.location : null);
286
        }
287

    
288
        //ST  (remove as same as TI or T1), not handled otherwise
289
        RisValue st = getSingleValue(state, record, RisReferenceTag.ST, false); //Short title
290
        if (st != null && st.value.equals(ref.getTitle())){
291
            record.remove(RisReferenceTag.ST);
292
        }
293
    }
294

    
295
    private void handleAuthors(RisReferenceImportState state, Map<RisReferenceTag, List<RisValue>> record,
296
            Reference ref, Reference inRef) {
297
        List<RisValue> authorList = getListValue(record, RisReferenceTag.AU);
298
        if (!authorList.isEmpty()){
299
            TeamOrPersonBase<?> author = makeAuthor(state, authorList);
300
            ref.setAuthorship(author);
301
        }
302
        List<RisValue> secondaryAuthorList = getListValue(record, RisReferenceTag.A2);
303
        if (!secondaryAuthorList.isEmpty()){
304
            if (inRef != null){
305
                if (inRef.getType() != ReferenceType.Journal){
306
                    TeamOrPersonBase<?> secAuthor = makeAuthor(state, secondaryAuthorList);
307
                    inRef.setAuthorship(secAuthor);
308
                }else{
309
                    String message = "The tag %s ('%s') exists but the in-reference type is 'journal' which typically has no author."
310
                            + "The secondary author(s) was/were neglected: %s";
311
                    message = String.format(message, RisReferenceTag.AU.name(), RisReferenceTag.AU.getDescription(), secondaryAuthorList.toString());
312
                    state.getResult().addWarning(message, null, secondaryAuthorList.get(0).location);
313
                }
314
            }else{
315
                String message = "The tag %s ('%s') exists but the reference type usually has no in-reference."
316
                        + "The secondary author(s) was/were neglected: %s";
317
                message = String.format(message, RisReferenceTag.AU.name(), RisReferenceTag.AU.getDescription(), secondaryAuthorList.toString());
318
                state.getResult().addWarning(message, null, secondaryAuthorList.get(0).location);
319
            }
320
        }
321
    }
322

    
323
    private boolean hasInRef(Reference ref) {
324
        return ref.getType() == ReferenceType.BookSection || ref.getType() == ReferenceType.Article ;
325
    }
326

    
327
    private String recordLocation(RisReferenceImportState state,
328
            Map<RisReferenceTag, List<RisValue>> record) {
329
        RisValue typeTag = this.getSingleValue(state, record, RisReferenceTag.TY, false);
330
        RisValue erTag = this.getSingleValue(state, record, RisReferenceTag.ER, false);
331

    
332
        String start = typeTag == null ? "??" : typeTag.location;
333
        String end = erTag == null ? "??" : erTag.location;
334

    
335
        String result = "line " + CdmUtils.concat(" - ", start, end);
336

    
337
        return result;
338
    }
339

    
340
    private VerbatimTimePeriod assertDateYear(RisReferenceImportState state, Integer year, VerbatimTimePeriod date, RisValue py) {
341
        if (year == null && date == null){
342
            return null;
343
        }else if (year == null){
344
            return date;
345
        }else if (date == null){
346
            return TimePeriodParser.parseStringVerbatim(String.valueOf(year));
347
        }else{
348
            if  (!year.equals(date.getStartYear())){
349
                if (date.getStartYear() == null){
350
                    date.setStartYear(year);
351
                }else if (isNotBlank(date.getFreeText())){
352
                    date.setStartYear(year);  //does this happen at all?
353
                    String message = "Year 'PY' and date 'DA' are not consistent. PY is neglected.";
354
                    state.getResult().addWarning(message, null, py.location);
355
                    return date;
356
                }else{
357
                    String message = "Year 'PY' and date 'DA' are not consistent. DA is used for freetext and PY is used for (start) year.";
358
                    state.getResult().addWarning(message, null, py.location);
359
                    return date;
360
                }
361
            }
362
            return date;
363
        }
364
    }
365

    
366
    /**
367
     * If val1 and val2 are both <code>not null</code> and not equal a warning is logged.
368
     * @return val1 if val1 is not null, val2 otherwise
369
     */
370
    private RisValue assertEqual(RisReferenceImportState state, String meaning, RisValue val1, RisValue val2) {
371
        if (val1 != null && val2 != null && !val1.value.equals(val2.value)){
372
            String message = "The tags '%s' and '%s' are not equal but have a similar meaning ('%s'). "
373
                    + "%s was used and %s neglected";
374
            message = String.format(message, val1.tag.name(), val2.tag.name(), meaning , val1.tag.name(), val2.tag.name());
375
            state.getResult().addWarning(message, null, val1.location);
376
        }
377
        return val1 != null ? val1 : val2;
378
    }
379

    
380
    private VerbatimTimePeriod makeDate(RisReferenceImportState state, RisValue da) {
381
        if (da == null){
382
            return null;
383
        }
384
        if (! da.value.matches("([0-9]{4})?(\\/([0-9]{2})?(\\/([0-9]{2})?(\\/.*)?)?)?")){
385
            String message = "Tag '%s' has incorrect format. Only exactly 'dddd/dd/dd/any text' is allowed (where d is a digit), but was '%s'";
386
            message = String.format(message, da.tag.name(), da.value);
387
            state.getResult().addWarning(message, null, da.location);
388
            return null;
389
        }
390
        String[] split = da.value.split("/");
391
        VerbatimTimePeriod tp = VerbatimTimePeriod.NewVerbatimInstance();
392
        if (split.length > 0 && isNotBlank(split[0])){
393
            tp.setStartYear(Integer.valueOf(split[0]));
394
        }
395
        if (split.length > 1 && isNotBlank(split[1])){
396
            tp.setStartMonth(Integer.valueOf(split[1]));
397
        }
398
        if (split.length > 2 && isNotBlank(split[2])){
399
            tp.setStartDay(Integer.valueOf(split[2]));
400
        }
401
        if (split.length > 3 && isNotBlank(split[3])){
402
            List<String> other = Arrays.asList(split).subList(3, split.length);
403
            String otherStr = CdmUtils.concat("/", other.toArray(new String[other.size()]));
404
            tp.setFreeText(tp.toString() + " " + otherStr);
405
        }
406
        return tp;
407
    }
408

    
409
    private Integer makeYear(RisReferenceImportState state, RisValue py) {
410
        if (py == null){
411
            return null;
412
        }
413
        if (py.value.matches("[0-9]{4}")){
414
            return Integer.valueOf(py.value);
415
        }else{
416
            String message = "Tag '%s' has incorrect format. Only exactly 4 digits are allowed, but was '%s'";
417
            message = String.format(message, py.tag.name(), py.value);
418
            state.getResult().addWarning(message, null, py.location);
419
            return null;
420
        }
421
    }
422

    
423
    private TeamOrPersonBase<?> makeAuthor(RisReferenceImportState state, List<RisValue> list) {
424
        if (list.size() == 1){
425
            return makePerson(state, list.get(0));
426
        }else{
427
            Team team = Team.NewInstance();
428
            for (RisValue value : list){
429
                team.addTeamMember(makePerson(state, value));
430
            }
431
            return team;
432
        }
433
    }
434

    
435
    private Person makePerson(RisReferenceImportState state, RisValue risValue) {
436
        Person person = Person.NewInstance();
437
        String[] split = risValue.value.split(",");
438
        if (split.length >= 1){
439
            person.setFamilyName(split[0].trim());
440
        }
441
        if (split.length >= 2){
442
            String givenNameOrInitial = split[1].trim();
443
            if (givenNameOrInitial.matches("[A-Za-z]\\.(\\s*[A-Za-z]\\.)*")){
444
                person.setInitials(givenNameOrInitial);
445
            }else{
446
                person.setGivenName(givenNameOrInitial);
447
            }
448
        }
449
        if (split.length >= 3){
450
            person.setSuffix(split[2].trim());
451
        }
452

    
453
        return person;
454
    }
455

    
456
    /**
457
     * Returns the single value for the given tag
458
     * and removes the tag from the record.
459
     * If more than 1 value exists this is logged
460
     * as a warning.
461
     */
462
    private RisValue getSingleValue(RisReferenceImportState state,
463
            Map<RisReferenceTag, List<RisValue>> record,
464
            RisReferenceTag tag) {
465
        return getSingleValue(state, record, tag, true);
466
    }
467

    
468
    /**
469
     * Returns the single value for the given tag
470
     * and removes the tag from the record.
471
     * If more than 1 value exists this is logged
472
     * as a warning.
473
     */
474
    private RisValue getSingleValue(RisReferenceImportState state,
475
            Map<RisReferenceTag, List<RisValue>> record,
476
            RisReferenceTag tag, boolean remove) {
477

    
478
        List<RisValue> list = record.get(tag);
479
        if (list == null){
480
            return null;
481
        }
482
        assertSingle(state, list, tag);
483
        if (remove){
484
            record.remove(tag);
485
        }
486
        return list.get(0);
487
    }
488

    
489
    private List<RisValue> getListValue(Map<RisReferenceTag, List<RisValue>> record,
490
            RisReferenceTag tag) {
491

    
492
        List<RisValue> list = record.get(tag);
493
        record.remove(tag);
494
        if (list == null){
495
            list = new ArrayList<>();
496
        }
497
        return list;
498
    }
499

    
500
    private void assertSingle(RisReferenceImportState state, List<RisValue> list, RisReferenceTag tag) {
501
        if (list.size() > 1){
502
            String message = "There is more than 1 tag '%s' but only 1 tag is supported by RIS format or"
503
                    + " by the current import implementation.";
504
            message = String.format(message, tag.name());
505
            state.getResult().addWarning(message, null, list.get(0).location + "ff");
506
        }else if (list.isEmpty()){
507
            state.getResult().addError("A tag list was empty. This should not happen and is a programming code error");
508
        }
509
    }
510

    
511
    private ReferenceType makeReferenceType(RisReferenceImportState state,
512
            Map<RisReferenceTag, List<RisValue>> record) {
513
        RisReferenceTag tyTag = RisReferenceTag.TY;
514
        RisValue value = this.getSingleValue(state, record, tyTag, false);
515
        String typeStr = value.value;
516
        RisRecordType type = RisRecordType.valueOf(typeStr);
517
        ReferenceType cdmType = type.getCdmReferenceType();
518
        return cdmType;
519
    }
520

    
521
    @Override
522
    public ImportDeduplicationHelper createDeduplicationHelper(RisReferenceImportState state){
523
        ImportDeduplicationHelper result = super.createDeduplicationHelper(state);
524
        result.setMaxCountFullLoad(state.getConfig().getDeduplicationMaxCountForFullLoad());
525
        return result;
526
    }
527

    
528
    @Override
529
    protected boolean doCheck(RisReferenceImportState state) {
530
        return true;
531
    }
532

    
533
    @Override
534
    protected boolean isIgnore(RisReferenceImportState state) {
535
        return false;
536
    }
537
}
(3-3/6)