Project

General

Profile

Download (23.2 KB) Statistics
| Branch: | Tag: | Revision:
1
/**
2
* Copyright (C) 2017 EDIT
3
* European Distributed Institute of Taxonomy
4
* http://www.e-taxonomy.eu
5
*
6
* The contents of this file are subject to the Mozilla Public License Version 1.1
7
* See LICENSE.TXT at the top of this package for the full license terms.
8
*/
9
package eu.etaxonomy.cdm.io.reference.ris.in;
10

    
11
import java.io.ByteArrayInputStream;
12
import java.io.InputStreamReader;
13
import java.util.ArrayList;
14
import java.util.Arrays;
15
import java.util.HashSet;
16
import java.util.List;
17
import java.util.Map;
18
import java.util.Set;
19

    
20
import org.apache.log4j.Logger;
21
import org.springframework.stereotype.Component;
22

    
23
import eu.etaxonomy.cdm.common.CdmUtils;
24
import eu.etaxonomy.cdm.common.DOI;
25
import eu.etaxonomy.cdm.common.URI;
26
import eu.etaxonomy.cdm.io.common.CdmImportBase;
27
import eu.etaxonomy.cdm.io.common.utils.ImportDeduplicationHelper;
28
import eu.etaxonomy.cdm.io.reference.ris.in.RisRecordReader.RisValue;
29
import eu.etaxonomy.cdm.model.agent.Person;
30
import eu.etaxonomy.cdm.model.agent.Team;
31
import eu.etaxonomy.cdm.model.agent.TeamOrPersonBase;
32
import eu.etaxonomy.cdm.model.common.Annotation;
33
import eu.etaxonomy.cdm.model.common.AnnotationType;
34
import eu.etaxonomy.cdm.model.common.Language;
35
import eu.etaxonomy.cdm.model.common.VerbatimTimePeriod;
36
import eu.etaxonomy.cdm.model.reference.Reference;
37
import eu.etaxonomy.cdm.model.reference.ReferenceFactory;
38
import eu.etaxonomy.cdm.model.reference.ReferenceType;
39
import eu.etaxonomy.cdm.strategy.parser.TimePeriodParser;
40

    
41
/**
42
 * @author a.mueller
43
 * @since 11.05.2017
44
 */
45
@Component
46
public class RisReferenceImport
47
        extends CdmImportBase<RisReferenceImportConfigurator, RisReferenceImportState>{
48

    
49
    private static final long serialVersionUID = 7022034669942979722L;
50
    @SuppressWarnings("unused")
51
    private static final Logger logger = Logger.getLogger(RisReferenceImport.class);
52

    
53
    @Override
54
    protected void doInvoke(RisReferenceImportState state) {
55
        RisReferenceImportConfigurator config = state.getConfig();
56
        try {
57
//            new FileReader(file)
58
            byte[] data = config.getStream();
59

    
60
            ByteArrayInputStream stream = new ByteArrayInputStream(data);
61
            InputStreamReader reader = new InputStreamReader(stream, "UTF-8");
62
            RisRecordReader risReader = new RisRecordReader(state, reader);
63

    
64
            Set<Reference> referencesToSave = new HashSet<>();
65

    
66
            Map<RisReferenceTag, List<RisValue>> next = risReader.readRecord();
67
            while (next != RisRecordReader.EOF){
68
                Reference ref;
69
                String location = "";
70
                try {
71
                    location = recordLocation(state, next);
72
                    ref = handleSingleReference(state, next);
73

    
74
                    Reference existingRef = state.getDeduplicationHelper().getExistingReference(ref);
75
                    if (existingRef == ref){ //reference does not yet exist so the identical reference has been returned
76
                        state.getDeduplicationHelper().replaceReferenceRelatedData(ref);
77
                        referencesToSave.add(ref);
78
                        if (ref.getInReference() != null){
79
                            referencesToSave.add(ref.getInReference());
80
                        }
81
                    }else{
82
                        //merge ?
83
                    }
84
                } catch (Exception e) {
85
                    String message = "Unexpected exception during RIS Reference Import";
86
                    state.getResult().addException(e, message, location);
87
                }
88

    
89
                next = risReader.readRecord();
90
            }
91

    
92
            //TODO handle result counts more generic
93
            state.getResult().addNewRecords(Reference.class.getSimpleName(), referencesToSave.size());
94
            for (Reference ref : referencesToSave){
95
                if (ref.getAuthorship() != null && !ref.getAuthorship().isPersited()){
96
                    TeamOrPersonBase<?> newAuthor = ref.getAuthorship();
97
                    state.getResult().addNewRecord(newAuthor);
98
                    if (newAuthor instanceof Team){
99
                        for (Person member : ((Team)newAuthor).getTeamMembers()){
100
                            if (!member.isPersited()){
101
                                state.getResult().addNewRecord(member);
102
                            }
103
                        }
104
                    }
105
                }
106
            }
107
            getReferenceService().saveOrUpdate(referencesToSave);
108

    
109
        } catch (Exception e) {
110
            String message = "Unexpected exception during RIS Reference Import";
111
            state.getResult().addException(e, message);
112
        }
113

    
114
        //unhandled
115
        Map<RisReferenceTag, Integer> unhandled = state.getUnhandled();
116
        for (RisReferenceTag tag : unhandled.keySet()){
117
            String message = "RIS tag %s (%s) not yet handled. n = %d";
118
            message = String .format(message, tag.name(), tag.getDescription(), unhandled.get(tag));
119
            state.getResult().addWarning(message);
120
        }
121
    }
122

    
123
    private Reference handleSingleReference(RisReferenceImportState state,
124
            Map<RisReferenceTag, List<RisValue>> record) {
125

    
126
        //type
127
        ReferenceType type = makeReferenceType(state, record);
128
        Reference ref = ReferenceFactory.newReference(type);
129
        Reference inRef = null;
130
        if (hasInRef(ref)){
131
            ReferenceType inRefType =
132
                    type == ReferenceType.Article ? ReferenceType.Journal:
133
                    type == ReferenceType.BookSection ? ReferenceType.Book :
134
                        ReferenceType.Generic;
135
            inRef = ReferenceFactory.newReference(inRefType);
136
            ref.setInReference(inRef);
137
        }
138
        Reference higherRef = (inRef == null) ? ref : inRef;
139

    
140
        //titles
141
        handleTitle(state, record, ref, inRef, higherRef);
142

    
143
        //authors
144
        handleAuthors(state, record, ref, inRef);
145

    
146
        //Date
147
//        RisValue y1 = getSingleValue(state, record, RisReferenceTag.Y1); //Primary Date
148
        RisValue py = getSingleValue(state, record, RisReferenceTag.PY);
149
        RisValue da = getSingleValue(state, record, RisReferenceTag.DA);
150
        Integer year = makeYear(state, py);
151
        VerbatimTimePeriod date = makeDate(state, da);
152
        date = assertDateYear(state, year, date, py);
153
        ref.setDatePublished(date);
154
        //TODO y1 not yet handled
155

    
156
        //Note
157
        RisValue n1 = getSingleValue(state, record, RisReferenceTag.N1); //Note
158
        if (n1 != null){
159
            Annotation annotation = Annotation.NewInstance(n1.value, AnnotationType.EDITORIAL(), Language.DEFAULT());
160
            ref.addAnnotation(annotation);
161
        }
162

    
163
        //DOI
164
        handleDoi(state, record, ref);
165

    
166
        //UR
167
        RisValue ur = getSingleValue(state, record, RisReferenceTag.UR); //URL
168
        if (ur != null){
169
            URI uri;
170
            try {
171
                String urStr = ur.value;
172
                uri = URI.create(urStr);
173
                ref.setUri(uri);
174
            } catch (Exception e) {
175
                String message = "URL could not be recognized: " + ur.value;
176
                state.getResult().addWarning(message, null, ur.location);
177
            }
178
        }
179

    
180
        //Pages
181
        RisValue sp = getSingleValue(state, record, RisReferenceTag.SP);
182
        RisValue ep = getSingleValue(state, record, RisReferenceTag.EP);
183
        String pages = CdmUtils.concat("-", sp != null ? sp.value : null, ep != null ? ep.value : null);
184
        ref.setPages(pages);
185

    
186
        //Volume
187
        RisValue vl = getSingleValue(state, record, RisReferenceTag.VL);
188
        RisValue is = getSingleValue(state, record, RisReferenceTag.IS);
189
        String vol = (vl == null)? "": vl.value + (is != null ? "("+ is.value + ")": "");
190
        if (inRef != null && inRef.getType() == ReferenceType.Book){
191
            inRef.setVolume(vol);
192
        }else{
193
            ref.setVolume(vol);
194
        }
195

    
196
        //Publisher
197
        RisValue pb = getSingleValue(state, record, RisReferenceTag.PB);
198
        if (pb != null){
199
            higherRef.setPublisher(pb.value);
200
        }
201

    
202
        //CY - Place published
203
        RisValue cy = getSingleValue(state, record, RisReferenceTag.CY);
204
        if (cy != null){
205
            higherRef.setPlacePublished(cy.value);
206
        }
207

    
208
        //Abstract
209
        RisValue ab = getSingleValue(state, record, RisReferenceTag.AB);
210
        RisValue n2 = getSingleValue(state, record, RisReferenceTag.N2);
211
        RisValue abst = assertEqual(state, "Abstract", ab, n2);
212
        if (abst != null){
213
            ref.setReferenceAbstract(abst.value);
214
        }
215

    
216
        //ISSN/ISBN
217
        RisValue sn = getSingleValue(state, record, RisReferenceTag.SN);
218
        if (sn != null){
219
            if (higherRef.getType() == ReferenceType.Journal){
220
                higherRef.setIssn(sn.value);
221
            }else{
222
                higherRef.setIsbn(sn.value);
223
            }
224
        }
225

    
226
        //ID
227
        RisValue id = getSingleValue(state, record, RisReferenceTag.ID);
228
        String idStr = id != null? id.value: null;
229
        String recLoc = recordLocation(state, record);
230
        ref.addImportSource(idStr, null, state.getConfig().getSourceReference(), recLoc);
231
        if (inRef != null){
232
            inRef.addImportSource(idStr, null, state.getConfig().getSourceReference(), recLoc);
233
        }
234

    
235
        //remove
236
        record.remove(RisReferenceTag.ER);
237
        record.remove(RisReferenceTag.TY);
238

    
239
        for (RisReferenceTag tag : record.keySet()){
240
//            String message = "RIS Tag " + tag.name() +  " not yet handled";
241
//            state.getResult().addWarning(message, record.get(tag).get(0).location);
242
            state.addUnhandled(tag);
243

    
244
            //TODO add as annotation or extension
245
        }
246

    
247
        return ref;
248
    }
249

    
250
    private void handleDoi(RisReferenceImportState state, Map<RisReferenceTag, List<RisValue>> record, Reference ref) {
251
        RisValue doiVal = getSingleValue(state, record, RisReferenceTag.DO); //Doi
252
        if (doiVal != null){
253
            DOI doi;
254
            try {
255
                String doiStr = doiVal.value;
256
                if (doiStr.toLowerCase().startsWith("doi ")){
257
                    doiStr = doiStr.substring(4).trim();
258
                }
259
                doi = DOI.fromString(doiStr);
260
                ref.setDoi(doi);
261
            } catch (IllegalArgumentException e) {
262
                String message = "DOI could not be recognized: " + doiVal.value;
263
                state.getResult().addWarning(message, null, doiVal.location);
264
            }
265
        }
266
    }
267

    
268
    private void handleTitle(RisReferenceImportState state, Map<RisReferenceTag, List<RisValue>> record, Reference ref,
269
            Reference inRef, Reference higherRef) {
270
        //Title
271
        RisValue t1 = getSingleValue(state, record, RisReferenceTag.T1);
272
        RisValue ti = getSingleValue(state, record, RisReferenceTag.TI);
273
        RisValue title = assertEqual(state, "title", t1, ti);
274
        if (title != null){
275
            ref.setTitle(title.value);
276
        }
277

    
278
        //Journal title
279
        RisValue t2 = getSingleValue(state, record, RisReferenceTag.T2); //Secondary Title (journal title, if applicable)
280

    
281
        if (higherRef.getType() == ReferenceType.Journal){
282
            RisValue jf = getSingleValue(state, record, RisReferenceTag.JF); //Journal/Periodical name: full format. This is an alphanumeric field of up to 255 characters.
283
            RisValue jo = getSingleValue(state, record, RisReferenceTag.JO); //Journal/Periodical name: full format. This is an alphanumeric field of up to 255 characters.
284
            RisValue jf_jo = assertEqual(state, "Journal/Periodical name: full format", jf, jo);
285
            RisValue journalTitle = assertEqual(state, "Journal title", t2, jf_jo);
286
            if (journalTitle != null){
287
                higherRef.setTitle(journalTitle.value);
288
            }
289
        }else if (t2 != null && inRef != null){
290
            inRef.setTitle(t2.value);
291
        }else if (t2 != null){
292
            String message = "The tag %s ('%s') exists but the reference type usually has no in-reference."
293
                    + "This part of the title was neglected: %s";
294
            message = String.format(message, t2.tag.name(), t2.tag.getDescription(), t2.value);
295
            state.getResult().addWarning(message, null, t2.location);
296
        }else if (inRef != null){
297
            String message = "The reference type typically has an inreference but no secondary title (tag T2) was given.";
298
            state.getResult().addWarning(message, null, (title != null)? title.location : null);
299
        }
300

    
301
        //ST  (remove as same as TI or T1), not handled otherwise
302
        RisValue st = getSingleValue(state, record, RisReferenceTag.ST, false); //Short title
303
        if (st != null && st.value.equals(ref.getTitle())){
304
            record.remove(RisReferenceTag.ST);
305
        }
306
    }
307

    
308
    private void handleAuthors(RisReferenceImportState state, Map<RisReferenceTag, List<RisValue>> record,
309
            Reference ref, Reference inRef) {
310
        List<RisValue> authorList = getListValue(record, RisReferenceTag.AU);
311
        if (!authorList.isEmpty()){
312
            TeamOrPersonBase<?> author = makeAuthor(state, authorList, record);
313
            ref.setAuthorship(author);
314
        }
315
        List<RisValue> secondaryAuthorList = getListValue(record, RisReferenceTag.A2);
316
        if (!secondaryAuthorList.isEmpty()){
317
            if (inRef != null){
318
                if (inRef.getType() != ReferenceType.Journal){
319
                    TeamOrPersonBase<?> secAuthor = makeAuthor(state, secondaryAuthorList, record);
320
                    inRef.setAuthorship(secAuthor);
321
                }else{
322
                    String message = "The tag %s ('%s') exists but the in-reference type is 'journal' which typically has no author."
323
                            + "The secondary author(s) was/were neglected: %s";
324
                    message = String.format(message, RisReferenceTag.AU.name(), RisReferenceTag.AU.getDescription(), secondaryAuthorList.toString());
325
                    state.getResult().addWarning(message, null, secondaryAuthorList.get(0).location);
326
                }
327
            }else{
328
                String message = "The tag %s ('%s') exists but the reference type usually has no in-reference."
329
                        + "The secondary author(s) was/were neglected: %s";
330
                message = String.format(message, RisReferenceTag.AU.name(), RisReferenceTag.AU.getDescription(), secondaryAuthorList.toString());
331
                state.getResult().addWarning(message, null, secondaryAuthorList.get(0).location);
332
            }
333
        }
334
    }
335

    
336
    private boolean hasInRef(Reference ref) {
337
        return ref.getType() == ReferenceType.BookSection || ref.getType() == ReferenceType.Article ;
338
    }
339

    
340
    private String recordLocation(RisReferenceImportState state,
341
            Map<RisReferenceTag, List<RisValue>> record) {
342
        RisValue typeTag = this.getSingleValue(state, record, RisReferenceTag.TY, false);
343
        RisValue erTag = this.getSingleValue(state, record, RisReferenceTag.ER, false);
344

    
345
        String start = typeTag == null ? "??" : typeTag.location;
346
        String end = erTag == null ? "??" : erTag.location;
347

    
348
        String result = CdmUtils.concat(" - ", start, end);
349

    
350
        return result;
351
    }
352

    
353
    private VerbatimTimePeriod assertDateYear(RisReferenceImportState state, Integer year, VerbatimTimePeriod date, RisValue py) {
354
        if (year == null && date == null){
355
            return null;
356
        }else if (year == null){
357
            return date;
358
        }else if (date == null){
359
            return TimePeriodParser.parseStringVerbatim(String.valueOf(year));
360
        }else{
361
            if  (!year.equals(date.getStartYear())){
362
                if (date.getStartYear() == null){
363
                    date.setStartYear(year);
364
                }else if (isNotBlank(date.getFreeText())){
365
                    date.setStartYear(year);  //does this happen at all?
366
                    String message = "Year 'PY' and date 'DA' are not consistent. PY is neglected.";
367
                    state.getResult().addWarning(message, null, py.location);
368
                    return date;
369
                }else{
370
                    String message = "Year 'PY' and date 'DA' are not consistent. DA is used for freetext and PY is used for (start) year.";
371
                    state.getResult().addWarning(message, null, py.location);
372
                    return date;
373
                }
374
            }
375
            return date;
376
        }
377
    }
378

    
379
    /**
380
     * If val1 and val2 are both <code>not null</code> and not equal a warning is logged.
381
     * @return val1 if val1 is not null, val2 otherwise
382
     */
383
    private RisValue assertEqual(RisReferenceImportState state, String meaning, RisValue val1, RisValue val2) {
384
        if (val1 != null && val2 != null && !val1.value.equals(val2.value)){
385
            String message = "The tags '%s' and '%s' are not equal but have a similar meaning ('%s'). "
386
                    + "%s was used and %s neglected";
387
            message = String.format(message, val1.tag.name(), val2.tag.name(), meaning , val1.tag.name(), val2.tag.name());
388
            state.getResult().addWarning(message, null, val1.location);
389
        }
390
        return val1 != null ? val1 : val2;
391
    }
392

    
393
    private VerbatimTimePeriod makeDate(RisReferenceImportState state, RisValue da) {
394
        if (da == null){
395
            return null;
396
        }
397
        if (! da.value.matches("([0-9]{4})?(\\/([0-9]{2})?(\\/([0-9]{2})?(\\/.*)?)?)?")){
398
            String message = "Tag '%s' has incorrect format. Only exactly 'dddd/dd/dd/any text' is allowed (where d is a digit), but was '%s'";
399
            message = String.format(message, da.tag.name(), da.value);
400
            state.getResult().addWarning(message, null, da.location);
401
            return null;
402
        }
403
        String[] split = da.value.split("/");
404
        VerbatimTimePeriod tp = VerbatimTimePeriod.NewVerbatimInstance();
405
        if (split.length > 0 && isNotBlank(split[0])){
406
            tp.setStartYear(Integer.valueOf(split[0]));
407
        }
408
        if (split.length > 1 && isNotBlank(split[1])){
409
            tp.setStartMonth(Integer.valueOf(split[1]));
410
        }
411
        if (split.length > 2 && isNotBlank(split[2])){
412
            tp.setStartDay(Integer.valueOf(split[2]));
413
        }
414
        if (split.length > 3 && isNotBlank(split[3])){
415
            List<String> other = Arrays.asList(split).subList(3, split.length);
416
            String otherStr = CdmUtils.concat("/", other.toArray(new String[other.size()]));
417
            tp.setFreeText(tp.toString() + " " + otherStr);
418
        }
419
        return tp;
420
    }
421

    
422
    private Integer makeYear(RisReferenceImportState state, RisValue py) {
423
        if (py == null){
424
            return null;
425
        }
426
        if (py.value.matches("[0-9]{4}")){
427
            return Integer.valueOf(py.value);
428
        }else{
429
            String message = "Tag '%s' has incorrect format. Only exactly 4 digits are allowed, but was '%s'";
430
            message = String.format(message, py.tag.name(), py.value);
431
            state.getResult().addWarning(message, null, py.location);
432
            return null;
433
        }
434
    }
435

    
436
    private TeamOrPersonBase<?> makeAuthor(RisReferenceImportState state, List<RisValue> list, Map<RisReferenceTag, List<RisValue>> record) {
437
        if (list.size() == 1){
438
            return makePerson(state, list.get(0), record);
439
        }else{
440
            Team team = Team.NewInstance();
441

    
442
            for (RisValue value : list){
443
                team.addTeamMember(makePerson(state, value, record));
444
            }
445

    
446
            //source
447
            String recordLocation = recordLocation(state, record);
448
            team.addImportSource(null, null, state.getConfig().getSourceReference(), recordLocation);
449

    
450
            return team;
451
        }
452
    }
453

    
454
    private Person makePerson(RisReferenceImportState state, RisValue risValue, Map<RisReferenceTag, List<RisValue>> record) {
455
        Person person = Person.NewInstance();
456
        String[] split = risValue.value.split(",");
457
        if (split.length >= 1){
458
            person.setFamilyName(split[0].trim());
459
        }
460
        if (split.length >= 2){
461
            String givenNameOrInitial = split[1].trim();
462
            if (givenNameOrInitial.matches("[A-Za-z]\\.(\\s*[A-Za-z]\\.)*")){
463
                person.setInitials(givenNameOrInitial);
464
            }else{
465
                person.setGivenName(givenNameOrInitial);
466
            }
467
        }
468
        if (split.length >= 3){
469
            person.setSuffix(split[2].trim());
470
        }
471

    
472
        //source
473
        String recordLocation = recordLocation(state, record);
474
        person.addImportSource(null, null, state.getConfig().getSourceReference(), recordLocation);
475

    
476
        return person;
477
    }
478

    
479
    /**
480
     * Returns the single value for the given tag
481
     * and removes the tag from the record.
482
     * If more than 1 value exists this is logged
483
     * as a warning.
484
     */
485
    private RisValue getSingleValue(RisReferenceImportState state,
486
            Map<RisReferenceTag, List<RisValue>> record,
487
            RisReferenceTag tag) {
488
        return getSingleValue(state, record, tag, true);
489
    }
490

    
491
    /**
492
     * Returns the single value for the given tag
493
     * and removes the tag from the record.
494
     * If more than 1 value exists this is logged
495
     * as a warning.
496
     */
497
    private RisValue getSingleValue(RisReferenceImportState state,
498
            Map<RisReferenceTag, List<RisValue>> record,
499
            RisReferenceTag tag, boolean remove) {
500

    
501
        List<RisValue> list = record.get(tag);
502
        if (list == null){
503
            return null;
504
        }
505
        assertSingle(state, list, tag);
506
        if (remove){
507
            record.remove(tag);
508
        }
509
        return list.get(0);
510
    }
511

    
512
    private List<RisValue> getListValue(Map<RisReferenceTag, List<RisValue>> record,
513
            RisReferenceTag tag) {
514

    
515
        List<RisValue> list = record.get(tag);
516
        record.remove(tag);
517
        if (list == null){
518
            list = new ArrayList<>();
519
        }
520
        return list;
521
    }
522

    
523
    private void assertSingle(RisReferenceImportState state, List<RisValue> list, RisReferenceTag tag) {
524
        if (list.size() > 1){
525
            String message = "There is more than 1 tag '%s' but only 1 tag is supported by RIS format or"
526
                    + " by the current import implementation.";
527
            message = String.format(message, tag.name());
528
            state.getResult().addWarning(message, null, list.get(0).location + "ff");
529
        }else if (list.isEmpty()){
530
            state.getResult().addError("A tag list was empty. This should not happen and is a programming code error");
531
        }
532
    }
533

    
534
    private ReferenceType makeReferenceType(RisReferenceImportState state,
535
            Map<RisReferenceTag, List<RisValue>> record) {
536
        RisReferenceTag tyTag = RisReferenceTag.TY;
537
        RisValue value = this.getSingleValue(state, record, tyTag, false);
538
        String typeStr = value.value;
539
        RisRecordType type = RisRecordType.valueOf(typeStr);
540
        ReferenceType cdmType = type.getCdmReferenceType();
541
        return cdmType;
542
    }
543

    
544
    @Override
545
    public ImportDeduplicationHelper createDeduplicationHelper(RisReferenceImportState state){
546
        ImportDeduplicationHelper result = super.createDeduplicationHelper(state);
547
        result.setMaxCountFullLoad(state.getConfig().getDeduplicationMaxCountForFullLoad());
548
        return result;
549
    }
550

    
551
    @Override
552
    protected boolean doCheck(RisReferenceImportState state) {
553
        return true;
554
    }
555

    
556
    @Override
557
    protected boolean isIgnore(RisReferenceImportState state) {
558
        return false;
559
    }
560
}
(3-3/6)