Project

General

Profile

« Previous | Next » 

Revision adff9167

Added by Andreas Kohlbecker over 7 years ago

ref #6026 type parsing

  • parsing of specimen types

View differences:

app-import/src/main/java/eu/etaxonomy/cdm/app/iapt/IAPTActivator.java
38 38
    public static final String DATA_FILE_0_100 = "iapt-100.xls";
39 39
    public static final String DATA_FILE_FULL = "Registration_DB_from_BGBM17.xls";
40 40
    public static final String DATA_ENCODING_PROBLEMS = "encoding-problems.xls";
41
    public static final String DATA_FILE = DATA_FILE_0_100;
41
    public static final String DATA_IAPT_TYPES_100 = "iapt-types-100.xls";
42
    public static final String DATA_FILE = DATA_FILE_FULL;
42 43

  
43 44
    //database validation status (create, update, validate ...)
44 45
    static DbSchemaValidation hbm2dll = DbSchemaValidation.CREATE;
app-import/src/main/java/eu/etaxonomy/cdm/io/iapt/IAPTExcelImport.java
13 13
import eu.etaxonomy.cdm.common.CdmUtils;
14 14
import eu.etaxonomy.cdm.io.mexico.SimpleExcelTaxonImport;
15 15
import eu.etaxonomy.cdm.io.mexico.SimpleExcelTaxonImportState;
16
import eu.etaxonomy.cdm.model.agent.Institution;
16 17
import eu.etaxonomy.cdm.model.common.*;
17 18
import eu.etaxonomy.cdm.model.name.*;
18
import eu.etaxonomy.cdm.model.occurrence.DerivedUnit;
19
import eu.etaxonomy.cdm.model.occurrence.FieldUnit;
20
import eu.etaxonomy.cdm.model.occurrence.SpecimenOrObservationType;
19
import eu.etaxonomy.cdm.model.occurrence.*;
20
import eu.etaxonomy.cdm.model.occurrence.Collection;
21 21
import eu.etaxonomy.cdm.model.reference.Reference;
22 22
import eu.etaxonomy.cdm.model.taxon.*;
23 23
import eu.etaxonomy.cdm.strategy.parser.NonViralNameParserImpl;
......
86 86
            Pattern.compile("^(?<year>(?:1[7,8,9])?[0-9]{2})([\\.\\-/])(?<month>[0-1]?[0-9])$"),//  partial date like 1999-04
87 87
            Pattern.compile("^(?<day>[0-9]{1,2})(?:[\\./]|th|rd)?\\s(?<monthName>\\p{L}+\\.?),?\\s(?<year>(?:1[7,8,9])?[0-9]{2})$"), // full date like 12. April 1969 or april 1999 or 22 Dec.1999
88 88
        };
89
    private static final Pattern typeSplitPattern =  Pattern.compile("^(?:\"*[Tt]ype: (?<type>.*?))(?:[Hh]olotype:(?<holotype>.*?))?(?:[Ii]sotype[^:]*:(?<isotype>.*))?$");
89
    private static final Pattern typeSplitPattern =  Pattern.compile("^(?:\"*[Tt]ype: (?<type>.*?))(?:[Hh]olotype:(?<holotype>.*?)\\.?)?(?:[Ii]sotype[^:]*:(?<isotype>.*)\\.?)?\\.?$");
90

  
91
    // AccessionNumbers: , #.*, n°:?, 96/3293, No..*, -?\w{1,3}-[0-9\-/]*
92
    private static final Pattern accessionNumberOnlyPattern = Pattern.compile("^(?<accNumber>(?:n°\\:?\\s?|#|No\\.?\\s?)?[\\d\\w\\-/]*)$");
93

  
94
    private static final Pattern[] specimenTypePatterns = new Pattern[]{
95
            Pattern.compile("^(?<colCode>[A-Z]+|CPC Micropaleontology Lab\\.?)\\s+(?:\\((?<institute>.*[^\\)])\\))(?<accNumber>.*)?$"), // like: GAUF (Gansu Agricultural University) No. 1207-1222
96
            Pattern.compile("^(?<colCode>[A-Z]+|CPC Micropaleontology Lab\\.?)\\s+(?:Coll\\.\\s(?<subCollection>[^\\.,;]*)(.))(?<accNumber>.*)?$"), // like KASSEL Coll. Krasske, Praep. DII 78
97
            Pattern.compile("^(?:Coll\\.\\s(?<subCollection>[^\\.,;]*)(.))(?<institute>.*)\\2(?<accNumber>.*)?$"), // like Coll. Lange-Bertalot, Bot. Inst., Univ. Frankfurt/Main, Germany Praep. Neukaledonien OTL 62
98
            Pattern.compile("^(?<colCode>[A-Z]+)(?:\\s+(?<accNumber>.*))?$"), // identifies the Collection code and takes the rest as accessionNumber if any
99
    };
90 100

  
91 101
    private static Map<String, Integer> monthFromNameMap = new HashMap<>();
102

  
92 103
    static {
93 104
        String[] ck = new String[]{"leden", "únor", "březen", "duben", "květen", "červen", "červenec ", "srpen", "září", "říjen", "listopad", "prosinec"};
94 105
        String[] fr = new String[]{"janvier", "février", "mars", "avril", "mai", "juin", "juillet", "août", "septembre", "octobre", "novembre", "décembre"};
......
115 126
        monthFromNameMap.put("Februari", 2);
116 127
    }
117 128

  
129

  
118 130
    DateTimeFormatter formatterYear = DateTimeFormat.forPattern("yyyy");
119 131

  
132
    private Map<String, Collection> collectionMap = new HashMap<>();
133

  
134

  
120 135
    enum TypesName {
121 136
        type, holotype, isotype;
122 137

  
......
179 194
            }
180 195
        }
181 196

  
182
        BotanicalName taxonName = makeBotanicalName(state, titleCacheStr, nameStr, authorStr, nomRefTitle);
197
        BotanicalName taxonName = makeBotanicalName(state, regNumber, titleCacheStr, nameStr, authorStr, nomRefTitle);
183 198

  
184 199
        // always add the original strings of parsed data as annotation
185 200
        taxonName.addAnnotation(Annotation.NewInstance("imported and parsed data strings:" +
......
231 246
        // Replaced Synonyms
232 247
        if(!StringUtils.isEmpty(fullSynSubstStr)){
233 248
            fullSynSubstStr = fullSynSubstStr.replace("Syn. subst.: ", "");
234
            BotanicalName replacedSynonymName = makeBotanicalName(state, fullSynSubstStr, synSubstStr, null, null);
249
            BotanicalName replacedSynonymName = makeBotanicalName(state, regNumber, fullSynSubstStr, synSubstStr, null, null);
235 250
            replacedSynonymName.addReplacedSynonym(taxonName, null, null, null);
236 251
            getNameService().save(replacedSynonymName);
237 252
        }
......
246 261

  
247 262
        // Types
248 263
        if(!StringUtils.isEmpty(typeStr)){
249
            Matcher m = typeSplitPattern.matcher(typeStr);
250

  
251
            if(m.matches()){
252
                String typeString = m.group(TypesName.type.name());
253
                boolean isFieldUnit = typeStr.matches(".*([°']|\\d+\\s?m\\s|\\d+\\s?km\\s).*"); // check for location or unit m, km
254

  
255
                if(isFieldUnit) {
256
                    // type as fieldUnit
257
                    FieldUnit fu = FieldUnit.NewInstance();
258
                    fu.setTitleCache(typeString, true);
259
                    getOccurrenceService().save(fu);
260

  
261
                    // all others ..
262
                    addSpecimenTypes(taxonName, fu, m.group(TypesName.holotype.name()), TypesName.holotype, false);
263
                    addSpecimenTypes(taxonName, fu, m.group(TypesName.isotype.name()), TypesName.isotype, true);
264
                } else {
265
                    TaxonNameBase typeName = nameParser.parseFullName(typeString);
266
                    taxonName.addNameTypeDesignation(typeName, null, null, null, NameTypeDesignationStatus.AUTOMATIC(), true, true, true, true);
267
                }
268
            }
269
            getNameService().save(taxonName);
270

  
264
            makeTypeData(typeStr, taxonName, regNumber);
271 265
        }
272 266

  
273 267
        getTaxonService().save(taxon);
......
280 274

  
281 275
    }
282 276

  
277
    private void makeTypeData(String typeStr, BotanicalName taxonName, String regNumber) {
278

  
279
        Matcher m = typeSplitPattern.matcher(typeStr);
280

  
281
        if(m.matches()){
282
            String typeString = m.group(TypesName.type.name());
283
            boolean isFieldUnit = typeStr.matches(".*([°']|\\d+\\s?m\\s|\\d+\\s?km\\s).*"); // check for location or unit m, km
284

  
285
            if(isFieldUnit) {
286
                // type as fieldUnit
287
                FieldUnit fu = FieldUnit.NewInstance();
288
                fu.setTitleCache(typeString, true);
289
                getOccurrenceService().save(fu);
290

  
291
                // all others ..
292
                addSpecimenTypes(taxonName, fu, m.group(TypesName.holotype.name()), TypesName.holotype, false, regNumber);
293
                addSpecimenTypes(taxonName, fu, m.group(TypesName.isotype.name()), TypesName.isotype, true, regNumber);
294
            } else {
295
                TaxonNameBase typeName = nameParser.parseFullName(typeString);
296
                taxonName.addNameTypeDesignation(typeName, null, null, null, NameTypeDesignationStatus.AUTOMATIC(), true, true, true, true);
297
            }
298
        }
299
        getNameService().save(taxonName);
300
    }
301

  
283 302
    private Partial parsePubDate(String regNumber, String nomRefStr, String nomRefPupDate) {
284 303

  
285 304
        Partial pupDate = null;
......
332 351
            }
333 352
        }
334 353
        if(nomRefPupYear == null){
335
            logger.warn("Pub date not found in [" + regNumber + "]: " + nomRefPupDate + " from " + nomRefStr );
354
            logger.warn(csvReportLine(regNumber, "Pub date", nomRefPupDate, "in", nomRefStr, "not parsable"));
336 355
            parseError = true;
337 356
        }
338 357
        List<DateTimeFieldType> types = new ArrayList<>();
......
357 376

  
358 377
        Integer month = monthFromNameMap.get(monthName.toLowerCase());
359 378
        if(month == null){
360
            logger.warn("Unknown month [" + regNumber + "]: " + monthName + " (" + monthName.toLowerCase() + ")");
379
            logger.warn(csvReportLine(regNumber, "Unknown month name", monthName));
361 380
            return null;
362 381
        } else {
363 382
            return month.toString();
......
365 384
    }
366 385

  
367 386

  
368
    private void addSpecimenTypes(BotanicalName taxonName, FieldUnit fieldUnit, String typeStr, TypesName typeName, boolean multiple){
387
    private void addSpecimenTypes(BotanicalName taxonName, FieldUnit fieldUnit, String typeStr, TypesName typeName, boolean multiple, String regNumber){
388

  
369 389
        if(StringUtils.isEmpty(typeStr)){
370 390
            return;
371 391
        }
372 392
        typeStr = typeStr.trim().replaceAll("\\.$", "");
373 393

  
374
        List<String> typeData = new ArrayList<>();
394
        Collection collection = null;
395
        DerivedUnit specimen = null;
396

  
397
        List<DerivedUnit> specimens = new ArrayList<>();
375 398
        if(multiple){
376 399
            String[] tokens = typeStr.split("\\s?,\\s?");
377 400
            for (String t : tokens) {
401
                // command to list all complex parsable types:
402
                // csvcut -t -c RegistrationNo_Pk,Type iapt.csv | csvgrep -c Type -m "Holotype" | egrep -o 'Holotype:\s([A-Z]*\s)[^.]*?'
403
                // csvcut -t -c RegistrationNo_Pk,Type iapt.csv | csvgrep -c Type -m "Holotype" | egrep -o 'Isotype[^:]*:\s([A-Z]*\s)[^.]*?'
404

  
378 405
                if(!t.isEmpty()){
379
                    typeData.add(t.trim());
406
                    // trying to parse the string
407
                    specimen = parseSpecimenType(fieldUnit, typeName, collection, t, regNumber);
408
                    if(specimen != null){
409
                        specimens.add(specimen);
410
                    } else {
411
                        // parsing was not successful make simple specimen
412
                        specimens.add(makeSpecimenType(fieldUnit, t));
413
                    }
380 414
                }
381 415
            }
382 416
        } else {
383
            typeData.add(typeStr.trim());
417
            specimen = parseSpecimenType(fieldUnit, typeName, collection, typeStr, regNumber);
418
            if(specimen != null) {
419
                specimens.add(specimen);
420
                // remember current collection
421
                collection = specimen.getCollection();
422
            } else {
423
                // parsing was not successful make simple specimen
424
                specimens.add(makeSpecimenType(fieldUnit, typeStr));
425
            }
384 426
        }
385 427

  
386
        for(String type : typeData){
387
            DerivedUnitFacade facade = DerivedUnitFacade.NewInstance(SpecimenOrObservationType.OtherSpecimen, fieldUnit);
388
            facade.setTitleCache(type, true);
389
            DerivedUnit specimen = facade.innerDerivedUnit();
390
            taxonName.addSpecimenTypeDesignation(specimen, typeName.status(), null, null, null, false, true);
428
        for(DerivedUnit s : specimens){
429
            taxonName.addSpecimenTypeDesignation(s, typeName.status(), null, null, null, false, true);
391 430
       }
392 431
    }
393 432

  
394
    private BotanicalName makeBotanicalName(SimpleExcelTaxonImportState<CONFIG> state, String titleCacheStr, String nameStr,
433
    private DerivedUnit makeSpecimenType(FieldUnit fieldUnit, String titleCache) {
434
        DerivedUnit specimen;DerivedUnitFacade facade = DerivedUnitFacade.NewInstance(SpecimenOrObservationType.PreservedSpecimen, fieldUnit);
435
        facade.setTitleCache(titleCache.trim(), true);
436
        specimen = facade.innerDerivedUnit();
437
        return specimen;
438
    }
439

  
440
    /**
441
     *
442
     * @param fieldUnit
443
     * @param typeName
444
     * @param collection
445
     * @param text
446
     * @param regNumber
447
     * @return
448
     */
449
    private DerivedUnit parseSpecimenType(FieldUnit fieldUnit, TypesName typeName, Collection collection, String text, String regNumber) {
450

  
451
        DerivedUnit specimen = null;
452

  
453
        String collectionCode = null;
454
        String subCollectionStr = null;
455
        String instituteStr = null;
456
        String accessionNumber = null;
457

  
458
        boolean unusualAccessionNumber = false;
459

  
460
        text = text.trim();
461

  
462
        // 1.  For Isotypes often the accession number is noted alone if the
463
        //     preceding entry has a collection code.
464
        if(typeName .equals(TypesName.isotype) && collection != null){
465
            Matcher m = accessionNumberOnlyPattern.matcher(text);
466
            if(m.matches()){
467
                try {
468
                    accessionNumber = m.group("accNumber");
469
                    specimen = makeSpecimenType(fieldUnit, collection, accessionNumber);
470
                } catch (IllegalArgumentException e){
471
                    // match group acc_number not found
472
                }
473
            }
474
        }
475

  
476
        //2. try it the 'normal' way
477
        if(specimen == null) {
478
            for (Pattern p : specimenTypePatterns) {
479
                Matcher m = p.matcher(text);
480
                if (m.matches()) {
481
                    // collection code is mandatory
482
                    try {
483
                        collectionCode = m.group("colCode");
484
                    } catch (IllegalArgumentException e){
485
                        logger.warn(csvReportLine(regNumber, "match group colCode not found"));
486
                        continue;
487
                    }
488
                    try {
489
                        subCollectionStr = m.group("subCollection");
490
                    } catch (IllegalArgumentException e){
491
                        // match group subCollection not found
492
                    }
493
                    try {
494
                        instituteStr = m.group("institute");
495
                    } catch (IllegalArgumentException e){
496
                        // match group institute not found
497
                    }
498
                    try {
499
                        accessionNumber = m.group("accNumber");
500

  
501
                        // try to improve the accessionNumber
502
                        if(accessionNumber!= null) {
503
                            accessionNumber = accessionNumber.trim();
504
                            Matcher m2 = accessionNumberOnlyPattern.matcher(accessionNumber);
505
                            String betterAccessionNumber = null;
506
                            if (m2.matches()) {
507
                                try {
508
                                    betterAccessionNumber = m.group("accNumber");
509
                                } catch (IllegalArgumentException e) {
510
                                    // match group acc_number not found
511
                                }
512
                            }
513
                            if (betterAccessionNumber != null) {
514
                                accessionNumber = betterAccessionNumber;
515
                            } else {
516
                                unusualAccessionNumber = true;
517
                            }
518
                        }
519

  
520
                    } catch (IllegalArgumentException e){
521
                        // match group acc_number not found
522
                    }
523

  
524
                    collection = getCollection(collectionCode, instituteStr, subCollectionStr);
525
                    specimen = makeSpecimenType(fieldUnit, collection, accessionNumber);
526
                    break;
527
                }
528
            }
529
        }
530
        if(specimen == null) {
531
            logger.warn(csvReportLine(regNumber, "Could not parse specimen type", typeName.name().toString(), text));
532
        }
533
        if(unusualAccessionNumber){
534
            logger.warn(csvReportLine(regNumber, "Unusual accession number", typeName.name().toString(), text, accessionNumber));
535
        }
536
        return specimen;
537
    }
538

  
539
    private DerivedUnit makeSpecimenType(FieldUnit fieldUnit, Collection collection, String accessionNumber) {
540

  
541
        DerivedUnitFacade facade = DerivedUnitFacade.NewInstance(SpecimenOrObservationType.PreservedSpecimen, fieldUnit);
542
        facade.setCollection(collection);
543
        facade.setAccessionNumber(accessionNumber);
544
        return facade.innerDerivedUnit();
545
    }
546

  
547
    private BotanicalName makeBotanicalName(SimpleExcelTaxonImportState<CONFIG> state, String regNumber, String titleCacheStr, String nameStr,
395 548
                                            String authorStr, String nomRefTitle) {
396 549

  
397 550
        BotanicalName taxonName;// cache field for the taxonName.titleCache
398 551
        String taxonNameTitleCache = null;
399 552
        Map<String, AnnotationType> nameAnnotations = new HashMap<>();
400 553

  
401
        String line = state.getCurrentLine() + ": ";
402

  
403 554
        // TitleCache preprocessing
404 555
        if(titleCacheStr.endsWith(ANNOTATION_MARKER_STRING) || (authorStr != null && authorStr.endsWith(ANNOTATION_MARKER_STRING))){
405 556
            nameAnnotations.put("Author abbreviation not checked.", AnnotationType.EDITORIAL());
......
419 570

  
420 571
        taxonNameTitleCache = taxonName.getTitleCache().trim();
421 572
        if (taxonName.isProtectedTitleCache()) {
422
            logger.warn(line + "Name could not be parsed: " + titleCacheStr);
573
            logger.warn(csvReportLine(regNumber, "Name could not be parsed", titleCacheStr));
423 574
        } else {
424 575

  
425 576
            boolean doRestoreTitleCacheStr = false;
......
440 591
                titleCacheCompareStr = titleCacheCompareStr.replaceAll(" et ", " & ");
441 592
            }
442 593
            if (!taxonNameTitleCache.equals(titleCacheCompareStr)) {
443
                logger.warn(line + "The generated titleCache differs from the imported string : " + taxonNameTitleCache + " <> " + titleCacheStr + " will restore original titleCacheStr");
594
                logger.warn(csvReportLine(regNumber, "The generated titleCache differs from the imported string", taxonNameTitleCache, " != ", titleCacheStr, " ==> original titleCacheStr has been restored"));
444 595
                doRestoreTitleCacheStr = true;
445 596
            }
446 597
            if (!nameCache.trim().equals(nameCompareStr)) {
447
                logger.warn(line + "The parsed nameCache differs from " + NAMESTRING + " : " + nameCache + " <> " + nameCompareStr);
598
                logger.warn(csvReportLine(regNumber, "The parsed nameCache differs from field '" + NAMESTRING + "'", nameCache, " != ", nameCompareStr));
448 599
            }
449 600

  
450 601
            //  Author
451 602
            //nameParser.handleAuthors(taxonName, titleCacheStr, authorStr);
452 603
            //if (!titleCacheStr.equals(taxonName.getTitleCache())) {
453
            //    logger.warn(line + "titleCache has changed after setting authors, will restore original titleCacheStr");
604
            //    logger.warn(regNumber + ": titleCache has changed after setting authors, will restore original titleCacheStr");
454 605
            //    doRestoreTitleCacheStr = true;
455 606
            //}
456 607

  
......
508 659
        return rootNode;
509 660
    }
510 661

  
662
    private Collection getCollection(String collectionCode, String instituteStr, String subCollectionStr){
663

  
664
        Collection superCollection = null;
665
        if(subCollectionStr != null){
666
            superCollection = getCollection(collectionCode, instituteStr, null);
667
            collectionCode = subCollectionStr;
668
            instituteStr = null;
669
        }
670

  
671
        final String key = collectionCode + "-#i:" + StringUtils.defaultString(instituteStr);
672

  
673
        Collection collection = collectionMap.get(key);
674

  
675
        if(collection == null) {
676
            collection = Collection.NewInstance();
677
            collection.setCode(collectionCode);
678
            if(instituteStr != null){
679
                collection.setInstitute(Institution.NewNamedInstance(instituteStr));
680
            }
681
            if(superCollection != null){
682
                collection.setSuperCollection(superCollection);
683
            }
684
            collectionMap.put(key, collection);
685
            getCollectionService().save(collection);
686
        }
687

  
688
        return collection;
689
    }
690

  
511 691

  
512 692
    /**
513 693
     * @param record
......
592 772
	@Override
593 773
    protected void firstPass(SimpleExcelTaxonImportState<CONFIG> state) {
594 774

  
595
        String lineNumber = state.getCurrentLine() + ": ";
775
        String lineNumber = "L#" + state.getCurrentLine() + ": ";
596 776
        logger.setLevel(Level.DEBUG);
597 777
        HashMap<String, String> record = state.getOriginalRecord();
598 778
        logger.debug(lineNumber + record.toString());
......
753 933
        return markerTypeFossil;
754 934
    }
755 935

  
936
    private String csvReportLine(String regId, String message, String ... fields){
937
        StringBuilder out = new StringBuilder("regID#");
938
        out.append(regId).append(",\"").append(message).append('"');
939

  
940
        for(String f : fields){
941
            out.append(",\"").append(f).append('"');
942
        }
943
        return out.toString();
944
    }
945

  
756 946

  
757 947
}

Also available in: Unified diff