Project

General

Profile

« Previous | Next » 

Revision 1968849d

Added by Andreas Kohlbecker over 7 years ago

ref #6026 better type parsing

  • FieldUnit: collector, fieldnumber, date
  • Specimen: Collection, Subcollection, accession number. Does not work in all cases.

View differences:

app-import/src/main/java/eu/etaxonomy/cdm/app/iapt/IAPTActivator.java
39 39
    public static final String DATA_FILE_FULL = "Registration_DB_from_BGBM17.xls";
40 40
    public static final String DATA_ENCODING_PROBLEMS = "encoding-problems.xls";
41 41
    public static final String DATA_IAPT_TYPES_100 = "iapt-types-100.xls";
42
    public static final String DATA_FILE = DATA_FILE_FULL;
42
    public static final String DATA_TYPE_LEG_100 = "iapt-type-leg-100.xls";
43
    public static final String DATA_FILE = DATA_TYPE_LEG_100;   
43 44

  
44 45
    //database validation status (create, update, validate ...)
45 46
    static DbSchemaValidation hbm2dll = DbSchemaValidation.CREATE;
app-import/src/main/java/eu/etaxonomy/cdm/io/iapt/IAPTExcelImport.java
14 14
import eu.etaxonomy.cdm.io.mexico.SimpleExcelTaxonImport;
15 15
import eu.etaxonomy.cdm.io.mexico.SimpleExcelTaxonImportState;
16 16
import eu.etaxonomy.cdm.model.agent.Institution;
17
import eu.etaxonomy.cdm.model.agent.Person;
18
import eu.etaxonomy.cdm.model.agent.TeamOrPersonBase;
17 19
import eu.etaxonomy.cdm.model.common.*;
18 20
import eu.etaxonomy.cdm.model.name.*;
19 21
import eu.etaxonomy.cdm.model.occurrence.*;
......
73 75
            REGISTRATIONNO_PK, HIGHERTAXON, FULLNAME, AUTHORSSPELLING, LITSTRING, REGISTRATION, TYPE, CAVEATS, FULLBASIONYM, FULLSYNSUBST, NOTESTXT, REGDATE, NAMESTRING, BASIONYMSTRING, SYNSUBSTSTR, AUTHORSTRING});
74 76

  
75 77
    private static final Pattern nomRefTokenizeP = Pattern.compile("^(.*):\\s([^\\.:]+)\\.(.*?)\\.?$");
76
    private static final Pattern[] nomRefPubDatePs = new Pattern[]{
78
    private static final Pattern[] datePatterns = new Pattern[]{
77 79
            // NOTE:
78 80
            // The order of the patterns is extremely important!!!
79 81
            //
80 82
            // all patterns cover the years 1700 - 1999
81 83
            Pattern.compile("^(?<year>1[7,8,9][0-9]{2})$"), // only year, like '1969'
82 84
            Pattern.compile("^(?<monthName>\\p{L}+\\.?)\\s(?<day>[0-9]{1,2})(?:st|rd|th)?\\.?,?\\s(?<year>(?:1[7,8,9])?[0-9]{2})$"), // full date like April 12, 1969 or april 12th 1999
83
            Pattern.compile("^(?<monthName>\\p{L}+\\.?),?\\s(?<year>(?:1[7,8,9])?[0-9]{2})$"), // April 99 or April, 1999 or Apr. 12
84
            Pattern.compile("^(?<day>[0-9]{1,2})([\\.\\-/])(?<month>[0-1]?[0-9])\\2(?<year>(?:1[7,8,9])?[0-9]{2})$"), // full date like 12.04.1969 or 12/04/1969 or 12-04-1969
85
            Pattern.compile("^(?<monthName>\\p{L}+\\.?),?\\s?(?<year>(?:1[7,8,9])?[0-9]{2})$"), // April 99 or April, 1999 or Apr. 12
86
            Pattern.compile("^(?<day>[0-9]{1,2})([\\.\\-/])(\\s?)(?<month>[0-1]?[0-9])\\2\\3(?<year>(?:1[7,8,9])?[0-9]{2})$"), // full date like 12.04.1969 or 12. 04. 1969 or 12/04/1969 or 12-04-1969
85 87
            Pattern.compile("^(?<day>[0-9]{1,2})([\\.\\-/])(?<month>[IVX]{1,2})\\2(?<year>(?:1[7,8,9])?[0-9]{2})$"), // full date like 12-VI-1969
86
            Pattern.compile("^(?:(?<day>[0-9]{1,2})\\sde\\s)(?<monthName>\\p{L}+)\\sde\\s(?<year>(?:1[7,8,9])?[0-9]{2})$"), // full and partial date like 12 de Enero de 1999 or Enero de 1999
88
            Pattern.compile("^(?:(?<day>[0-9]{1,2})(?:\\sde)\\s)(?<monthName>\\p{L}+)\\sde\\s(?<year>(?:1[7,8,9])?[0-9]{2})$"), // full and partial date like 12 de Enero de 1999 or Enero de 1999
87 89
            Pattern.compile("^(?<month>[0-1]?[0-9])([\\.\\-/])(?<year>(?:1[7,8,9])?[0-9]{2})$"), // partial date like 04.1969 or 04/1969 or 04-1969
88 90
            Pattern.compile("^(?<year>(?:1[7,8,9])?[0-9]{2})([\\.\\-/])(?<month>[0-1]?[0-9])$"),//  partial date like 1999-04
89
            Pattern.compile("^(?<day>[0-9]{1,2})(?:[\\./]|th|rd)?\\s(?<monthName>\\p{L}+\\.?),?\\s(?<year>(?:1[7,8,9])?[0-9]{2})$"), // full date like 12. April 1969 or april 1999 or 22 Dec.1999
91
            Pattern.compile("^(?<month>[IVX]{1,2})([\\.\\-/])(?<year>(?:1[7,8,9])?[0-9]{2})$"), // partial date like VI-1969
92
            Pattern.compile("^(?<day>[0-9]{1,2})(?:[\\./]|th|rd|st)?\\s(?<monthName>\\p{L}+\\.?),?\\s?(?<year>(?:1[7,8,9])?[0-9]{2})$"), // full date like 12. April 1969 or april 1999 or 22 Dec.1999
90 93
        };
91
    private static final Pattern typeSplitPattern =  Pattern.compile("^(?:\"*[Tt]ype: (?<type>.*?))(?:[Hh]olotype:(?<holotype>.*?)\\.?)?(?:[Ii]sotype[^:]*:(?<isotype>.*)\\.?)?\\.?$");
94
    private static final Pattern typeSplitPattern =  Pattern.compile("^(?:\"*[Tt]ype: (?<fieldUnit>.*?))(?:[Hh]olotype:(?<holotype>.*?)\\.?)?(?:[Ii]sotype[^:]*:(?<isotype>.*)\\.?)?\\.?$");
95

  
96
    private static final Pattern collectorPattern =  Pattern.compile(".*?\\(leg\\.\\s+([^\\)]*)\\)|.*?\\sleg\\.\\s+(.*?)\\.?$");
97
    private static final Pattern collectionDataPattern =  Pattern.compile("^(?<collector>[^,]*),\\s?(?<detail>.*?)\\.?$");
98
    private static final Pattern collectorsNumber =  Pattern.compile("^([nN]o\\.\\s.*)$");
92 99

  
93 100
    // AccessionNumbers: , #.*, n°:?, 96/3293, No..*, -?\w{1,3}-[0-9\-/]*
94 101
    private static final Pattern accessionNumberOnlyPattern = Pattern.compile("^(?<accNumber>(?:n°\\:?\\s?|#|No\\.?\\s?)?[\\d\\w\\-/]*)$");
......
96 103
    private static final Pattern[] specimenTypePatterns = new Pattern[]{
97 104
            Pattern.compile("^(?<colCode>[A-Z]+|CPC Micropaleontology Lab\\.?)\\s+(?:\\((?<institute>.*[^\\)])\\))(?<accNumber>.*)?$"), // like: GAUF (Gansu Agricultural University) No. 1207-1222
98 105
            Pattern.compile("^(?<colCode>[A-Z]+|CPC Micropaleontology Lab\\.?)\\s+(?:Coll\\.\\s(?<subCollection>[^\\.,;]*)(.))(?<accNumber>.*)?$"), // like KASSEL Coll. Krasske, Praep. DII 78
99
            Pattern.compile("^(?:Coll\\.\\s(?<subCollection>[^\\.,;]*)(.))(?<institute>.*)\\2(?<accNumber>.*)?$"), // like Coll. Lange-Bertalot, Bot. Inst., Univ. Frankfurt/Main, Germany Praep. Neukaledonien OTL 62
106
            Pattern.compile("^(?:Coll\\.\\s(?<subCollection>[^\\.,;]*)(.))(?<institute>.*?)(?<accNumber>Praep\\..*)?$"), // like Coll. Lange-Bertalot, Bot. Inst., Univ. Frankfurt/Main, Germany Praep. Neukaledonien OTL 62
100 107
            Pattern.compile("^(?<colCode>[A-Z]+)(?:\\s+(?<accNumber>.*))?$"), // identifies the Collection code and takes the rest as accessionNumber if any
101 108
    };
102 109

  
......
135 142

  
136 143

  
137 144
    enum TypesName {
138
        type, holotype, isotype;
145
        fieldUnit, holotype, isotype;
139 146

  
140 147
        public SpecimenTypeDesignationStatus status(){
141 148
            switch (this) {
......
187 194
                nomRefDetail = m.group(2);
188 195
                nomRefPupDate = m.group(3).trim();
189 196

  
190
                pupDate = parsePubDate(regNumber, nomRefStr, nomRefPupDate);
197
                pupDate = parseDate(regNumber, nomRefPupDate);
191 198
                if (pupDate != null) {
192 199
                    nomRefTitle = nomRefTitle + ": " + nomRefDetail + ". " + pupDate.toString(formatterYear) + ".";
200
                } else {
201
                    logger.warn(csvReportLine(regNumber, "Pub date", nomRefPupDate, "in", nomRefStr, "not parsable"));
193 202
                }
194 203
            } else {
195 204
                nomRefTitle = nomRefStr;
......
263 272

  
264 273
        // Types
265 274
        if(!StringUtils.isEmpty(typeStr)){
266
            makeTypeData(typeStr, taxonName, regNumber);
275
            makeTypeData(typeStr, taxonName, regNumber, state);
267 276
        }
268 277

  
269 278
        getTaxonService().save(taxon);
......
276 285

  
277 286
    }
278 287

  
279
    private void makeTypeData(String typeStr, BotanicalName taxonName, String regNumber) {
288
    private void makeTypeData(String typeStr, BotanicalName taxonName, String regNumber, SimpleExcelTaxonImportState<CONFIG> state) {
280 289

  
281 290
        Matcher m = typeSplitPattern.matcher(typeStr);
282 291

  
283 292
        if(m.matches()){
284
            String typeString = m.group(TypesName.type.name());
285
            boolean isFieldUnit = typeStr.matches(".*([°']|\\d+\\s?m\\s|\\d+\\s?km\\s).*"); // check for location or unit m, km
286

  
287
            if(isFieldUnit) {
288
                // type as fieldUnit
289
                FieldUnit fu = FieldUnit.NewInstance();
290
                fu.setTitleCache(typeString, true);
291
                getOccurrenceService().save(fu);
292

  
293
                // all others ..
294
                addSpecimenTypes(taxonName, fu, m.group(TypesName.holotype.name()), TypesName.holotype, false, regNumber);
295
                addSpecimenTypes(taxonName, fu, m.group(TypesName.isotype.name()), TypesName.isotype, true, regNumber);
296
            } else {
297
                TaxonNameBase typeName = nameParser.parseFullName(typeString);
298
                taxonName.addNameTypeDesignation(typeName, null, null, null, NameTypeDesignationStatus.AUTOMATIC(), true, true, true, true);
293
            String fieldUnitStr = m.group(TypesName.fieldUnit.name());
294
            // boolean isFieldUnit = typeStr.matches(".*([°']|\\d+\\s?m\\s|\\d+\\s?km\\s).*"); // check for location or unit m, km // makes no sense!!!!
295
            FieldUnit fieldUnit = parseFieldUnit(fieldUnitStr, regNumber, state);
296
            if(fieldUnit == null) {
297
                // create a field unit with only a titleCache using the fieldUnitStr substring
298
                fieldUnit = FieldUnit.NewInstance();
299
                fieldUnit.setTitleCache(fieldUnitStr, true);
300
                getOccurrenceService().save(fieldUnit);
299 301
            }
302
            getOccurrenceService().save(fieldUnit);
303

  
304
            // all others ..
305
            addSpecimenTypes(taxonName, fieldUnit, m.group(TypesName.holotype.name()), TypesName.holotype, false, regNumber);
306
            addSpecimenTypes(taxonName, fieldUnit, m.group(TypesName.isotype.name()), TypesName.isotype, true, regNumber);
307

  
308
        } else {
309
            // create a field unit with only a titleCache using the full typeStr
310
            FieldUnit fieldUnit = FieldUnit.NewInstance();
311
            fieldUnit.setTitleCache(typeStr, true);
312
            getOccurrenceService().save(fieldUnit);
313
            logger.warn(csvReportLine(regNumber, "Type field can not be parsed", typeStr));
300 314
        }
301 315
        getNameService().save(taxonName);
302 316
    }
303 317

  
304
    private Partial parsePubDate(String regNumber, String nomRefStr, String nomRefPupDate) {
318
    /**
319
     * Currently only parses the collector, fieldNumber and the collection date.
320
     *
321
     * @param fieldUnitStr
322
     * @param regNumber
323
     * @param state
324
     * @return null if the fieldUnitStr could not be parsed
325
     */
326
    private FieldUnit parseFieldUnit(String fieldUnitStr, String regNumber, SimpleExcelTaxonImportState<CONFIG> state) {
327

  
328
        FieldUnit fieldUnit = null;
329

  
330
        Matcher m1 = collectorPattern.matcher(fieldUnitStr);
331
        if(m1.matches()){
332
            String collectionData = m1.group(1); // like (leg. Metzeltin, 30. 9. 1996)
333
            if(collectionData == null){
334
                collectionData = m1.group(2); // like leg. Metzeltin, 30. 9. 1996
335
            }
336
            if(collectionData == null){
337
                return null;
338
            }
339

  
340
            String collectorStr = null;
341
            String detailStr = null;
342
            Partial date = null;
343
            String fieldNumber = null;
344

  
345
            Matcher m2 = collectionDataPattern.matcher(collectionData);
346
            if(m2.matches()){
347
                collectorStr = m2.group("collector");
348
                detailStr = m2.group("detail");
349

  
350
                // Try to make sense of the detailStr
351
                if(detailStr != null){
352
                    detailStr = detailStr.trim();
353
                    // 1. try to parse as date
354
                    date = parseDate(regNumber, detailStr);
355
                    if(date == null){
356
                        // 2. try to parse as number
357
                        if(collectorsNumber.matcher(detailStr).matches()){
358
                            fieldNumber = detailStr;
359
                        }
360
                    }
361
                }
362
                if(date == null && fieldNumber == null){
363
                    // detailed parsing not possible, so need to fall back
364
                    collectorStr = collectionData;
365
                }
366
            }
367

  
368
            if(collectorStr != null) {
369
                fieldUnit = FieldUnit.NewInstance();
370
                GatheringEvent ge = GatheringEvent.NewInstance();
371

  
372
                TeamOrPersonBase agent =  state.getAgentBase(collectorStr);
373
                if(agent == null) {
374
                    agent = Person.NewTitledInstance(collectorStr);
375
                    getAgentService().save(agent);
376
                    state.putAgentBase(collectorStr, agent);
377
                }
378
                ge.setCollector(agent);
379

  
380
                if(date != null){
381
                    ge.setGatheringDate(date);
382
                }
383

  
384
                getEventBaseService().save(ge);
385
                fieldUnit.setGatheringEvent(ge);
386

  
387
                if(fieldNumber != null) {
388
                    fieldUnit.setFieldNumber(fieldNumber);
389
                }
390
                getOccurrenceService().save(fieldUnit);
391
            }
392
        }
393

  
394
        return fieldUnit;
395
    }
396

  
397
    private Partial parseDate(String regNumber, String dateStr) {
305 398

  
306 399
        Partial pupDate = null;
307 400
        boolean parseError = false;
308
        String nomRefPupDay = null;
309
        String nomRefPupMonth = null;
310
        String nomRefPupMonthName = null;
311
        String nomRefPupYear = null;
312 401

  
402
        String day = null;
403
        String month = null;
404
        String monthName = null;
405
        String year = null;
313 406

  
314
        // nomRefDetail.replaceAll("[\\:\\.\\s]", ""); // TODO integrate into nomRefTokenizeP
315
        for(Pattern p : nomRefPubDatePs){
316
            Matcher m2 = p.matcher(nomRefPupDate);
407
        for(Pattern p : datePatterns){
408
            Matcher m2 = p.matcher(dateStr);
317 409
            if(m2.matches()){
318 410
                try {
319
                    nomRefPupYear = m2.group("year");
411
                    year = m2.group("year");
320 412
                } catch (IllegalArgumentException e){
321 413
                    // named capture group not found
322 414
                }
323 415
                try {
324
                    nomRefPupMonth = m2.group("month");
416
                    month = m2.group("month");
325 417
                } catch (IllegalArgumentException e){
326 418
                    // named capture group not found
327 419
                }
420

  
328 421
                try {
329
                    nomRefPupMonthName = m2.group("monthName");
330
                    nomRefPupMonth = monthFromName(nomRefPupMonthName, regNumber);
331
                    if(nomRefPupMonth == null){
422
                    monthName = m2.group("monthName");
423
                    month = monthFromName(monthName, regNumber);
424
                    if(month == null){
332 425
                        parseError = true;
333 426
                    }
334 427
                } catch (IllegalArgumentException e){
335 428
                    // named capture group not found
336 429
                }
337 430
                try {
338
                    nomRefPupDay = m2.group("day");
431
                    day = m2.group("day");
339 432
                } catch (IllegalArgumentException e){
340 433
                    // named capture group not found
341 434
                }
342 435

  
343
                if(nomRefPupYear == null){
344
                    logger.error("nomRefPupYear in " + nomRefStr + " is  NULL" );
436
                if(year != null){
437
                    if (year.length() == 2) {
438
                        // it is an abbreviated year from the 19** years
439
                        year = "19" + year;
440
                    }
441
                    break;
442
                } else {
345 443
                    parseError = true;
346 444
                }
347
                if(nomRefPupYear.length() == 2 ){
348
                    // it is an abbreviated year from the 19** years
349
                    nomRefPupYear = "19" + nomRefPupYear;
350
                }
351

  
352
                break;
353 445
            }
354 446
        }
355
        if(nomRefPupYear == null){
356
            logger.warn(csvReportLine(regNumber, "Pub date", nomRefPupDate, "in", nomRefStr, "not parsable"));
447
        if(year == null){
357 448
            parseError = true;
358 449
        }
359 450
        List<DateTimeFieldType> types = new ArrayList<>();
360 451
        List<Integer> values = new ArrayList<>();
361 452
        if(!parseError) {
362 453
            types.add(DateTimeFieldType.year());
363
            values.add(Integer.parseInt(nomRefPupYear));
364
            if (nomRefPupMonth != null) {
454
            values.add(Integer.parseInt(year));
455
            if (month != null) {
365 456
                types.add(DateTimeFieldType.monthOfYear());
366
                values.add(Integer.parseInt(nomRefPupMonth));
457
                values.add(Integer.parseInt(month));
367 458
            }
368
            if (nomRefPupDay != null) {
459
            if (day != null) {
369 460
                types.add(DateTimeFieldType.dayOfMonth());
370
                values.add(Integer.parseInt(nomRefPupDay));
461
                values.add(Integer.parseInt(day));
371 462
            }
372 463
            pupDate = new Partial(types.toArray(new DateTimeFieldType[types.size()]), ArrayUtils.toPrimitive(values.toArray(new Integer[values.size()])));
373 464
        }
......
484 575
                    try {
485 576
                        collectionCode = m.group("colCode");
486 577
                    } catch (IllegalArgumentException e){
487
                        logger.warn(csvReportLine(regNumber, "match group colCode not found"));
488
                        continue;
578
                        // match group colCode not found
489 579
                    }
490 580
                    try {
491 581
                        subCollectionStr = m.group("subCollection");
......
523 613
                        // match group acc_number not found
524 614
                    }
525 615

  
616
                    if(collectionCode == null && instituteStr == null){
617
                        logger.warn(csvReportLine(regNumber, "neither 'collectionCode' nor 'institute' found in ", text));
618
                        continue;
619
                    }
526 620
                    collection = getCollection(collectionCode, instituteStr, subCollectionStr);
527 621
                    specimen = makeSpecimenType(fieldUnit, collection, accessionNumber);
528 622
                    break;
......
530 624
            }
531 625
        }
532 626
        if(specimen == null) {
533
            logger.warn(csvReportLine(regNumber, "Could not parse specimen type", typeName.name().toString(), text));
627
            logger.warn(csvReportLine(regNumber, "Could not parse specimen fieldUnit", typeName.name().toString(), text));
534 628
        }
535 629
        if(unusualAccessionNumber){
536 630
            logger.warn(csvReportLine(regNumber, "Unusual accession number", typeName.name().toString(), text, accessionNumber));
......
542 636

  
543 637
        DerivedUnitFacade facade = DerivedUnitFacade.NewInstance(SpecimenOrObservationType.PreservedSpecimen, fieldUnit);
544 638
        facade.setCollection(collection);
545
        facade.setAccessionNumber(accessionNumber);
639
        if(accessionNumber != null){
640
            facade.setAccessionNumber(accessionNumber);
641
        }
546 642
        return facade.innerDerivedUnit();
547 643
    }
548 644

  

Also available in: Unified diff