Project

General

Profile

« Previous | Next » 

Revision 13ed824e

Added by Andreas Kohlbecker over 7 years ago

ref #6026 more parser improvements and tests

View differences:

app-import/src/main/java/eu/etaxonomy/cdm/io/iapt/IAPTExcelImport.java
89 89
            Pattern.compile("^(?<monthName>\\p{L}+\\.?),?\\s?(?<year>(?:1[7,8,9])?[0-9]{2})$"), // April 99 or April, 1999 or Apr. 12
90 90
            Pattern.compile("^(?<day>[0-9]{1,2})([\\.\\-/])(\\s?)(?<month>[0-1]?[0-9])\\2\\3(?<year>(?:1[7,8,9])?[0-9]{2})$"), // full date like 12.04.1969 or 12. 04. 1969 or 12/04/1969 or 12-04-1969
91 91
            Pattern.compile("^(?<day>[0-9]{1,2})([\\.\\-/])(?<monthName>[IVX]{1,2})\\2(?<year>(?:1[7,8,9])?[0-9]{2})$"), // full date like 12-VI-1969
92
            Pattern.compile("^(?:(?<day>[0-9]{1,2})(?:\\sde)\\s)?(?<monthName>\\p{L}+)\\sde\\s(?<year>(?:1[7,8,9])?[0-9]{2})$"), // full and partial date like 12 de Enero de 1999 or Enero de 1999
92
            Pattern.compile("^(?:(?<day>[0-9]{1,2})(?:\\sde)?\\s)?(?<monthName>\\p{L}+)(?:\\sde)?\\s(?<year>(?:1[7,8,9])?[0-9]{2})$"), // full and partial date like 12 de Enero de 1999 or Enero de 1999
93 93
            Pattern.compile("^(?<month>[0-1]?[0-9])([\\.\\-/])(?<year>(?:1[7,8,9])?[0-9]{2})$"), // partial date like 04.1969 or 04/1969 or 04-1969
94 94
            Pattern.compile("^(?<year>(?:1[7,8,9])?[0-9]{2})([\\.\\-/])(?<month>[0-1]?[0-9])$"),//  partial date like 1999-04
95 95
            Pattern.compile("^(?<monthName>[IVX]{1,2})([\\.\\-/])(?<year>(?:1[7,8,9])?[0-9]{2})$"), // partial date like VI-1969
96 96
            Pattern.compile("^(?<day>[0-9]{1,2})(?:[\\./]|th|rd|st)?\\s(?<monthName>\\p{L}+\\.?),?\\s?(?<year>(?:1[7,8,9])?[0-9]{2})$"), // full date like 12. April 1969 or april 1999 or 22 Dec.1999
97 97
        };
98
    private static final Pattern typeSpecimenSplitPattern =  Pattern.compile("^(?:\"*[Tt]ype: (?<fieldUnit>.*?))(?:[Hh]olotype:(?<holotype>.*?)\\.?)?(?:[Ii]sotype.*?[:\\(](?<isotype>.*)\\.?)?\\.?$");
98
    /**
     * Splits a type specimen string into its field unit, holotype and isotype parts.
     * <p>
     * Expected layout: optional leading double quotes, then {@code Type: } (or {@code type: })
     * followed by
     * <ul>
     * <li>{@code fieldUnit} - the text after the prefix, matched reluctantly,</li>
     * <li>{@code holotype} - optional, the text after {@code Holotype:} / {@code holotype:},</li>
     * <li>{@code isotype} - optional, the text after {@code Isotype} (or {@code isotype}) plus any
     *     run of non-':' characters and a ':' (e.g. {@code Isotype(s):}). This group is greedy,
     *     so a trailing '.' stays in the captured value.</li>
     * </ul>
     * Captured values can carry surrounding whitespace; IAPTImportTest trims them before comparing.
     * The field is protected so that it can be accessed from IAPTImportTest.
     */
    protected static final Pattern typeSpecimenSplitPattern =  Pattern.compile("^(?:\"*[Tt]ype: (?<fieldUnit>.*?))(?:[Hh]olotype:(?<holotype>.*?)\\.?)?(?:[Ii]sotype[^:]*:(?<isotype>.*)\\.?)?\\.?$");
99 99

  
100 100
    // Finds "(Basionym:" or "(basionym:" (optional single whitespace before and after the ':').
    // The named group basionymName captures everything up to, but not including, the next ')';
    // the trailing ".*$" then consumes the remainder up to the end.
    private static final Pattern typeNameBasionymPattern =  Pattern.compile("\\([Bb]asionym\\s?\\:\\s?(?<basionymName>[^\\)]*).*$");
101 101
    // Matches a '[' and captures in group 1 every following character that is not another '['.
    // NOTE(review): the character class excludes '[' rather than ']', so a closing ']' and any
    // text after it end up in group 1 as well - confirm that callers strip it, or whether
    // "[^\\]]*" was intended.
    private static final Pattern typeNameNotePattern =  Pattern.compile("\\[([^\\[]*)"); // intended to match the inner of '[...]'
102 102
    // Splits a string of the form "<note>:<agent>;<name>", where the note part itself contains
    // at least one ';'. Named groups: note, agent (text between the ':' and the following ';',
    // matched reluctantly) and name (everything after that ';').
    // Fix: the last group used to be "(<name>.*)", i.e. a plain group requiring the literal text
    // "<name>", and the agent group was empty "(?<agent>)" so it could never capture anything.
    // Group numbers (1 = note, 2 = agent, 3 = name) are unchanged.
    private static final Pattern typeNameSpecialSplitPattern =  Pattern.compile("(?<note>.*\\;.*?)\\:(?<agent>.*?)\\;(?<name>.*)");
103 103

  
104
    private static final Pattern collectorPattern =  Pattern.compile(".*?(?<fullStr1>\\(leg\\.\\s+(?<data1>[^\\)]*)\\))|.*?(?<fullStr2>\\sleg\\.\\s+(?<data2>.*?)\\.?)$");
104
    /**
     * Locates the collector ("leg.") information inside a field unit string.
     * <p>
     * Three alternatives, each with a capital or lower-case 'L':
     * <ol>
     * <li>{@code fullStr1} / {@code data1} (groups 1 / 2): a parenthesised
     *     {@code (leg. ...)} anywhere in the string; {@code data1} is the text between
     *     {@code leg.} and the closing ')'.</li>
     * <li>{@code fullStr2} / {@code data2} (groups 3 / 4): whitespace followed by {@code leg.}
     *     or {@code leg.:} running to the end of the string; one trailing '.' is left out of
     *     {@code data2}.</li>
     * <li>{@code fullStr3} / {@code data3} (groups 5 / 6): a string that starts with
     *     {@code leg.} or {@code leg.:}. This alternative has no trailing '$' and
     *     {@code data3} is reluctant, so it yields the full data only when used with
     *     {@code Matcher.matches()}.</li>
     * </ol>
     * parseFieldUnit reads these groups by number (1 to 6), so the order of the groups must not
     * change. The field is protected so that it can be accessed from IAPTImportTest.
     */
    protected static final Pattern collectorPattern =  Pattern.compile(".*?(?<fullStr1>\\([Ll]eg\\.\\s+(?<data1>[^\\)]*)\\)).*$|.*?(?<fullStr2>\\s[Ll]eg\\.\\:?\\s+(?<data2>.*?)\\.?)$|^(?<fullStr3>[Ll]eg\\.\\:?\\s+(?<data3>.*?)\\.?)");
105 105
    // Splits collection data at the first comma: the named group collector is the text before
    // the comma, the named group detail is the rest (after one optional whitespace character),
    // without a single trailing '.'.
    private static final Pattern collectionDataPattern =  Pattern.compile("^(?<collector>[^,]*),\\s?(?<detail>.*?)\\.?$");
106 106
    // Matches a string that starts with "No." or "no." followed by a whitespace character;
    // group 1 is the complete string. Presumably used to recognise a collector's number -
    // the usage is not visible here, verify against the caller.
    private static final Pattern collectorsNumber =  Pattern.compile("^([nN]o\\.\\s.*)$");
107 107

  
......
479 479
     * @param state
480 480
     * @return null if the fieldUnitStr could not be parsed
481 481
     */
482
    private FieldUnit parseFieldUnit(String fieldUnitStr, String regNumber, SimpleExcelTaxonImportState<CONFIG> state) {
482
    protected FieldUnit parseFieldUnit(String fieldUnitStr, String regNumber, SimpleExcelTaxonImportState<CONFIG> state) {
483 483

  
484 484
        FieldUnit fieldUnit = null;
485 485

  
486 486
        Matcher m1 = collectorPattern.matcher(fieldUnitStr);
487 487
        if(m1.matches()){
488 488

  
489
            String collectorData = m1.group(2); // like (leg. Metzeltin, 30. 9. 1996)
489
            String collectorData = m1.group(2); // like ... (leg. Metzeltin, 30. 9. 1996)
490 490
            String removal = m1.group(1);
491 491
            if(collectorData == null){
492
                collectorData = m1.group(4); // like leg. Metzeltin, 30. 9. 1996
492
                collectorData = m1.group(4); // like ... leg. Metzeltin, 30. 9. 1996
493 493
                removal = m1.group(3);
494 494
            }
495
            if(collectorData == null){
496
                collectorData = m1.group(6); // like ^leg. J. J. Halda 18.3.1997$
497
                removal = null;
498
            }
495 499
            if(collectorData == null){
496 500
                return null;
497 501
            }
498 502

  
499 503
            // the fieldUnitStr is parsable
500 504
            // remove all collectorData from the fieldUnitStr and use the rest as locality
501
            String locality = fieldUnitStr.replace(removal, "");
505
            String locality = null;
506
            if(removal != null){
507
                locality = fieldUnitStr.replace(removal, "");
508
            }
502 509

  
503 510
            String collectorStr = null;
504 511
            String detailStr = null;
......
534 541

  
535 542
            fieldUnit = FieldUnit.NewInstance();
536 543
            GatheringEvent ge = GatheringEvent.NewInstance();
537
            ge.setLocality(LanguageString.NewInstance(locality, Language.UNKNOWN_LANGUAGE()));
544
            if(locality != null){
545
                ge.setLocality(LanguageString.NewInstance(locality, Language.UNKNOWN_LANGUAGE()));
546
            }
538 547

  
539 548
            TeamOrPersonBase agent =  state.getAgentBase(collectorStr);
540 549
            if(agent == null) {
app-import/src/test/java/eu/etaxonomy/cdm/io/iapt/IAPTImportTest.java
4 4
import eu.etaxonomy.cdm.model.occurrence.FieldUnit;
5 5
import org.junit.Before;
6 6
import org.junit.Test;
7
import org.springframework.util.Assert;
7
import org.junit.Assert;
8

  
9
import java.util.regex.Matcher;
8 10

  
9 11
/**
10 12
 * Created by andreas on 9/15/16.
11 13
 */
12
public class IAPTImportTest {
14
public class IAPTImportTest extends Assert {
13 15

  
14 16
    IAPTExcelImport importer = null;
15 17

  
......
33 35
                "12/04/1969",
34 36
                "12-04-1969",
35 37
                "12 de Enero de 1999",
38
                "17 de dezembro 1997",
39
                "15 diciembre de 1997",
36 40
                "Enero de 1999",
37 41
                "04.1969",
38 42
                "04/1969",
......
42 46
                "12-VI-1969",
43 47
                "12. April 1969",
44 48
                "april 1999",
45
                "22 Dec.1999"
49
                "22 Dec.1999",
46 50
        };
47 51

  
48 52
        for (String d: dateStrings) {
49
            Assert.notNull(importer.parseDate("0", d), "Could not parse " + d);
53
            Assert.assertNotNull("Could not parse " + d, importer.parseDate("0", d));
54
        }
55
    }
56

  
57
    @Test
58
    public void testTypeSpecimenSplit(){
59

  
60
        String[][] typeStrings = new String[][]{
61
                new String[]{
62
                        "Type: Willershausen, ehemalige Ziegelei-Grube am Ostrand der Ortschaft. - Hellgraue, feingeschichtete Mergelsteinknollen, Pliozän.Holotype: STU P 1425.",
63
                        "STU P 1425",
64
                        ""},
65
                new String[]{
66
                        "Type: Armenia, Shirak distr. in vicinitate pag. Areg. m. Arteni in steppis tragacanthaceis, 1500-1700 m s.m. 9.4.1998, E. Gabrielian legitHolotype: ERE 146518. Isotype(s): B 147519-147520, LE 146520.",
67
                        "ERE 146518.",
68
                        "B 147519-147520, LE 146520."}
69
        };
70
        for (String[] t: typeStrings) {
71
            Matcher m = importer.typeSpecimenSplitPattern.matcher(t[0]);
72
            assertTrue("typeSpecimenSplitPattern is not matching: " + t[0], m.matches());
73
            if(!t[1].isEmpty()){
74
                assertEquals(t[1], m.group("holotype").trim());
75
            }
76
            if(!t[2].isEmpty()){
77
                assertEquals(t[2], m.group("isotype").trim());
78
            }
50 79
        }
80

  
51 81
    }
52 82

  
53 83
    @Test
......
76 106

  
77 107
        };
78 108
        for (String t: typeStrings) {
79
            Assert.notNull(importer.parseSpecimenType(fu, IAPTExcelImport.TypesName.holotype, collection, t, "0"), "Could not parse: " + t);
109
            assertNotNull("Could not parse: " + t, importer.parseSpecimenType(fu, IAPTExcelImport.TypesName.holotype, collection, t, "0"));
80 110
        }
81 111

  
82 112
    }
113

  
114
    @Test
115
    public void testParseFieldUnit(){
116

  
117
        String[] typeStrings = new String[]{
118
                "Lake Bungarby, (36°09'S, 149°08'E), south-eastern New South Wales. - leg. Greg Jordan, Graham Taylor & Leanne Dansie.",
119
                "Mt. Koghis, Nouvelle-Calédonie (leg. Moser et al., 06.03.1994).",
120
                "Salt marsh, Wladyslawowo, Puck Bay, Poland (leg. A. Witkowski, 1993).",
121
                "Blankaart, Woumen (Belgium), sediment sample Jun 1993, core III, 16-17 cm depth (leg. L. Denys, January 1997). In sediment and epiphyton.",
122
                "Rivière des Lacs, Cascade (Chutes de la Madeleine), Nouvelle-Calédonie (leg. Moser et al., 10.03.1994).",
123
                "Bulgaria austro-occidentalis. In graminosis saxosis prope vic. Strumesnitza, cca 120 m s.m., Petric district. Leg. D.Delipavlov 03.06.1987.",
124
                "Lesbos 152, mit sechs Schliffen, ein kleines Geröll mit einem Durchmesser von ca. 4,5 x 5,5 cm. - Strand von Lapsarna, nordwestlich von Antissa. Versteinerter Wald von Lesbos, Griechenland. - Tertiär, Oberoligozän/Untermiozän. - Leg.: E. Velitzelo",
125
                "Haute Hienghène, au nord-est de l'île Nouvelle-Calédonie. Expression de mousses (leg. Guillaumin).",
126
                "leg. J. J. Halda 18.3.1997"
127
        };
128
        for (String t: typeStrings) {
129
            assertTrue("collectorPattern is not matching: " + t, importer.collectorPattern.matcher(t).matches());
130
        }
131
    }
83 132
}

Also available in: Unified diff