Revision 13ed824e
Added by Andreas Kohlbecker over 7 years ago
app-import/src/main/java/eu/etaxonomy/cdm/io/iapt/IAPTExcelImport.java | ||
---|---|---|
89 | 89 |
Pattern.compile("^(?<monthName>\\p{L}+\\.?),?\\s?(?<year>(?:1[7,8,9])?[0-9]{2})$"), // April 99 or April, 1999 or Apr. 12 |
90 | 90 |
Pattern.compile("^(?<day>[0-9]{1,2})([\\.\\-/])(\\s?)(?<month>[0-1]?[0-9])\\2\\3(?<year>(?:1[7,8,9])?[0-9]{2})$"), // full date like 12.04.1969 or 12. 04. 1969 or 12/04/1969 or 12-04-1969 |
91 | 91 |
Pattern.compile("^(?<day>[0-9]{1,2})([\\.\\-/])(?<monthName>[IVX]{1,2})\\2(?<year>(?:1[7,8,9])?[0-9]{2})$"), // full date like 12-VI-1969 |
92 |
Pattern.compile("^(?:(?<day>[0-9]{1,2})(?:\\sde)\\s)?(?<monthName>\\p{L}+)\\sde\\s(?<year>(?:1[7,8,9])?[0-9]{2})$"), // full and partial date like 12 de Enero de 1999 or Enero de 1999
|
|
92 |
Pattern.compile("^(?:(?<day>[0-9]{1,2})(?:\\sde)?\\s)?(?<monthName>\\p{L}+)(?:\\sde)?\\s(?<year>(?:1[7,8,9])?[0-9]{2})$"), // full and partial date like 12 de Enero de 1999 or Enero de 1999
|
|
93 | 93 |
Pattern.compile("^(?<month>[0-1]?[0-9])([\\.\\-/])(?<year>(?:1[7,8,9])?[0-9]{2})$"), // partial date like 04.1969 or 04/1969 or 04-1969 |
94 | 94 |
Pattern.compile("^(?<year>(?:1[7,8,9])?[0-9]{2})([\\.\\-/])(?<month>[0-1]?[0-9])$"),// partial date like 1999-04 |
95 | 95 |
Pattern.compile("^(?<monthName>[IVX]{1,2})([\\.\\-/])(?<year>(?:1[7,8,9])?[0-9]{2})$"), // partial date like VI-1969 |
96 | 96 |
Pattern.compile("^(?<day>[0-9]{1,2})(?:[\\./]|th|rd|st)?\\s(?<monthName>\\p{L}+\\.?),?\\s?(?<year>(?:1[7,8,9])?[0-9]{2})$"), // full date like 12. April 1969 or april 1999 or 22 Dec.1999 |
97 | 97 |
}; |
98 |
private static final Pattern typeSpecimenSplitPattern = Pattern.compile("^(?:\"*[Tt]ype: (?<fieldUnit>.*?))(?:[Hh]olotype:(?<holotype>.*?)\\.?)?(?:[Ii]sotype.*?[:\\(](?<isotype>.*)\\.?)?\\.?$");
|
|
98 |
/**
 * Splits a type specimen string into its parts. The input must start with optional
 * double quotes followed by "Type: " or "type: ". Named groups:
 * fieldUnit - the text after that prefix (matched reluctantly);
 * holotype  - optional, the text after "Holotype:" / "holotype:" (matched reluctantly);
 * isotype   - optional, the text after "Isotype" / "isotype", any non-colon characters and a colon
 *             (matched greedily, so a trailing '.' stays inside this group).
 * Non-private so that the pattern can be accessed from outside this class.
 */
protected static final Pattern typeSpecimenSplitPattern = Pattern.compile("^(?:\"*[Tt]ype: (?<fieldUnit>.*?))(?:[Hh]olotype:(?<holotype>.*?)\\.?)?(?:[Ii]sotype[^:]*:(?<isotype>.*)\\.?)?\\.?$");
|
|
99 | 99 |
|
100 | 100 |
// Locates "(Basionym: " or "(basionym: " (whitespace around the colon is optional) and captures
// everything up to the next ')' or the end of the input as group 'basionymName'; the rest of the input is consumed.
private static final Pattern typeNameBasionymPattern = Pattern.compile("\\([Bb]asionym\\s?\\:\\s?(?<basionymName>[^\\)]*).*$"); |
101 | 101 |
// Group 1 captures everything after a '[' up to the next '[' or the end of the input.
// NOTE(review): the character class excludes '[' but not ']', so a closing bracket and any text following it
// are captured as well - confirm that callers expect or strip this.
private static final Pattern typeNameNotePattern = Pattern.compile("\\[([^\\[]*)"); // matches the inner of '[...]' |
102 | 102 |
// Splits a special type name string of the shape "<note>:<agent>;<name>", where the note part itself contains
// at least one ';'. Groups (by index and by name): 1 'note', 2 'agent' (may be empty), 3 'name'.
// Fixed: the third group was written as "(<name>.*)", which matches the literal text "<name>" instead of
// declaring a named group, and 'agent' was an empty group, so only inputs containing ":;" could ever match.
private static final Pattern typeNameSpecialSplitPattern = Pattern.compile("(?<note>.*\\;.*?)\\:(?<agent>.*?)\\;(?<name>.*)"); |
103 | 103 |
|
104 |
private static final Pattern collectorPattern = Pattern.compile(".*?(?<fullStr1>\\(leg\\.\\s+(?<data1>[^\\)]*)\\))|.*?(?<fullStr2>\\sleg\\.\\s+(?<data2>.*?)\\.?)$");
|
|
104 |
/**
 * Recognises the collector ("leg.") part of a field unit string. Three alternatives are tried in order:
 * 1. a parenthesised "(leg. ...)" or "(Leg. ...)" anywhere in the string:
 *    group 1 'fullStr1' is the whole parenthesis, group 2 'data1' the text between "leg. " and ')';
 * 2. whitespace followed by "leg." / "Leg." with an optional ':' running to the end of the string:
 *    group 3 'fullStr2' is that tail, group 4 'data2' the text after the marker without one trailing '.';
 * 3. a string that starts with "leg." / "Leg." with an optional ':':
 *    group 5 'fullStr3' and group 6 'data3' are the counterparts of groups 3 and 4.
 * parseFieldUnit reads these groups by index, so the order of the alternatives must not change.
 */
protected static final Pattern collectorPattern = Pattern.compile(".*?(?<fullStr1>\\([Ll]eg\\.\\s+(?<data1>[^\\)]*)\\)).*$|.*?(?<fullStr2>\\s[Ll]eg\\.\\:?\\s+(?<data2>.*?)\\.?)$|^(?<fullStr3>[Ll]eg\\.\\:?\\s+(?<data3>.*?)\\.?)");
|
|
105 | 105 |
// Splits collection data at the first comma: group 'collector' is the text before it, group 'detail' the text
// after it (one optional whitespace character after the comma and one optional trailing '.' are left out).
private static final Pattern collectionDataPattern = Pattern.compile("^(?<collector>[^,]*),\\s?(?<detail>.*?)\\.?$"); |
106 | 106 |
// Matches a string that starts with "no." or "No." followed by a whitespace character; group 1 is the whole string.
private static final Pattern collectorsNumber = Pattern.compile("^([nN]o\\.\\s.*)$"); |
107 | 107 |
|
... | ... | |
479 | 479 |
* @param state |
480 | 480 |
* @return null if the fieldUnitStr could not be parsed |
481 | 481 |
*/ |
482 |
private FieldUnit parseFieldUnit(String fieldUnitStr, String regNumber, SimpleExcelTaxonImportState<CONFIG> state) {
|
|
482 |
protected FieldUnit parseFieldUnit(String fieldUnitStr, String regNumber, SimpleExcelTaxonImportState<CONFIG> state) {
|
|
483 | 483 |
|
484 | 484 |
FieldUnit fieldUnit = null; |
485 | 485 |
|
486 | 486 |
Matcher m1 = collectorPattern.matcher(fieldUnitStr); |
487 | 487 |
if(m1.matches()){ |
488 | 488 |
|
489 |
String collectorData = m1.group(2); // like (leg. Metzeltin, 30. 9. 1996) |
|
489 |
String collectorData = m1.group(2); // like ... (leg. Metzeltin, 30. 9. 1996)
|
|
490 | 490 |
String removal = m1.group(1); |
491 | 491 |
if(collectorData == null){ |
492 |
collectorData = m1.group(4); // like leg. Metzeltin, 30. 9. 1996 |
|
492 |
collectorData = m1.group(4); // like ... leg. Metzeltin, 30. 9. 1996
|
|
493 | 493 |
removal = m1.group(3); |
494 | 494 |
} |
495 |
if(collectorData == null){ |
|
496 |
collectorData = m1.group(6); // like ^leg. J. J. Halda 18.3.1997$ |
|
497 |
removal = null; |
|
498 |
} |
|
495 | 499 |
if(collectorData == null){ |
496 | 500 |
return null; |
497 | 501 |
} |
498 | 502 |
|
499 | 503 |
// the fieldUnitStr is parsable |
500 | 504 |
// remove all collectorData from the fieldUnitStr and use the rest as locality |
501 |
String locality = fieldUnitStr.replace(removal, ""); |
|
505 |
String locality = null; |
|
506 |
if(removal != null){ |
|
507 |
locality = fieldUnitStr.replace(removal, ""); |
|
508 |
} |
|
502 | 509 |
|
503 | 510 |
String collectorStr = null; |
504 | 511 |
String detailStr = null; |
... | ... | |
534 | 541 |
|
535 | 542 |
fieldUnit = FieldUnit.NewInstance(); |
536 | 543 |
GatheringEvent ge = GatheringEvent.NewInstance(); |
537 |
ge.setLocality(LanguageString.NewInstance(locality, Language.UNKNOWN_LANGUAGE())); |
|
544 |
if(locality != null){ |
|
545 |
ge.setLocality(LanguageString.NewInstance(locality, Language.UNKNOWN_LANGUAGE())); |
|
546 |
} |
|
538 | 547 |
|
539 | 548 |
TeamOrPersonBase agent = state.getAgentBase(collectorStr); |
540 | 549 |
if(agent == null) { |
app-import/src/test/java/eu/etaxonomy/cdm/io/iapt/IAPTImportTest.java | ||
---|---|---|
4 | 4 |
import eu.etaxonomy.cdm.model.occurrence.FieldUnit; |
5 | 5 |
import org.junit.Before; |
6 | 6 |
import org.junit.Test; |
7 |
import org.springframework.util.Assert; |
|
7 |
import org.junit.Assert; |
|
8 |
|
|
9 |
import java.util.regex.Matcher; |
|
8 | 10 |
|
9 | 11 |
/** |
10 | 12 |
* Created by andreas on 9/15/16. |
11 | 13 |
*/ |
12 |
public class IAPTImportTest { |
|
14 |
public class IAPTImportTest extends Assert {
|
|
13 | 15 |
|
14 | 16 |
IAPTExcelImport importer = null; |
15 | 17 |
|
... | ... | |
33 | 35 |
"12/04/1969", |
34 | 36 |
"12-04-1969", |
35 | 37 |
"12 de Enero de 1999", |
38 |
"17 de dezembro 1997", |
|
39 |
"15 diciembre de 1997", |
|
36 | 40 |
"Enero de 1999", |
37 | 41 |
"04.1969", |
38 | 42 |
"04/1969", |
... | ... | |
42 | 46 |
"12-VI-1969", |
43 | 47 |
"12. April 1969", |
44 | 48 |
"april 1999", |
45 |
"22 Dec.1999" |
|
49 |
"22 Dec.1999",
|
|
46 | 50 |
}; |
47 | 51 |
|
48 | 52 |
for (String d: dateStrings) { |
49 |
Assert.notNull(importer.parseDate("0", d), "Could not parse " + d); |
|
53 |
Assert.assertNotNull("Could not parse " + d, importer.parseDate("0", d)); |
|
54 |
} |
|
55 |
} |
|
56 |
|
|
57 |
/**
 * Checks that typeSpecimenSplitPattern matches each sample string completely and that the trimmed
 * 'holotype' and 'isotype' groups equal the expected values. Each sample is an array of
 * { input, expected holotype, expected isotype }; an empty expected value skips the check for that group.
 */
@Test |

58 |
public void testTypeSpecimenSplit(){ |

59 |


60 |
String[][] typeStrings = new String[][]{ |

61 |
new String[]{ |

62 |
"Type: Willershausen, ehemalige Ziegelei-Grube am Ostrand der Ortschaft. - Hellgraue, feingeschichtete Mergelsteinknollen, Pliozän.Holotype: STU P 1425.", |

63 |
"STU P 1425", |

64 |
""}, |

65 |
new String[]{ |

66 |
"Type: Armenia, Shirak distr. in vicinitate pag. Areg. m. Arteni in steppis tragacanthaceis, 1500-1700 m s.m. 9.4.1998, E. Gabrielian legitHolotype: ERE 146518. Isotype(s): B 147519-147520, LE 146520.", |

67 |
"ERE 146518.", |

68 |
"B 147519-147520, LE 146520."} |

69 |
}; |

70 |
for (String[] t: typeStrings) { |

71 |
Matcher m = importer.typeSpecimenSplitPattern.matcher(t[0]); |

72 |
assertTrue("typeSpecimenSplitPattern is not matching: " + t[0], m.matches()); |

73 |
if(!t[1].isEmpty()){ |

74 |
assertEquals(t[1], m.group("holotype").trim()); |

75 |
} |

76 |
if(!t[2].isEmpty()){ |

77 |
assertEquals(t[2], m.group("isotype").trim()); |

78 |
} |

50 | 79 |
} |
80 |


51 | 81 |
} |
52 | 82 |
|
53 | 83 |
@Test |
... | ... | |
76 | 106 |
|
77 | 107 |
}; |
78 | 108 |
for (String t: typeStrings) { |
79 |
Assert.notNull(importer.parseSpecimenType(fu, IAPTExcelImport.TypesName.holotype, collection, t, "0"), "Could not parse: " + t);
|
|
109 |
assertNotNull("Could not parse: " + t, importer.parseSpecimenType(fu, IAPTExcelImport.TypesName.holotype, collection, t, "0"));
|
|
80 | 110 |
} |
81 | 111 |
|
82 | 112 |
} |
113 |
|
|
114 |
/**
 * Checks that collectorPattern matches every sample field unit string completely.
 * The samples cover a parenthesised "(leg. ...)", a trailing "leg." / "Leg." / "Leg.:" clause and a string
 * that starts with "leg.".
 * NOTE(review): despite its name this test only exercises the pattern; parseFieldUnit itself is not called here.
 */
@Test |

115 |
public void testParseFieldUnit(){ |

116 |


117 |
String[] typeStrings = new String[]{ |

118 |
"Lake Bungarby, (36°09'S, 149°08'E), south-eastern New South Wales. - leg. Greg Jordan, Graham Taylor & Leanne Dansie.", |

119 |
"Mt. Koghis, Nouvelle-Calédonie (leg. Moser et al., 06.03.1994).", |

120 |
"Salt marsh, Wladyslawowo, Puck Bay, Poland (leg. A. Witkowski, 1993).", |

121 |
"Blankaart, Woumen (Belgium), sediment sample Jun 1993, core III, 16-17 cm depth (leg. L. Denys, January 1997). In sediment and epiphyton.", |

122 |
"Rivière des Lacs, Cascade (Chutes de la Madeleine), Nouvelle-Calédonie (leg. Moser et al., 10.03.1994).", |

123 |
"Bulgaria austro-occidentalis. In graminosis saxosis prope vic. Strumesnitza, cca 120 m s.m., Petric district. Leg. D.Delipavlov 03.06.1987.", |

124 |
"Lesbos 152, mit sechs Schliffen, ein kleines Geröll mit einem Durchmesser von ca. 4,5 x 5,5 cm. - Strand von Lapsarna, nordwestlich von Antissa. Versteinerter Wald von Lesbos, Griechenland. - Tertiär, Oberoligozän/Untermiozän. - Leg.: E. Velitzelo", |

125 |
"Haute Hienghène, au nord-est de l'île Nouvelle-Calédonie. Expression de mousses (leg. Guillaumin).", |

126 |
"leg. J. J. Halda 18.3.1997" |

127 |
}; |

128 |
for (String t: typeStrings) { |

129 |
assertTrue("collectorPattern is not matching: " + t, importer.collectorPattern.matcher(t).matches()); |

130 |
} |

131 |
} |
|
83 | 132 |
} |
Also available in: Unified diff
ref #6026 more parser improvements and tests