Revision bde99bfd
Added by Andreas Kohlbecker over 7 years ago
app-import/src/main/java/eu/etaxonomy/cdm/io/iapt/IAPTExcelImport.java | ||
---|---|---|
26 | 26 |
import org.apache.commons.lang.StringUtils; |
27 | 27 |
import org.apache.log4j.Level; |
28 | 28 |
import org.apache.log4j.Logger; |
29 |
import org.joda.time.DateTimeFieldType; |
|
30 |
import org.joda.time.Partial; |
|
29 | 31 |
import org.springframework.stereotype.Component; |
30 | 32 |
|
31 | 33 |
import java.util.*; |
... | ... | |
68 | 70 |
private static List<String> expectedKeys= Arrays.asList(new String[]{ |
69 | 71 |
REGISTRATIONNO_PK, HIGHERTAXON, FULLNAME, AUTHORSSPELLING, LITSTRING, REGISTRATION, TYPE, CAVEATS, FULLBASIONYM, FULLSYNSUBST, NOTESTXT, REGDATE, NAMESTRING, BASIONYMSTRING, SYNSUBSTSTR, AUTHORSTRING}); |
70 | 72 |
|
71 |
private static final Pattern nomRefTokenizeP = Pattern.compile("^(.*):\\s([^\\.:]+)\\.(.*)$"); |
|
72 |
private static final Pattern nomRefPubYearExtractP = Pattern.compile("(.*?)(1[7,8,9][0-9]{2}).*$|^.*?[0-9]{1,2}([\\./])[0-1]?[0-9]\\3([0-9]{2})\\.$"); // 1700 - 1999 |
|
73 |
private static final Pattern nomRefTokenizeP = Pattern.compile("^(.*):\\s([^\\.:]+)\\.(.*?)\\.?$"); |
|
74 |
private static final Pattern[] nomRefPubDatePs = new Pattern[]{ |
|
75 |
// all patterns cover the years 1700 - 1999 |
|
76 |
Pattern.compile("^(?<year>1[7,8,9][0-9]{2})$"), // only year, like '1969' |
|
77 |
Pattern.compile("^(?<day>[0-9]{1,2})([\\./])(?<month>[0-1]?[0-9])\\2(?<year>(?:1[7,8,9])?[0-9]{2})$"), // full date like 12.04.1969 or 12/04/1969 |
|
78 |
Pattern.compile("^(?:(?<day>[0-9]{1,2})[\\./]?\\s)?(?<monthName>[\\S\\D]+)\\s(?<year>(?:1[7,8,9])?[0-9]{2})$") // full date like 12. April 1969 or april 1999 or April 99 |
|
79 |
}; |
|
73 | 80 |
// Splits a type statement into the named groups 'type', 'holotype' and 'isotype'.
// The input must start with "Type: " or "type: " (optionally preceded by double quotes);
// the "Holotype:" / "holotype:" part and the "Isotype...:" / "isotype...:" part are both optional.
private static final Pattern typeSplitPattern = Pattern.compile("^(?:\"*[Tt]ype: (?<type>.*?))(?:[Hh]olotype:(?<holotype>.*?))?(?:[Ii]sotype[^:]*:(?<isotype>.*))?$"); |
81 |
|
|
82 |
/**
 * Maps month names in Czech, German, French and English to the month number (1 - 12).
 * All keys must be lower case and free of surrounding whitespace, because
 * {@link #monthFromName(String)} lower-cases the name before looking it up.
 */
private static Map<String, Integer> monthFromNameMap = new HashMap<>();
static {
    String[] cs = new String[]{"leden", "únor", "březen", "duben", "květen", "červen", "červenec", "srpen", "září", "říjen", "listopad", "prosinec"};
    String[] fr = new String[]{"janvier", "février", "mars", "avril", "mai", "juin", "juillet", "août", "septembre", "octobre", "novembre", "décembre"};
    String[] de = new String[]{"januar", "februar", "märz", "april", "mai", "juni", "juli", "august", "september", "oktober", "november", "dezember"};
    String[] en = new String[]{"january", "february", "march", "april", "may", "june", "july", "august", "september", "october", "november", "december"};

    String[][] perLang = new String[][]{cs, de, fr, en};

    // names shared between languages (e.g. "april", "mai") map to the same number, so the order is irrelevant
    for (String[] months : perLang) {
        for (int i = 0; i < months.length; i++) {
            monthFromNameMap.put(months[i], i + 1);
        }
    }

    // special cases (key in lower case, otherwise the lookup never finds it)
    monthFromNameMap.put("mar", 3);
}
|
100 |
|
|
74 | 101 |
enum TypesName { |
75 | 102 |
type, holotype, isotype; |
76 | 103 |
|
... | ... | |
110 | 137 |
String nomRefTitle = null; |
111 | 138 |
String nomRefDetail = null; |
112 | 139 |
String nomRefPupDate = null; |
140 |
String nomRefPupDay = null; |
|
141 |
String nomRefPupMonth = null; |
|
142 |
String nomRefPupMonthName = null; |
|
113 | 143 |
String nomRefPupYear = null; |
114 | 144 |
|
115 | 145 |
// preprocess nomRef: separate citation, reference detail, publishing date |
... | ... | |
119 | 149 |
if(m.matches()){ |
120 | 150 |
nomRefTitle = m.group(1); |
121 | 151 |
nomRefDetail = m.group(2); |
122 |
nomRefPupDate = m.group(3); |
|
152 |
nomRefPupDate = m.group(3).trim();
|
|
123 | 153 |
|
124 | 154 |
// nomRefDetail.replaceAll("[\\:\\.\\s]", ""); // TODO integrate into nomRefTokenizeP |
125 |
Matcher m2 = nomRefPubYearExtractP.matcher(nomRefPupDate); |
|
126 |
if(m2.matches()){ |
|
127 |
nomRefPupYear = m2.group(2); |
|
128 |
if(nomRefPupYear == null){ |
|
129 |
nomRefPupYear = m2.group(4); |
|
130 |
} |
|
131 |
if(nomRefPupYear == null){ |
|
132 |
logger.error("nomRefPupYear in " + nomRefStr + " is NULL" ); |
|
155 |
for(Pattern p : nomRefPubDatePs){ |
|
156 |
Matcher m2 = p.matcher(nomRefPupDate); |
|
157 |
if(m2.matches()){ |
|
158 |
try { |
|
159 |
nomRefPupYear = m2.group("year"); |
|
160 |
} catch (IllegalArgumentException e){ |
|
161 |
// named capture group not found |
|
162 |
} |
|
163 |
try { |
|
164 |
nomRefPupMonth = m2.group("month"); |
|
165 |
} catch (IllegalArgumentException e){ |
|
166 |
// named capture group not found |
|
167 |
} |
|
168 |
try { |
|
169 |
nomRefPupMonthName = m2.group("monthName"); |
|
170 |
nomRefPupMonth = monthFromName(nomRefPupMonthName); |
|
171 |
} catch (IllegalArgumentException e){ |
|
172 |
// named capture group not found |
|
173 |
} |
|
174 |
try { |
|
175 |
nomRefPupDay = m2.group("day"); |
|
176 |
} catch (IllegalArgumentException e){ |
|
177 |
// named capture group not found |
|
178 |
} |
|
179 |
|
|
180 |
if(nomRefPupYear == null){ |
|
181 |
logger.error("nomRefPupYear in " + nomRefStr + " is NULL" ); |
|
182 |
} |
|
183 |
if(nomRefPupYear.length() == 2 ){ |
|
184 |
// it is an abbreviated year from the 19** years |
|
185 |
nomRefPupYear = "19" + nomRefPupYear; |
|
186 |
} |
|
187 |
nomRefTitle = nomRefTitle + ": " + nomRefDetail + ". " + nomRefPupYear + "."; |
|
188 |
break; |
|
133 | 189 |
} |
134 |
if(nomRefPupYear.length() == 2 ){ |
|
135 |
// it is an abbreviated year from the 19** years |
|
136 |
nomRefPupYear = "19" + nomRefPupYear; |
|
137 |
} |
|
138 |
nomRefTitle = nomRefTitle + ": " + nomRefDetail + ". " + nomRefPupYear + "."; |
|
139 |
} else { |
|
140 |
logger.warn("Pub year not found in " + nomRefStr ); |
|
190 |
} |
|
191 |
if(nomRefPupYear == null){ |
|
192 |
logger.warn("Pub year not found in " + nomRefPupDate + " from " + nomRefStr ); |
|
141 | 193 |
// FIXME in in J. Eur. Orchideen 30: 128. 30.09.97 (Vorabdr.). |
142 |
|
|
143 | 194 |
} |
195 |
List<DateTimeFieldType> types = new ArrayList<>(); |
|
196 |
List<Integer> values = new ArrayList<>(); |
|
197 |
if(nomRefPupYear != null){ |
|
198 |
types.add(DateTimeFieldType.year()); |
|
199 |
values.add(Integer.parseInt(nomRefPupYear)); |
|
200 |
} |
|
201 |
if(nomRefPupMonth != null){ |
|
202 |
types.add(DateTimeFieldType.monthOfYear()); |
|
203 |
values.add(Integer.parseInt(nomRefPupMonth)); |
|
204 |
} |
|
205 |
if(nomRefPupDay != null){ |
|
206 |
types.add(DateTimeFieldType.dayOfMonth()); |
|
207 |
values.add(Integer.parseInt(nomRefPupDay)); |
|
208 |
} |
|
209 |
Partial pupDate = new Partial(types.toArray(new DateTimeFieldType[types.size()]), ArrayUtils.toPrimitive(values.toArray(new Integer[values.size()]))); |
|
144 | 210 |
|
145 | 211 |
} else { |
146 | 212 |
nomRefTitle = nomRefStr; |
... | ... | |
237 | 303 |
|
238 | 304 |
} |
239 | 305 |
|
306 |
private String monthFromName(String monthName) { |
|
307 |
|
|
308 |
Integer month = monthFromNameMap.get(monthName.toLowerCase()); |
|
309 |
if(month == null){ |
|
310 |
logger.warn("Unknown month: " + monthName); |
|
311 |
return null; |
|
312 |
} else { |
|
313 |
return month.toString(); |
|
314 |
} |
|
315 |
} |
|
316 |
|
|
317 |
|
|
240 | 318 |
private void addSpecimenTypes(BotanicalName taxonName, FieldUnit fieldUnit, String typeStr, TypesName typeName, boolean multiple){ |
241 | 319 |
if(StringUtils.isEmpty(typeStr)){ |
242 | 320 |
return; |
... | ... | |
263 | 341 |
} |
264 | 342 |
} |
265 | 343 |
|
266 |
private BotanicalName makeBotanicalName(SimpleExcelTaxonImportState<CONFIG> state, String titleCacheStr, String nameStr, String authorStr, String nomRefTitle) { |
|
344 |
private BotanicalName makeBotanicalName(SimpleExcelTaxonImportState<CONFIG> state, String titleCacheStr, String nameStr, |
|
345 |
String authorStr, String nomRefTitle) { |
|
267 | 346 |
|
268 | 347 |
BotanicalName taxonName;// cache field for the taxonName.titleCache |
269 | 348 |
String taxonNameTitleCache = null; |
Also available in: Unified diff
ref #6009 improved publication date parsing