Revision f465df63
Added by Andreas Kohlbecker over 7 years ago
app-import/src/main/java/eu/etaxonomy/cdm/io/iapt/IAPTExcelImport.java | ||
---|---|---|
28 | 28 |
import org.apache.log4j.Logger; |
29 | 29 |
import org.joda.time.DateTimeFieldType; |
30 | 30 |
import org.joda.time.Partial; |
31 |
import org.joda.time.format.DateTimeFormat; |
|
32 |
import org.joda.time.format.DateTimeFormatter; |
|
31 | 33 |
import org.springframework.stereotype.Component; |
32 | 34 |
|
33 | 35 |
import java.util.*; |
... | ... | |
81 | 83 |
Pattern.compile("^(?<monthName>\\p{L}+\\.?),?\\s(?<year>(?:1[7,8,9])?[0-9]{2})$"), // April 99 or April, 1999 or Apr. 12 |
82 | 84 |
Pattern.compile("^(?<day>[0-9]{1,2})([\\.\\-/])(?<month>[0-1]?[0-9])\\2(?<year>(?:1[7,8,9])?[0-9]{2})$"), // full date like 12.04.1969 or 12/04/1969 or 12-04-1969 |
83 | 85 |
Pattern.compile("^(?<month>[0-1]?[0-9])([\\.\\-/])(?<year>(?:1[7,8,9])?[0-9]{2})$"), // partial date like 04.1969 or 04/1969 or 04-1969 |
86 |
Pattern.compile("^(?<year>(?:1[7,8,9])?[0-9]{2})([\\.\\-/])(?<month>[0-1]?[0-9])$"),// partial date like 1999-04 |
|
84 | 87 |
Pattern.compile("^(?<day>[0-9]{1,2})(?:[\\./]|th|rd)?\\s(?<monthName>\\p{L}+\\.?),?\\s(?<year>(?:1[7,8,9])?[0-9]{2})$"), // full date like 12. April 1969 or april 1999 or 22 Dec.1999 |
85 | 88 |
}; |
86 | 89 |
private static final Pattern typeSplitPattern = Pattern.compile("^(?:\"*[Tt]ype: (?<type>.*?))(?:[Hh]olotype:(?<holotype>.*?))?(?:[Ii]sotype[^:]*:(?<isotype>.*))?$"); |
... | ... | |
112 | 115 |
monthFromNameMap.put("Februari", 2); |
113 | 116 |
} |
114 | 117 |
|
118 |
DateTimeFormatter formatterYear = DateTimeFormat.forPattern("yyyy"); |
|
119 |
|
|
115 | 120 |
enum TypesName { |
116 | 121 |
type, holotype, isotype; |
117 | 122 |
|
... | ... | |
138 | 143 |
String line = state.getCurrentLine() + ": "; |
139 | 144 |
|
140 | 145 |
String regNumber = getValue(record, REGISTRATIONNO_PK, false); |
146 |
String regStr = getValue(record, REGISTRATION, true); |
|
141 | 147 |
String titleCacheStr = getValue(record, FULLNAME, true); |
142 | 148 |
String nameStr = getValue(record, NAMESTRING, true); |
143 | 149 |
String authorStr = getValue(record, AUTHORSTRING, true); |
... | ... | |
149 | 155 |
String synSubstStr = getValue(record, SYNSUBSTSTR, true); |
150 | 156 |
String typeStr = getValue(record, TYPE, true); |
151 | 157 |
|
158 |
|
|
152 | 159 |
String nomRefTitle = null; |
153 |
String nomRefDetail = null;
|
|
160 |
String nomRefDetail; |
|
154 | 161 |
String nomRefPupDate = null; |
155 |
String nomRefPupDay = null; |
|
156 |
String nomRefPupMonth = null; |
|
157 |
String nomRefPupMonthName = null; |
|
158 |
String nomRefPupYear = null; |
|
162 |
Partial pupDate = null; |
|
159 | 163 |
|
160 | 164 |
// preprocess nomRef: separate citation, reference detail, publishing date |
161 | 165 |
if(!StringUtils.isEmpty(nomRefStr)){ |
... | ... | |
166 | 170 |
nomRefDetail = m.group(2); |
167 | 171 |
nomRefPupDate = m.group(3).trim(); |
168 | 172 |
|
169 |
// nomRefDetail.replaceAll("[\\:\\.\\s]", ""); // TODO integrate into nomRefTokenizeP |
|
170 |
for(Pattern p : nomRefPubDatePs){ |
|
171 |
Matcher m2 = p.matcher(nomRefPupDate); |
|
172 |
if(m2.matches()){ |
|
173 |
try { |
|
174 |
nomRefPupYear = m2.group("year"); |
|
175 |
} catch (IllegalArgumentException e){ |
|
176 |
// named capture group not found |
|
177 |
} |
|
178 |
try { |
|
179 |
nomRefPupMonth = m2.group("month"); |
|
180 |
} catch (IllegalArgumentException e){ |
|
181 |
// named capture group not found |
|
182 |
} |
|
183 |
try { |
|
184 |
nomRefPupMonthName = m2.group("monthName"); |
|
185 |
nomRefPupMonth = monthFromName(nomRefPupMonthName, regNumber); |
|
186 |
} catch (IllegalArgumentException e){ |
|
187 |
// named capture group not found |
|
188 |
} |
|
189 |
try { |
|
190 |
nomRefPupDay = m2.group("day"); |
|
191 |
} catch (IllegalArgumentException e){ |
|
192 |
// named capture group not found |
|
193 |
} |
|
194 |
|
|
195 |
if(nomRefPupYear == null){ |
|
196 |
logger.error("nomRefPupYear in " + nomRefStr + " is NULL" ); |
|
197 |
} |
|
198 |
if(nomRefPupYear.length() == 2 ){ |
|
199 |
// it is an abbreviated year from the 19** years |
|
200 |
nomRefPupYear = "19" + nomRefPupYear; |
|
201 |
} |
|
202 |
nomRefTitle = nomRefTitle + ": " + nomRefDetail + ". " + nomRefPupYear + "."; |
|
203 |
break; |
|
204 |
} |
|
205 |
} |
|
206 |
if(nomRefPupYear == null){ |
|
207 |
logger.warn("Pub date not found in [" + regNumber + "]: " + nomRefPupDate + " from " + nomRefStr ); |
|
208 |
} |
|
209 |
List<DateTimeFieldType> types = new ArrayList<>(); |
|
210 |
List<Integer> values = new ArrayList<>(); |
|
211 |
if(nomRefPupYear != null){ |
|
212 |
types.add(DateTimeFieldType.year()); |
|
213 |
values.add(Integer.parseInt(nomRefPupYear)); |
|
214 |
} |
|
215 |
if(nomRefPupMonth != null){ |
|
216 |
types.add(DateTimeFieldType.monthOfYear()); |
|
217 |
values.add(Integer.parseInt(nomRefPupMonth)); |
|
173 |
pupDate = parsePubDate(regNumber, nomRefStr, nomRefPupDate); |
|
174 |
if (pupDate != null) { |
|
175 |
nomRefTitle = nomRefTitle + ": " + nomRefDetail + ". " + pupDate.toString(formatterYear) + "."; |
|
218 | 176 |
} |
219 |
if(nomRefPupDay != null){ |
|
220 |
types.add(DateTimeFieldType.dayOfMonth()); |
|
221 |
values.add(Integer.parseInt(nomRefPupDay)); |
|
222 |
} |
|
223 |
Partial pupDate = new Partial(types.toArray(new DateTimeFieldType[types.size()]), ArrayUtils.toPrimitive(values.toArray(new Integer[values.size()]))); |
|
224 |
|
|
225 | 177 |
} else { |
226 | 178 |
nomRefTitle = nomRefStr; |
227 | 179 |
} |
... | ... | |
229 | 181 |
|
230 | 182 |
BotanicalName taxonName = makeBotanicalName(state, titleCacheStr, nameStr, authorStr, nomRefTitle); |
231 | 183 |
|
184 |
// always add the original strings of parsed data as annotation |
|
185 |
taxonName.addAnnotation(Annotation.NewInstance("imported and parsed data strings:" + |
|
186 |
"\n - '" + LITSTRING + "': "+ nomRefStr + |
|
187 |
"\n - '" + TYPE + "': " + typeStr + |
|
188 |
"\n - '" + REGISTRATION + "': " + regStr |
|
189 |
, AnnotationType.TECHNICAL(), Language.DEFAULT())); |
|
190 |
|
|
191 |
if(pupDate != null) { |
|
192 |
taxonName.getNomenclaturalReference().setDatePublished(TimePeriod.NewInstance(pupDate)); |
|
193 |
} |
|
194 |
|
|
232 | 195 |
if(!StringUtils.isEmpty(notesTxt)){ |
233 | 196 |
notesTxt = notesTxt.replace("Notes: ", "").trim(); |
234 | 197 |
taxonName.addAnnotation(Annotation.NewInstance(notesTxt, AnnotationType.EDITORIAL(), Language.DEFAULT())); |
... | ... | |
317 | 280 |
|
318 | 281 |
} |
319 | 282 |
|
283 |
private Partial parsePubDate(String regNumber, String nomRefStr, String nomRefPupDate) { |
|
284 |
|
|
285 |
Partial pupDate = null; |
|
286 |
boolean parseError = false; |
|
287 |
String nomRefPupDay = null; |
|
288 |
String nomRefPupMonth = null; |
|
289 |
String nomRefPupMonthName = null; |
|
290 |
String nomRefPupYear = null; |
|
291 |
|
|
292 |
|
|
293 |
// nomRefDetail.replaceAll("[\\:\\.\\s]", ""); // TODO integrate into nomRefTokenizeP |
|
294 |
for(Pattern p : nomRefPubDatePs){ |
|
295 |
Matcher m2 = p.matcher(nomRefPupDate); |
|
296 |
if(m2.matches()){ |
|
297 |
try { |
|
298 |
nomRefPupYear = m2.group("year"); |
|
299 |
} catch (IllegalArgumentException e){ |
|
300 |
// named capture group not found |
|
301 |
} |
|
302 |
try { |
|
303 |
nomRefPupMonth = m2.group("month"); |
|
304 |
} catch (IllegalArgumentException e){ |
|
305 |
// named capture group not found |
|
306 |
} |
|
307 |
try { |
|
308 |
nomRefPupMonthName = m2.group("monthName"); |
|
309 |
nomRefPupMonth = monthFromName(nomRefPupMonthName, regNumber); |
|
310 |
if(nomRefPupMonth == null){ |
|
311 |
parseError = true; |
|
312 |
} |
|
313 |
} catch (IllegalArgumentException e){ |
|
314 |
// named capture group not found |
|
315 |
} |
|
316 |
try { |
|
317 |
nomRefPupDay = m2.group("day"); |
|
318 |
} catch (IllegalArgumentException e){ |
|
319 |
// named capture group not found |
|
320 |
} |
|
321 |
|
|
322 |
if(nomRefPupYear == null){ |
|
323 |
logger.error("nomRefPupYear in " + nomRefStr + " is NULL" ); |
|
324 |
parseError = true; |
|
325 |
} |
|
326 |
if(nomRefPupYear.length() == 2 ){ |
|
327 |
// it is an abbreviated year from the 19** years |
|
328 |
nomRefPupYear = "19" + nomRefPupYear; |
|
329 |
} |
|
330 |
|
|
331 |
break; |
|
332 |
} |
|
333 |
} |
|
334 |
if(nomRefPupYear == null){ |
|
335 |
logger.warn("Pub date not found in [" + regNumber + "]: " + nomRefPupDate + " from " + nomRefStr ); |
|
336 |
parseError = true; |
|
337 |
} |
|
338 |
List<DateTimeFieldType> types = new ArrayList<>(); |
|
339 |
List<Integer> values = new ArrayList<>(); |
|
340 |
if(!parseError) { |
|
341 |
types.add(DateTimeFieldType.year()); |
|
342 |
values.add(Integer.parseInt(nomRefPupYear)); |
|
343 |
if (nomRefPupMonth != null) { |
|
344 |
types.add(DateTimeFieldType.monthOfYear()); |
|
345 |
values.add(Integer.parseInt(nomRefPupMonth)); |
|
346 |
} |
|
347 |
if (nomRefPupDay != null) { |
|
348 |
types.add(DateTimeFieldType.dayOfMonth()); |
|
349 |
values.add(Integer.parseInt(nomRefPupDay)); |
|
350 |
} |
|
351 |
pupDate = new Partial(types.toArray(new DateTimeFieldType[types.size()]), ArrayUtils.toPrimitive(values.toArray(new Integer[values.size()]))); |
|
352 |
} |
|
353 |
return pupDate; |
|
354 |
} |
|
355 |
|
|
320 | 356 |
private String monthFromName(String monthName, String regNumber) { |
321 | 357 |
|
322 | 358 |
Integer month = monthFromNameMap.get(monthName.toLowerCase()); |
Also available in: Unified diff
ref #6026 publication date parsing completed