28 |
28 |
import org.apache.log4j.Logger;
|
29 |
29 |
import org.joda.time.DateTimeFieldType;
|
30 |
30 |
import org.joda.time.Partial;
|
|
31 |
import org.joda.time.format.DateTimeFormat;
|
|
32 |
import org.joda.time.format.DateTimeFormatter;
|
31 |
33 |
import org.springframework.stereotype.Component;
|
32 |
34 |
|
33 |
35 |
import java.util.*;
|
... | ... | |
81 |
83 |
Pattern.compile("^(?<monthName>\\p{L}+\\.?),?\\s(?<year>(?:1[7,8,9])?[0-9]{2})$"), // April 99 or April, 1999 or Apr. 12
|
82 |
84 |
Pattern.compile("^(?<day>[0-9]{1,2})([\\.\\-/])(?<month>[0-1]?[0-9])\\2(?<year>(?:1[7,8,9])?[0-9]{2})$"), // full date like 12.04.1969 or 12/04/1969 or 12-04-1969
|
83 |
85 |
Pattern.compile("^(?<month>[0-1]?[0-9])([\\.\\-/])(?<year>(?:1[7,8,9])?[0-9]{2})$"), // partial date like 04.1969 or 04/1969 or 04-1969
|
|
86 |
Pattern.compile("^(?<year>(?:1[7,8,9])?[0-9]{2})([\\.\\-/])(?<month>[0-1]?[0-9])$"),// partial date like 1999-04
|
84 |
87 |
Pattern.compile("^(?<day>[0-9]{1,2})(?:[\\./]|th|rd)?\\s(?<monthName>\\p{L}+\\.?),?\\s(?<year>(?:1[7,8,9])?[0-9]{2})$"), // full date like 12. April 1969 or april 1999 or 22 Dec.1999
|
85 |
88 |
};
|
86 |
89 |
// Splits a type statement into named groups: after optional leading double quotes and the
// literal "Type: " (or "type: ") the text is captured lazily as 'type'; an optional
// "Holotype:" / "holotype:" section is captured as 'holotype' and an optional
// "Isotype...:" / "isotype...:" section (any non-colon characters before the colon) as 'isotype'.
private static final Pattern typeSplitPattern = Pattern.compile("^(?:\"*[Tt]ype: (?<type>.*?))(?:[Hh]olotype:(?<holotype>.*?))?(?:[Ii]sotype[^:]*:(?<isotype>.*))?$");
|
... | ... | |
112 |
115 |
monthFromNameMap.put("Februari", 2);
|
113 |
116 |
}
|
114 |
117 |
|
|
118 |
// Joda-Time formatter for the pattern "yyyy"; used in this class to print the year of a parsed publication date.
DateTimeFormatter formatterYear = DateTimeFormat.forPattern("yyyy");
|
|
119 |
|
115 |
120 |
enum TypesName {
|
116 |
121 |
type, holotype, isotype;
|
117 |
122 |
|
... | ... | |
138 |
143 |
String line = state.getCurrentLine() + ": ";
|
139 |
144 |
|
140 |
145 |
String regNumber = getValue(record, REGISTRATIONNO_PK, false);
|
|
146 |
String regStr = getValue(record, REGISTRATION, true);
|
141 |
147 |
String titleCacheStr = getValue(record, FULLNAME, true);
|
142 |
148 |
String nameStr = getValue(record, NAMESTRING, true);
|
143 |
149 |
String authorStr = getValue(record, AUTHORSTRING, true);
|
... | ... | |
149 |
155 |
String synSubstStr = getValue(record, SYNSUBSTSTR, true);
|
150 |
156 |
String typeStr = getValue(record, TYPE, true);
|
151 |
157 |
|
|
158 |
|
152 |
159 |
String nomRefTitle = null;
|
153 |
|
String nomRefDetail = null;
|
|
160 |
String nomRefDetail;
|
154 |
161 |
String nomRefPupDate = null;
|
155 |
|
String nomRefPupDay = null;
|
156 |
|
String nomRefPupMonth = null;
|
157 |
|
String nomRefPupMonthName = null;
|
158 |
|
String nomRefPupYear = null;
|
|
162 |
Partial pupDate = null;
|
159 |
163 |
|
160 |
164 |
// preprocess nomRef: separate citation, reference detail, publishing date
|
161 |
165 |
if(!StringUtils.isEmpty(nomRefStr)){
|
... | ... | |
166 |
170 |
nomRefDetail = m.group(2);
|
167 |
171 |
nomRefPupDate = m.group(3).trim();
|
168 |
172 |
|
169 |
|
// nomRefDetail.replaceAll("[\\:\\.\\s]", ""); // TODO integrate into nomRefTokenizeP
|
170 |
|
for(Pattern p : nomRefPubDatePs){
|
171 |
|
Matcher m2 = p.matcher(nomRefPupDate);
|
172 |
|
if(m2.matches()){
|
173 |
|
try {
|
174 |
|
nomRefPupYear = m2.group("year");
|
175 |
|
} catch (IllegalArgumentException e){
|
176 |
|
// named capture group not found
|
177 |
|
}
|
178 |
|
try {
|
179 |
|
nomRefPupMonth = m2.group("month");
|
180 |
|
} catch (IllegalArgumentException e){
|
181 |
|
// named capture group not found
|
182 |
|
}
|
183 |
|
try {
|
184 |
|
nomRefPupMonthName = m2.group("monthName");
|
185 |
|
nomRefPupMonth = monthFromName(nomRefPupMonthName, regNumber);
|
186 |
|
} catch (IllegalArgumentException e){
|
187 |
|
// named capture group not found
|
188 |
|
}
|
189 |
|
try {
|
190 |
|
nomRefPupDay = m2.group("day");
|
191 |
|
} catch (IllegalArgumentException e){
|
192 |
|
// named capture group not found
|
193 |
|
}
|
194 |
|
|
195 |
|
if(nomRefPupYear == null){
|
196 |
|
logger.error("nomRefPupYear in " + nomRefStr + " is NULL" );
|
197 |
|
}
|
198 |
|
if(nomRefPupYear.length() == 2 ){
|
199 |
|
// it is an abbreviated year from the 19** years
|
200 |
|
nomRefPupYear = "19" + nomRefPupYear;
|
201 |
|
}
|
202 |
|
nomRefTitle = nomRefTitle + ": " + nomRefDetail + ". " + nomRefPupYear + ".";
|
203 |
|
break;
|
204 |
|
}
|
205 |
|
}
|
206 |
|
if(nomRefPupYear == null){
|
207 |
|
logger.warn("Pub date not found in [" + regNumber + "]: " + nomRefPupDate + " from " + nomRefStr );
|
208 |
|
}
|
209 |
|
List<DateTimeFieldType> types = new ArrayList<>();
|
210 |
|
List<Integer> values = new ArrayList<>();
|
211 |
|
if(nomRefPupYear != null){
|
212 |
|
types.add(DateTimeFieldType.year());
|
213 |
|
values.add(Integer.parseInt(nomRefPupYear));
|
214 |
|
}
|
215 |
|
if(nomRefPupMonth != null){
|
216 |
|
types.add(DateTimeFieldType.monthOfYear());
|
217 |
|
values.add(Integer.parseInt(nomRefPupMonth));
|
|
173 |
pupDate = parsePubDate(regNumber, nomRefStr, nomRefPupDate);
|
|
174 |
if (pupDate != null) {
|
|
175 |
nomRefTitle = nomRefTitle + ": " + nomRefDetail + ". " + pupDate.toString(formatterYear) + ".";
|
218 |
176 |
}
|
219 |
|
if(nomRefPupDay != null){
|
220 |
|
types.add(DateTimeFieldType.dayOfMonth());
|
221 |
|
values.add(Integer.parseInt(nomRefPupDay));
|
222 |
|
}
|
223 |
|
Partial pupDate = new Partial(types.toArray(new DateTimeFieldType[types.size()]), ArrayUtils.toPrimitive(values.toArray(new Integer[values.size()])));
|
224 |
|
|
225 |
177 |
} else {
|
226 |
178 |
nomRefTitle = nomRefStr;
|
227 |
179 |
}
|
... | ... | |
229 |
181 |
|
230 |
182 |
BotanicalName taxonName = makeBotanicalName(state, titleCacheStr, nameStr, authorStr, nomRefTitle);
|
231 |
183 |
|
|
184 |
// always add the original strings of parsed data as annotation
|
|
185 |
taxonName.addAnnotation(Annotation.NewInstance("imported and parsed data strings:" +
|
|
186 |
"\n - '" + LITSTRING + "': "+ nomRefStr +
|
|
187 |
"\n - '" + TYPE + "': " + typeStr +
|
|
188 |
"\n - '" + REGISTRATION + "': " + regStr
|
|
189 |
, AnnotationType.TECHNICAL(), Language.DEFAULT()));
|
|
190 |
|
|
191 |
if(pupDate != null) {
|
|
192 |
taxonName.getNomenclaturalReference().setDatePublished(TimePeriod.NewInstance(pupDate));
|
|
193 |
}
|
|
194 |
|
232 |
195 |
if(!StringUtils.isEmpty(notesTxt)){
|
233 |
196 |
notesTxt = notesTxt.replace("Notes: ", "").trim();
|
234 |
197 |
taxonName.addAnnotation(Annotation.NewInstance(notesTxt, AnnotationType.EDITORIAL(), Language.DEFAULT()));
|
... | ... | |
317 |
280 |
|
318 |
281 |
}
|
319 |
282 |
|
|
283 |
private Partial parsePubDate(String regNumber, String nomRefStr, String nomRefPupDate) {
|
|
284 |
|
|
285 |
Partial pupDate = null;
|
|
286 |
boolean parseError = false;
|
|
287 |
String nomRefPupDay = null;
|
|
288 |
String nomRefPupMonth = null;
|
|
289 |
String nomRefPupMonthName = null;
|
|
290 |
String nomRefPupYear = null;
|
|
291 |
|
|
292 |
|
|
293 |
// nomRefDetail.replaceAll("[\\:\\.\\s]", ""); // TODO integrate into nomRefTokenizeP
|
|
294 |
for(Pattern p : nomRefPubDatePs){
|
|
295 |
Matcher m2 = p.matcher(nomRefPupDate);
|
|
296 |
if(m2.matches()){
|
|
297 |
try {
|
|
298 |
nomRefPupYear = m2.group("year");
|
|
299 |
} catch (IllegalArgumentException e){
|
|
300 |
// named capture group not found
|
|
301 |
}
|
|
302 |
try {
|
|
303 |
nomRefPupMonth = m2.group("month");
|
|
304 |
} catch (IllegalArgumentException e){
|
|
305 |
// named capture group not found
|
|
306 |
}
|
|
307 |
try {
|
|
308 |
nomRefPupMonthName = m2.group("monthName");
|
|
309 |
nomRefPupMonth = monthFromName(nomRefPupMonthName, regNumber);
|
|
310 |
if(nomRefPupMonth == null){
|
|
311 |
parseError = true;
|
|
312 |
}
|
|
313 |
} catch (IllegalArgumentException e){
|
|
314 |
// named capture group not found
|
|
315 |
}
|
|
316 |
try {
|
|
317 |
nomRefPupDay = m2.group("day");
|
|
318 |
} catch (IllegalArgumentException e){
|
|
319 |
// named capture group not found
|
|
320 |
}
|
|
321 |
|
|
322 |
if(nomRefPupYear == null){
|
|
323 |
logger.error("nomRefPupYear in " + nomRefStr + " is NULL" );
|
|
324 |
parseError = true;
|
|
325 |
}
|
|
326 |
if(nomRefPupYear.length() == 2 ){
|
|
327 |
// it is an abbreviated year from the 19** years
|
|
328 |
nomRefPupYear = "19" + nomRefPupYear;
|
|
329 |
}
|
|
330 |
|
|
331 |
break;
|
|
332 |
}
|
|
333 |
}
|
|
334 |
if(nomRefPupYear == null){
|
|
335 |
logger.warn("Pub date not found in [" + regNumber + "]: " + nomRefPupDate + " from " + nomRefStr );
|
|
336 |
parseError = true;
|
|
337 |
}
|
|
338 |
List<DateTimeFieldType> types = new ArrayList<>();
|
|
339 |
List<Integer> values = new ArrayList<>();
|
|
340 |
if(!parseError) {
|
|
341 |
types.add(DateTimeFieldType.year());
|
|
342 |
values.add(Integer.parseInt(nomRefPupYear));
|
|
343 |
if (nomRefPupMonth != null) {
|
|
344 |
types.add(DateTimeFieldType.monthOfYear());
|
|
345 |
values.add(Integer.parseInt(nomRefPupMonth));
|
|
346 |
}
|
|
347 |
if (nomRefPupDay != null) {
|
|
348 |
types.add(DateTimeFieldType.dayOfMonth());
|
|
349 |
values.add(Integer.parseInt(nomRefPupDay));
|
|
350 |
}
|
|
351 |
pupDate = new Partial(types.toArray(new DateTimeFieldType[types.size()]), ArrayUtils.toPrimitive(values.toArray(new Integer[values.size()])));
|
|
352 |
}
|
|
353 |
return pupDate;
|
|
354 |
}
|
|
355 |
|
320 |
356 |
private String monthFromName(String monthName, String regNumber) {
|
321 |
357 |
|
322 |
358 |
Integer month = monthFromNameMap.get(monthName.toLowerCase());
|
ref #6026 publication date parsing completed