Project

General

Profile

« Previous | Next » 

Revision f465df63

Added by Andreas Kohlbecker over 7 years ago

ref #6026 publication date parsing completed

View differences:

app-import/src/main/java/eu/etaxonomy/cdm/io/iapt/IAPTExcelImport.java
28 28
import org.apache.log4j.Logger;
29 29
import org.joda.time.DateTimeFieldType;
30 30
import org.joda.time.Partial;
31
import org.joda.time.format.DateTimeFormat;
32
import org.joda.time.format.DateTimeFormatter;
31 33
import org.springframework.stereotype.Component;
32 34

  
33 35
import java.util.*;
......
81 83
            Pattern.compile("^(?<monthName>\\p{L}+\\.?),?\\s(?<year>(?:1[7,8,9])?[0-9]{2})$"), // April 99 or April, 1999 or Apr. 12
82 84
            Pattern.compile("^(?<day>[0-9]{1,2})([\\.\\-/])(?<month>[0-1]?[0-9])\\2(?<year>(?:1[7,8,9])?[0-9]{2})$"), // full date like 12.04.1969 or 12/04/1969 or 12-04-1969
83 85
            Pattern.compile("^(?<month>[0-1]?[0-9])([\\.\\-/])(?<year>(?:1[7,8,9])?[0-9]{2})$"), // partial date like 04.1969 or 04/1969 or 04-1969
86
            Pattern.compile("^(?<year>(?:1[7,8,9])?[0-9]{2})([\\.\\-/])(?<month>[0-1]?[0-9])$"),//  partial date like 1999-04
84 87
            Pattern.compile("^(?<day>[0-9]{1,2})(?:[\\./]|th|rd)?\\s(?<monthName>\\p{L}+\\.?),?\\s(?<year>(?:1[7,8,9])?[0-9]{2})$"), // full date like 12. April 1969 or april 1999 or 22 Dec.1999
85 88
        };
86 89
    private static final Pattern typeSplitPattern =  Pattern.compile("^(?:\"*[Tt]ype: (?<type>.*?))(?:[Hh]olotype:(?<holotype>.*?))?(?:[Ii]sotype[^:]*:(?<isotype>.*))?$");
......
112 115
        monthFromNameMap.put("Februari", 2);
113 116
    }
114 117

  
118
    /** Formats a date as its four-digit year only; shared constant, since Joda-Time formatters are immutable and thread-safe. */
    static final DateTimeFormatter formatterYear = DateTimeFormat.forPattern("yyyy");
119

  
115 120
    enum TypesName {
116 121
        type, holotype, isotype;
117 122

  
......
138 143
        String line = state.getCurrentLine() + ": ";
139 144

  
140 145
        String regNumber = getValue(record, REGISTRATIONNO_PK, false);
146
        String regStr = getValue(record, REGISTRATION, true);
141 147
        String titleCacheStr = getValue(record, FULLNAME, true);
142 148
        String nameStr = getValue(record, NAMESTRING, true);
143 149
        String authorStr = getValue(record, AUTHORSTRING, true);
......
149 155
        String synSubstStr = getValue(record, SYNSUBSTSTR, true);
150 156
        String typeStr = getValue(record, TYPE, true);
151 157

  
158

  
152 159
        String nomRefTitle = null;
153
        String nomRefDetail = null;
160
        String nomRefDetail;
154 161
        String nomRefPupDate = null;
155
        String nomRefPupDay = null;
156
        String nomRefPupMonth = null;
157
        String nomRefPupMonthName = null;
158
        String nomRefPupYear = null;
162
        Partial pupDate = null;
159 163

  
160 164
        // preprocess nomRef: separate citation, reference detail, publishing date
161 165
        if(!StringUtils.isEmpty(nomRefStr)){
......
166 170
                nomRefDetail = m.group(2);
167 171
                nomRefPupDate = m.group(3).trim();
168 172

  
169
                // nomRefDetail.replaceAll("[\\:\\.\\s]", ""); // TODO integrate into nomRefTokenizeP
170
                for(Pattern p : nomRefPubDatePs){
171
                    Matcher m2 = p.matcher(nomRefPupDate);
172
                    if(m2.matches()){
173
                        try {
174
                            nomRefPupYear = m2.group("year");
175
                        } catch (IllegalArgumentException e){
176
                            // named capture group not found
177
                        }
178
                        try {
179
                            nomRefPupMonth = m2.group("month");
180
                        } catch (IllegalArgumentException e){
181
                            // named capture group not found
182
                        }
183
                        try {
184
                            nomRefPupMonthName = m2.group("monthName");
185
                            nomRefPupMonth = monthFromName(nomRefPupMonthName, regNumber);
186
                        } catch (IllegalArgumentException e){
187
                            // named capture group not found
188
                        }
189
                        try {
190
                            nomRefPupDay = m2.group("day");
191
                        } catch (IllegalArgumentException e){
192
                            // named capture group not found
193
                        }
194

  
195
                        if(nomRefPupYear == null){
196
                            logger.error("nomRefPupYear in " + nomRefStr + " is  NULL" );
197
                        }
198
                        if(nomRefPupYear.length() == 2 ){
199
                            // it is an abbreviated year from the 19** years
200
                            nomRefPupYear = "19" + nomRefPupYear;
201
                        }
202
                        nomRefTitle = nomRefTitle + ": " + nomRefDetail + ". " + nomRefPupYear + ".";
203
                        break;
204
                    }
205
                }
206
                if(nomRefPupYear == null){
207
                    logger.warn("Pub date not found in [" + regNumber + "]: " + nomRefPupDate + " from " + nomRefStr );
208
                }
209
                List<DateTimeFieldType> types = new ArrayList<>();
210
                List<Integer> values = new ArrayList<>();
211
                if(nomRefPupYear != null){
212
                    types.add(DateTimeFieldType.year());
213
                    values.add(Integer.parseInt(nomRefPupYear));
214
                }
215
                if(nomRefPupMonth != null){
216
                    types.add(DateTimeFieldType.monthOfYear());
217
                    values.add(Integer.parseInt(nomRefPupMonth));
173
                pupDate = parsePubDate(regNumber, nomRefStr, nomRefPupDate);
174
                if (pupDate != null) {
175
                    nomRefTitle = nomRefTitle + ": " + nomRefDetail + ". " + pupDate.toString(formatterYear) + ".";
218 176
                }
219
                if(nomRefPupDay != null){
220
                    types.add(DateTimeFieldType.dayOfMonth());
221
                    values.add(Integer.parseInt(nomRefPupDay));
222
                }
223
                Partial pupDate = new Partial(types.toArray(new DateTimeFieldType[types.size()]), ArrayUtils.toPrimitive(values.toArray(new Integer[values.size()])));
224

  
225 177
            } else {
226 178
                nomRefTitle = nomRefStr;
227 179
            }
......
229 181

  
230 182
        BotanicalName taxonName = makeBotanicalName(state, titleCacheStr, nameStr, authorStr, nomRefTitle);
231 183

  
184
        // always add the original strings of parsed data as annotation
185
        taxonName.addAnnotation(Annotation.NewInstance("imported and parsed data strings:" +
186
                        "\n -  '" + LITSTRING + "': "+ nomRefStr +
187
                        "\n -  '" + TYPE + "': " + typeStr +
188
                        "\n -  '" + REGISTRATION  + "': " + regStr
189
                , AnnotationType.TECHNICAL(), Language.DEFAULT()));
190

  
191
        if(pupDate != null) {
192
            taxonName.getNomenclaturalReference().setDatePublished(TimePeriod.NewInstance(pupDate));
193
        }
194

  
232 195
        if(!StringUtils.isEmpty(notesTxt)){
233 196
            notesTxt = notesTxt.replace("Notes: ", "").trim();
234 197
            taxonName.addAnnotation(Annotation.NewInstance(notesTxt, AnnotationType.EDITORIAL(), Language.DEFAULT()));
......
317 280

  
318 281
    }
319 282

  
283
    private Partial parsePubDate(String regNumber, String nomRefStr, String nomRefPupDate) {
284

  
285
        Partial pupDate = null;
286
        boolean parseError = false;
287
        String nomRefPupDay = null;
288
        String nomRefPupMonth = null;
289
        String nomRefPupMonthName = null;
290
        String nomRefPupYear = null;
291

  
292

  
293
        // nomRefDetail.replaceAll("[\\:\\.\\s]", ""); // TODO integrate into nomRefTokenizeP
294
        for(Pattern p : nomRefPubDatePs){
295
            Matcher m2 = p.matcher(nomRefPupDate);
296
            if(m2.matches()){
297
                try {
298
                    nomRefPupYear = m2.group("year");
299
                } catch (IllegalArgumentException e){
300
                    // named capture group not found
301
                }
302
                try {
303
                    nomRefPupMonth = m2.group("month");
304
                } catch (IllegalArgumentException e){
305
                    // named capture group not found
306
                }
307
                try {
308
                    nomRefPupMonthName = m2.group("monthName");
309
                    nomRefPupMonth = monthFromName(nomRefPupMonthName, regNumber);
310
                    if(nomRefPupMonth == null){
311
                        parseError = true;
312
                    }
313
                } catch (IllegalArgumentException e){
314
                    // named capture group not found
315
                }
316
                try {
317
                    nomRefPupDay = m2.group("day");
318
                } catch (IllegalArgumentException e){
319
                    // named capture group not found
320
                }
321

  
322
                if(nomRefPupYear == null){
323
                    logger.error("nomRefPupYear in " + nomRefStr + " is  NULL" );
324
                    parseError = true;
325
                }
326
                if(nomRefPupYear.length() == 2 ){
327
                    // it is an abbreviated year from the 19** years
328
                    nomRefPupYear = "19" + nomRefPupYear;
329
                }
330

  
331
                break;
332
            }
333
        }
334
        if(nomRefPupYear == null){
335
            logger.warn("Pub date not found in [" + regNumber + "]: " + nomRefPupDate + " from " + nomRefStr );
336
            parseError = true;
337
        }
338
        List<DateTimeFieldType> types = new ArrayList<>();
339
        List<Integer> values = new ArrayList<>();
340
        if(!parseError) {
341
            types.add(DateTimeFieldType.year());
342
            values.add(Integer.parseInt(nomRefPupYear));
343
            if (nomRefPupMonth != null) {
344
                types.add(DateTimeFieldType.monthOfYear());
345
                values.add(Integer.parseInt(nomRefPupMonth));
346
            }
347
            if (nomRefPupDay != null) {
348
                types.add(DateTimeFieldType.dayOfMonth());
349
                values.add(Integer.parseInt(nomRefPupDay));
350
            }
351
            pupDate = new Partial(types.toArray(new DateTimeFieldType[types.size()]), ArrayUtils.toPrimitive(values.toArray(new Integer[values.size()])));
352
        }
353
        return pupDate;
354
    }
355

  
320 356
    private String monthFromName(String monthName, String regNumber) {
321 357

  
322 358
        Integer month = monthFromNameMap.get(monthName.toLowerCase());

Also available in: Unified diff