Project

General

Profile

Revision bde99bfd

IDbde99bfdda8992a2555253ab7e720bcaff9da177
Parent 5c7f2505
Child c83f34f1

Added by Andreas Kohlbecker about 5 years ago

ref #6009 improved publication date parsing

View differences:

app-import/src/main/java/eu/etaxonomy/cdm/io/iapt/IAPTExcelImport.java
26 26
import org.apache.commons.lang.StringUtils;
27 27
import org.apache.log4j.Level;
28 28
import org.apache.log4j.Logger;
29
import org.joda.time.DateTimeFieldType;
30
import org.joda.time.Partial;
29 31
import org.springframework.stereotype.Component;
30 32

  
31 33
import java.util.*;
......
68 70
    private  static List<String> expectedKeys= Arrays.asList(new String[]{
69 71
            REGISTRATIONNO_PK, HIGHERTAXON, FULLNAME, AUTHORSSPELLING, LITSTRING, REGISTRATION, TYPE, CAVEATS, FULLBASIONYM, FULLSYNSUBST, NOTESTXT, REGDATE, NAMESTRING, BASIONYMSTRING, SYNSUBSTSTR, AUTHORSTRING});
70 72

  
71
    private static final Pattern nomRefTokenizeP = Pattern.compile("^(.*):\\s([^\\.:]+)\\.(.*)$");
72
    private static final Pattern nomRefPubYearExtractP = Pattern.compile("(.*?)(1[7,8,9][0-9]{2}).*$|^.*?[0-9]{1,2}([\\./])[0-1]?[0-9]\\3([0-9]{2})\\.$"); // 1700 - 1999
73
    private static final Pattern nomRefTokenizeP = Pattern.compile("^(.*):\\s([^\\.:]+)\\.(.*?)\\.?$");
74
    private static final Pattern[] nomRefPubDatePs = new Pattern[]{
75
            // all patterns cover the years 1700 - 1999
76
            Pattern.compile("^(?<year>1[7,8,9][0-9]{2})$"), // only year, like '1969'
77
            Pattern.compile("^(?<day>[0-9]{1,2})([\\./])(?<month>[0-1]?[0-9])\\2(?<year>(?:1[7,8,9])?[0-9]{2})$"), // full date like 12.04.1969 or 12/04/1969
78
            Pattern.compile("^(?:(?<day>[0-9]{1,2})[\\./]?\\s)?(?<monthName>[\\S\\D]+)\\s(?<year>(?:1[7,8,9])?[0-9]{2})$") // full date like 12. April 1969 or april 1999 or April 99
79
        };
73 80
    private static final Pattern typeSplitPattern =  Pattern.compile("^(?:\"*[Tt]ype: (?<type>.*?))(?:[Hh]olotype:(?<holotype>.*?))?(?:[Ii]sotype[^:]*:(?<isotype>.*))?$");
81

  
82
    /**
     * Lookup table from month name to month number (1 = January ... 12 = December).
     * It is filled once by the static initializer below with the Czech, German, French
     * and English month names. All keys are lower case and carry no surrounding
     * whitespace, because monthFromName() lower-cases the name before looking it up.
     */
    private static Map<String, Integer> monthFromNameMap = new HashMap<>();
    static {
        // Czech month names; "červenec" must not carry a trailing space or it can never be matched
        String[] ck = new String[]{"leden", "únor", "březen", "duben", "květen", "červen", "červenec", "srpen", "září", "říjen", "listopad", "prosinec"};
        String[] fr = new String[]{"janvier", "février", "mars", "avril", "mai", "juin", "juillet", "août", "septembre", "octobre", "novembre", "décembre"};
        String[] de = new String[]{"januar", "februar", "märz", "april", "mai", "juni", "juli", "august", "september", "oktober", "november", "dezember"};
        String[] en = new String[]{"january", "february", "march", "april", "may", "june", "july", "august", "september", "october", "november", "december"};

        String[][] perLang = new String[][]{ck, de, fr, en};

        // Names shared between languages (e.g. "april", "mai") map to the same number,
        // so later languages overwriting earlier entries is harmless.
        for (String[] months : perLang) {
            for (int m = 1; m <= months.length; m++) {
                monthFromNameMap.put(months[m - 1], m);
            }
        }

        // special cases
        // The key has to be lower case: a capitalised "Mar" would be unreachable through
        // the lower-casing lookup in monthFromName().
        monthFromNameMap.put("mar", 3);
    }
100

  
74 101
    enum TypesName {
75 102
        type, holotype, isotype;
76 103

  
......
110 137
        String nomRefTitle = null;
111 138
        String nomRefDetail = null;
112 139
        String nomRefPupDate = null;
140
        String nomRefPupDay = null;
141
        String nomRefPupMonth = null;
142
        String nomRefPupMonthName = null;
113 143
        String nomRefPupYear = null;
114 144

  
115 145
        // preprocess nomRef: separate citation, reference detail, publishing date
......
119 149
            if(m.matches()){
120 150
                nomRefTitle = m.group(1);
121 151
                nomRefDetail = m.group(2);
122
                nomRefPupDate = m.group(3);
152
                nomRefPupDate = m.group(3).trim();
123 153

  
124 154
                // nomRefDetail.replaceAll("[\\:\\.\\s]", ""); // TODO integrate into nomRefTokenizeP
125
                Matcher m2 = nomRefPubYearExtractP.matcher(nomRefPupDate);
126
                if(m2.matches()){
127
                    nomRefPupYear = m2.group(2);
128
                    if(nomRefPupYear == null){
129
                        nomRefPupYear = m2.group(4);
130
                    }
131
                    if(nomRefPupYear == null){
132
                        logger.error("nomRefPupYear in " + nomRefStr + " is  NULL" );
155
                for(Pattern p : nomRefPubDatePs){
156
                    Matcher m2 = p.matcher(nomRefPupDate);
157
                    if(m2.matches()){
158
                        try {
159
                            nomRefPupYear = m2.group("year");
160
                        } catch (IllegalArgumentException e){
161
                            // named capture group not found
162
                        }
163
                        try {
164
                            nomRefPupMonth = m2.group("month");
165
                        } catch (IllegalArgumentException e){
166
                            // named capture group not found
167
                        }
168
                        try {
169
                            nomRefPupMonthName = m2.group("monthName");
170
                            nomRefPupMonth = monthFromName(nomRefPupMonthName);
171
                        } catch (IllegalArgumentException e){
172
                            // named capture group not found
173
                        }
174
                        try {
175
                            nomRefPupDay = m2.group("day");
176
                        } catch (IllegalArgumentException e){
177
                            // named capture group not found
178
                        }
179

  
180
                        if(nomRefPupYear == null){
181
                            logger.error("nomRefPupYear in " + nomRefStr + " is  NULL" );
182
                        }
183
                        if(nomRefPupYear.length() == 2 ){
184
                            // it is an abbreviated year from the 19** years
185
                            nomRefPupYear = "19" + nomRefPupYear;
186
                        }
187
                        nomRefTitle = nomRefTitle + ": " + nomRefDetail + ". " + nomRefPupYear + ".";
188
                        break;
133 189
                    }
134
                    if(nomRefPupYear.length() == 2 ){
135
                        // it is an abbreviated year from the 19** years
136
                        nomRefPupYear = "19" + nomRefPupYear;
137
                    }
138
                    nomRefTitle = nomRefTitle + ": " + nomRefDetail + ". " + nomRefPupYear + ".";
139
                } else {
140
                    logger.warn("Pub year not found in " + nomRefStr );
190
                }
191
                if(nomRefPupYear == null){
192
                    logger.warn("Pub year not found in " + nomRefPupDate + " from " + nomRefStr );
141 193
                    // FIXME in in J. Eur. Orchideen 30: 128. 30.09.97 (Vorabdr.).
142

  
143 194
                }
195
                List<DateTimeFieldType> types = new ArrayList<>();
196
                List<Integer> values = new ArrayList<>();
197
                if(nomRefPupYear != null){
198
                    types.add(DateTimeFieldType.year());
199
                    values.add(Integer.parseInt(nomRefPupYear));
200
                }
201
                if(nomRefPupMonth != null){
202
                    types.add(DateTimeFieldType.monthOfYear());
203
                    values.add(Integer.parseInt(nomRefPupMonth));
204
                }
205
                if(nomRefPupDay != null){
206
                    types.add(DateTimeFieldType.dayOfMonth());
207
                    values.add(Integer.parseInt(nomRefPupDay));
208
                }
209
                Partial pupDate = new Partial(types.toArray(new DateTimeFieldType[types.size()]), ArrayUtils.toPrimitive(values.toArray(new Integer[values.size()])));
144 210

  
145 211
            } else {
146 212
                nomRefTitle = nomRefStr;
......
237 303

  
238 304
    }
239 305

  
306
    private String monthFromName(String monthName) {
307

  
308
        Integer month = monthFromNameMap.get(monthName.toLowerCase());
309
        if(month == null){
310
            logger.warn("Unknown month: " + monthName);
311
            return null;
312
        } else {
313
            return month.toString();
314
        }
315
    }
316

  
317

  
240 318
    private void addSpecimenTypes(BotanicalName taxonName, FieldUnit fieldUnit, String typeStr, TypesName typeName, boolean multiple){
241 319
        if(StringUtils.isEmpty(typeStr)){
242 320
            return;
......
263 341
       }
264 342
    }
265 343

  
266
    private BotanicalName makeBotanicalName(SimpleExcelTaxonImportState<CONFIG> state, String titleCacheStr, String nameStr, String authorStr, String nomRefTitle) {
344
    private BotanicalName makeBotanicalName(SimpleExcelTaxonImportState<CONFIG> state, String titleCacheStr, String nameStr,
345
                                            String authorStr, String nomRefTitle) {
267 346

  
268 347
        BotanicalName taxonName;// cache field for the taxonName.titleCache
269 348
        String taxonNameTitleCache = null;

Also available in: Unified diff

Add picture from clipboard (Maximum size: 40 MB)