Project

General

Profile

Revision c83f34f1

IDc83f34f187800b66d14a90c503f1b886539c2654
Parent bde99bfd
Child f465df63

Added by Andreas Kohlbecker almost 4 years ago

ref #6009 improved publication date parsing
- all dates recognized
- all months recognized
- special time spans not implemented (Winter, Fall, April-June, etc)

View differences:

app-import/src/main/java/eu/etaxonomy/cdm/app/iapt/IAPTActivator.java
38 38
    public static final String DATA_FILE_0_100 = "fileURI-100.xls";
39 39
    public static final String DATA_FILE_FULL = "Registration_DB_from_BGBM17.xls";
40 40
    public static final String DATA_ENCODING_PROBLEMS = "encoding-problems.xls";
41
    public static final String DATA_FILE = DATA_ENCODING_PROBLEMS;
41
    public static final String DATA_FILE = DATA_FILE_FULL;
42 42

  
43 43
    //database validation status (create, update, validate ...)
44 44
    static DbSchemaValidation hbm2dll = DbSchemaValidation.CREATE;
app-import/src/main/java/eu/etaxonomy/cdm/io/iapt/IAPTExcelImport.java
72 72

  
73 73
    private static final Pattern nomRefTokenizeP = Pattern.compile("^(.*):\\s([^\\.:]+)\\.(.*?)\\.?$");
74 74
    private static final Pattern[] nomRefPubDatePs = new Pattern[]{
75
            // NOTE:
76
            // The order of the patterns is extremely important!!!
77
            //
75 78
            // all patterns cover the years 1700 - 1999
76 79
            Pattern.compile("^(?<year>1[7,8,9][0-9]{2})$"), // only year, like '1969'
77
            Pattern.compile("^(?<day>[0-9]{1,2})([\\./])(?<month>[0-1]?[0-9])\\2(?<year>(?:1[7,8,9])?[0-9]{2})$"), // full date like 12.04.1969 or 12/04/1969
78
            Pattern.compile("^(?:(?<day>[0-9]{1,2})[\\./]?\\s)?(?<monthName>[\\S\\D]+)\\s(?<year>(?:1[7,8,9])?[0-9]{2})$") // full date like 12. April 1969 or april 1999 or April 99
80
            Pattern.compile("^(?<monthName>\\p{L}+\\.?)\\s(?<day>[0-9]{1,2})(?:st|rd|th)?\\.?,?\\s(?<year>(?:1[7,8,9])?[0-9]{2})$"), // full date like April 12, 1969 or april 12th 1999
81
            Pattern.compile("^(?<monthName>\\p{L}+\\.?),?\\s(?<year>(?:1[7,8,9])?[0-9]{2})$"), // April 99 or April, 1999 or Apr. 12
82
            Pattern.compile("^(?<day>[0-9]{1,2})([\\.\\-/])(?<month>[0-1]?[0-9])\\2(?<year>(?:1[7,8,9])?[0-9]{2})$"), // full date like 12.04.1969 or 12/04/1969 or 12-04-1969
83
            Pattern.compile("^(?<month>[0-1]?[0-9])([\\.\\-/])(?<year>(?:1[7,8,9])?[0-9]{2})$"), // partial date like 04.1969 or 04/1969 or 04-1969
84
            Pattern.compile("^(?<day>[0-9]{1,2})(?:[\\./]|th|rd)?\\s(?<monthName>\\p{L}+\\.?),?\\s(?<year>(?:1[7,8,9])?[0-9]{2})$"), // full date like 12. April 1969 or april 1999 or 22 Dec.1999
79 85
        };
80 86
    private static final Pattern typeSplitPattern =  Pattern.compile("^(?:\"*[Tt]ype: (?<type>.*?))(?:[Hh]olotype:(?<holotype>.*?))?(?:[Ii]sotype[^:]*:(?<isotype>.*))?$");
81 87

  
......
85 91
        String[] fr = new String[]{"janvier", "février", "mars", "avril", "mai", "juin", "juillet", "août", "septembre", "octobre", "novembre", "décembre"};
86 92
        String[] de = new String[]{"januar", "februar", "märz", "april", "mai", "juni", "juli", "august", "september", "oktober", "november", "dezember"};
87 93
        String[] en = new String[]{"january", "february", "march", "april", "may", "june", "july", "august", "september", "october", "november", "december"};
94
        String[] it = new String[]{"gennaio", "febbraio", "marzo", "aprile", "maggio", "giugno", "luglio", "agosto", "settembre", "ottobre", "novembre", "dicembre"};
95
        String[] sp = new String[]{"enero", "febrero", "marzo", "abril", "mayo", "junio", "julio", "agosto", "septiembre", "octubre", "noviembre", "diciembre"};
96
        String[] de_abbrev = new String[]{"jan.", "feb.", "märz", "apr.", "mai", "jun.", "jul.", "aug.", "sept.", "okt.", "nov.", "dez."};
97
        String[] en_abbrev = new String[]{"jan.", "feb.", "mar.", "apr.", "may", "jun.", "jul.", "aug.", "sep.", "oct.", "nov.", "dec."};
98
        String[] port = new String[]{"Janeiro", "Fevereiro", "Março", "Abril", "Maio", "Junho", "Julho", "Agosto", "Setembro", "Outubro", "Novembro", "Dezembro"};
99
        String[] rom_num = new String[]{"i", "ii", "iii", "iv", "v", "vi", "vii", "viii", "ix", "x", "xi", "xii"};
88 100

  
89
        String[][] perLang =  new String[][]{ck, de, fr, en};
101
        String[][] perLang =  new String[][]{ck, de, fr, en, it, sp, port, de_abbrev, en_abbrev, rom_num};
90 102

  
91 103
        for (String[] months: perLang) {
92 104
            for(int m = 1; m < 13; m++){
93
                monthFromNameMap.put(months[m - 1], m);
105
                monthFromNameMap.put(months[m - 1].toLowerCase(), m);
94 106
            }
95 107
        }
96 108

  
97 109
        // special cases
98
        monthFromNameMap.put("Mar", 3);
110
        monthFromNameMap.put("mar", 3);
111
        monthFromNameMap.put("dec", 12);
112
        monthFromNameMap.put("Februari", 2);
99 113
    }
100 114

  
101 115
    enum TypesName {
......
123 137

  
124 138
        String line = state.getCurrentLine() + ": ";
125 139

  
140
        String regNumber = getValue(record, REGISTRATIONNO_PK, false);
126 141
        String titleCacheStr = getValue(record, FULLNAME, true);
127 142
        String nameStr = getValue(record, NAMESTRING, true);
128 143
        String authorStr = getValue(record, AUTHORSTRING, true);
......
167 182
                        }
168 183
                        try {
169 184
                            nomRefPupMonthName = m2.group("monthName");
170
                            nomRefPupMonth = monthFromName(nomRefPupMonthName);
185
                            nomRefPupMonth = monthFromName(nomRefPupMonthName, regNumber);
171 186
                        } catch (IllegalArgumentException e){
172 187
                            // named capture group not found
173 188
                        }
......
189 204
                    }
190 205
                }
191 206
                if(nomRefPupYear == null){
192
                    logger.warn("Pub year not found in " + nomRefPupDate + " from " + nomRefStr );
193
                    // FIXME in in J. Eur. Orchideen 30: 128. 30.09.97 (Vorabdr.).
207
                    logger.warn("Pub date not found in [" + regNumber + "]: " + nomRefPupDate + " from " + nomRefStr );
194 208
                }
195 209
                List<DateTimeFieldType> types = new ArrayList<>();
196 210
                List<Integer> values = new ArrayList<>();
......
303 317

  
304 318
    }
305 319

  
306
    private String monthFromName(String monthName) {
320
    private String monthFromName(String monthName, String regNumber) {
307 321

  
308 322
        Integer month = monthFromNameMap.get(monthName.toLowerCase());
309 323
        if(month == null){
310
            logger.warn("Unknown month: " + monthName);
324
            logger.warn("Unknown month [" + regNumber + "]: " + monthName + " (" + monthName.toLowerCase() + ")");
311 325
            return null;
312 326
        } else {
313 327
            return month.toString();

Also available in: Unified diff

Add picture from clipboard (Maximum size: 40 MB)