Project

General

Profile

« Previous | Next » 

Revision 4cebdf13

Added by Andreas Kohlbecker over 7 years ago

#6026 import of taxa and names working

View differences:

app-import/.gitignore
2 2
imports.iml
3 3
src/main/main.iml
4 4
src/test/test.iml
5
error.log
app-import/src/main/java/eu/etaxonomy/cdm/app/iapt/IAPTActivator.java
13 13
import java.net.URI;
14 14
import java.util.UUID;
15 15

  
16
import eu.etaxonomy.cdm.database.DatabaseTypeEnum;
17
import org.apache.log4j.Appender;
16 18
import org.apache.log4j.Logger;
17 19

  
18 20
import eu.etaxonomy.cdm.app.common.CdmDestinations;
......
23 25
import eu.etaxonomy.cdm.io.iapt.IAPTImportConfigurator;
24 26
import eu.etaxonomy.cdm.model.reference.Reference;
25 27
import eu.etaxonomy.cdm.model.reference.ReferenceFactory;
28
import org.apache.log4j.RollingFileAppender;
26 29

  
27 30

  
28 31
/**
......
34 37
    private static final Logger logger = Logger.getLogger(IAPTActivator.class);
35 38

  
36 39
    //database validation status (create, update, validate ...)
37
    static DbSchemaValidation hbm2dll = DbSchemaValidation.VALIDATE;
38

  
39
    static final ICdmDataSource cdmDestination = CdmDestinations.localH2();
40
//  static final ICdmDataSource cdmDestination = CdmDestinations.cdm_test_local_mysql_test();
41
//    static final ICdmDataSource cdmDestination = CdmDestinations.cdm_cuba_production();
40
    static DbSchemaValidation hbm2dll = DbSchemaValidation.CREATE;
41

  
42
    // Import destination used by main(). It starts out as null and is assigned
    // exactly once by the static initializer below, which builds a MySQL data
    // source for database "cdm_algea_registry" on 127.0.0.1 with user "edit".
    // NOTE(review): the port argument is -1 and the last argument is null;
    // presumably these mean "default port" and "password looked up elsewhere"
    // -- confirm against CdmDestinations.makeDestination.
    static ICdmDataSource cdmDestination = null;
43
    static {
44
        DatabaseTypeEnum dbType = DatabaseTypeEnum.MySQL;
45
        String cdmServer = "127.0.0.1";
46
        String cdmDB = "cdm_algea_registry";
47
        String cdmUserName = "edit";
48
        cdmDestination =  CdmDestinations.makeDestination(dbType, cdmServer, cdmDB, -1, cdmUserName, null);
49
        // cdmDestination = CdmDestinations.localH2();
50
    }
42 51

  
43 52
    static boolean invers = true;
44 53

  
......
110 119

  
111 120

  
112 121
    /**
     * Returns the location of the IAPT registration spreadsheet
     * (Registration_DB_from_BGBM17-cleaned.xls) as a file URI.
     *
     * Two revisions of the <code>File f</code> line are shown in this changeset
     * view. The first passes a path starting with a literal "~" to
     * <code>java.io.File</code>, which does not expand it to the home directory;
     * the second builds the path from the "user.home" system property instead.
     * NOTE(review): presumably the first is the removed line and the second the
     * added one -- confirm against the unified diff.
     *
     * @return the URI obtained from <code>File.toURI()</code> for that path
     */
    public static URI iapt() {
113
        File f = new File("~/data/Projekte/Algea Name Registry/registry/sources/IAPT/Registration_DB_from_BGBM17-cleaned.xls");
122
        File f = new File(System.getProperty("user.home") + "/data/Projekte/Algea Name Registry/registry/sources/IAPT/Registration_DB_from_BGBM17-cleaned.xls");
114 123
        return f.toURI();
115 124
    }
116 125

  
......
118 127
     * @param args
119 128
     */
120 129
    public static void main(String[] args) {
        // Entry point: creates an activator and runs doImport against the
        // statically configured cdmDestination; args is not read here.
130

  
121 131
        IAPTActivator me = new IAPTActivator();
122 132
        me.doImport(cdmDestination);
123 133
    }
app-import/src/main/java/eu/etaxonomy/cdm/io/iapt/IAPTExcelImport.java
14 14
import eu.etaxonomy.cdm.io.mexico.SimpleExcelTaxonImportState;
15 15
import eu.etaxonomy.cdm.model.common.*;
16 16
import eu.etaxonomy.cdm.model.name.*;
17
import eu.etaxonomy.cdm.model.reference.INomenclaturalReference;
18 17
import eu.etaxonomy.cdm.model.reference.Reference;
19 18
import eu.etaxonomy.cdm.model.taxon.*;
20
import eu.etaxonomy.cdm.strategy.exceptions.StringNotParsableException;
21
import eu.etaxonomy.cdm.strategy.parser.INonViralNameParser;
22 19
import eu.etaxonomy.cdm.strategy.parser.NonViralNameParserImpl;
20
import org.apache.commons.lang.StringEscapeUtils;
23 21
import org.apache.commons.lang.StringUtils;
22
import org.apache.log4j.Level;
24 23
import org.apache.log4j.Logger;
25 24
import org.springframework.stereotype.Component;
26 25

  
27 26
import java.util.*;
27
import java.util.regex.Matcher;
28
import java.util.regex.Pattern;
28 29

  
29 30
/**
30 31
 * @author a.mueller
31 32
 * @created 05.01.2016
32 33
 */
33 34

  
34
@Component
35
@Component("iAPTExcelImport")
35 36
public class IAPTExcelImport<CONFIG extends IAPTImportConfigurator> extends SimpleExcelTaxonImport<CONFIG> {
36 37
    private static final long serialVersionUID = -747486709409732371L;
37 38
    private static final Logger logger = Logger.getLogger(IAPTExcelImport.class);
39
    public static final String ANNOTATION_MARKER_STRING = "[*]";
38 40

  
39 41

  
40 42
    private static UUID ROOT_UUID = UUID.fromString("4137fd2a-20f6-4e70-80b9-f296daf51d82");
41 43

  
42
    private static INonViralNameParser<?> nameParser = NonViralNameParserImpl.NewInstance();
44
    private static NonViralNameParserImpl nameParser = NonViralNameParserImpl.NewInstance();
43 45

  
44 46
    private final static String REGISTRATIONNO_PK= "RegistrationNo_Pk";
45 47
    private final static String HIGHERTAXON= "HigherTaxon";
......
61 63
    private  static List<String> expectedKeys= Arrays.asList(new String[]{
62 64
            REGISTRATIONNO_PK, HIGHERTAXON, FULLNAME, AUTHORSSPELLING, LITSTRING, REGISTRATION, TYPE, CAVEATS, FULLBASIONYM, FULLSYNSUBST, NOTESTXT, REGDATE, NAMESTRING, BASIONYMSTRING, SYNSUBSTSTR, AUTHORSTRING});
63 65

  
66
    // nomRefTokenizeP splits a nomenclatural reference string of the shape
    // "<title>: <detail>.<rest>" into three groups:
    //   group 1 = everything before the ": " separator (greedy),
    //   group 2 = the detail, one or more characters that are neither '.' nor ':',
    //   group 3 = whatever follows the '.' that ends the detail.
    // makeTaxon reads these as nomRefTitle, nomRefDetail and nomRefPupDate.
    //
    // nomRefPubYearExtractP extracts the publication year from that remainder.
    // It has two alternatives:
    //   1) a four character year starting with '1' in group 2, preceded by a
    //      reluctant prefix in group 1;
    //   2) a date such as "12.3.85." or "12/3/85." where group 3 is the
    //      separator ('.' or '/') and group 4 is the two-digit year.
    // NOTE(review): the character class [7,8,9] also matches a literal comma,
    // so e.g. "1,23" is accepted as a year; [789] is presumably what was meant.
    // NOTE(review): makeTaxon only reads group(2). When the second alternative
    // matches, group(2) is null and the two-digit year sits in group(4), so the
    // following nomRefPupYear.length() call would throw a NullPointerException.
    private static final Pattern nomRefTokenizeP = Pattern.compile("^(.*):\\s([^\\.:]+)\\.(.*)$");
67
    private static final Pattern nomRefPubYearExtractP = Pattern.compile("(.*?)(1[7,8,9][0-9]{2}).*$|^.*?[0-9]{1,2}([\\./])[0-1]?[0-9]\\3([0-9]{2})\\.$"); // 1700 - 1999
64 68

  
65 69
    private Taxon makeTaxon(HashMap<String, String> record, SimpleExcelTaxonImportState<CONFIG> state,
66 70
                            TaxonNode higherTaxonNode, boolean isSynonym) {
67 71

  
68 72
        String line = state.getCurrentLine() + ": ";
69 73

  
70
        String fullNameStr = getValue(record, FULLNAME);
71
        String nameStr = getValue(record, NAMESTRING);
72
        String authorStr = getValue(record, AUTHORSTRING);
74
        String titleCacheStr = getValue(record, FULLNAME, true);
75
        String nameStr = getValue(record, NAMESTRING, true);
76
        String authorStr = getValue(record, AUTHORSTRING, true);
77
        String nomRefStr = getValue(record, LITSTRING, true);
78

  
79
        String nomRefTitle = null;
80
        String nomRefDetail = null;
81
        String nomRefPupDate = null;
82
        String nomRefPupYear = null;
83

  
84
        // preprocess nomRef: separate citation, reference detail, publishing date
85
        if(!StringUtils.isEmpty(nomRefStr)){
86
            nomRefStr = nomRefStr.trim();
87
            Matcher m = nomRefTokenizeP.matcher(nomRefStr);
88
            if(m.matches()){
89
                nomRefTitle = m.group(1);
90
                nomRefDetail = m.group(2);
91
                nomRefPupDate = m.group(3);
92

  
93
                // nomRefDetail.replaceAll("[\\:\\.\\s]", ""); // TODO integrate into nomRefTokenizeP
94
                Matcher m2 = nomRefPubYearExtractP.matcher(nomRefPupDate);
95
                if(m2.matches()){
96
                    nomRefPupYear = m2.group(2);
97
                    if(nomRefPupYear.length() == 2 ){
98
                        // two-digit abbreviated year: assume it belongs to the 1900s (19xx)
99
                        nomRefPupYear = "19" + nomRefPupYear;
100
                    }
101
                    nomRefTitle = nomRefTitle + ": " + nomRefDetail + ". " + nomRefPupYear + ".";
102
                } else {
103
                    logger.warn("Pub year not found in " + nomRefStr );
104
                }
105

  
106
            } else {
107
                nomRefTitle = nomRefStr;
108
            }
109
        }
110

  
111

  
112
        BotanicalName taxonName;
113
        Map<String, AnnotationType> nameAnnotations = new HashMap<>();
73 114

  
74
        String sourceReference = getValue(record, LITSTRING);
115
        if(titleCacheStr.endsWith(ANNOTATION_MARKER_STRING) && authorStr.endsWith(ANNOTATION_MARKER_STRING)){
116
            nameAnnotations.put("Author abbreviation not checked.", AnnotationType.EDITORIAL());
117
            titleCacheStr = titleCacheStr.replace(ANNOTATION_MARKER_STRING, "").trim();
118
            authorStr = authorStr.replace(ANNOTATION_MARKER_STRING, "").trim();
119
        }
120

  
121
        if(!StringUtils.isEmpty(nomRefTitle)){
122
            String referenceSeparator = nomRefTitle.startsWith("in ") ? " " : ", ";
123
            String taxonFullNameStr = titleCacheStr + referenceSeparator + nomRefTitle;
124
            logger.debug(":::::" + taxonFullNameStr);
125
            taxonName = (BotanicalName) nameParser.parseReferencedName(taxonFullNameStr, NomenclaturalCode.ICNAFP, null);
126
        } else {
127
            taxonName = (BotanicalName) nameParser.parseFullName(titleCacheStr, NomenclaturalCode.ICNAFP, null);
128
        }
75 129

  
76
        BotanicalName taxonName = (BotanicalName) nameParser.parseFullName(fullNameStr, NomenclaturalCode.ICNAFP, null);
77 130
        if (taxonName.isProtectedTitleCache()) {
78
            logger.warn(line + "Name could not be parsed: " + fullNameStr);
131
            logger.warn(line + "Name could not be parsed: " + titleCacheStr);
79 132
        } else {
133

  
134
            boolean doRestoreTitleCacheStr = false;
135
            // Check titleCache
136
            String generatedTitleCache = taxonName.getTitleCache();
137
            if (!generatedTitleCache.trim().equals(titleCacheStr)) {
138
                logger.warn(line + "The generated titleCache differs from the imported string : " + generatedTitleCache + " <> " + titleCacheStr + " will restore original titleCacheStr");
139
                doRestoreTitleCacheStr = true;
140
            }
80 141
            // Check Name
81
            if (!taxonName.getNameCache().equals(nameStr)) {
142
            if (!taxonName.getNameCache().trim().equals(nameStr)) {
82 143
                logger.warn(line + "parsed nameCache differs from " + NAMESTRING + " : " + taxonName.getNameCache() + " <> " + nameStr);
83 144
            }
84
            // Check Author
85
            INomenclaturalReference nomRef = taxonName.getNomenclaturalReference();
86
            if (!nomRef.getAuthorship().getTitleCache().equals(authorStr)) {
87
                logger.warn(line + "parsed nomRef.authorship differs from " + AUTHORSTRING + " : " + nomRef.getAuthorship().getTitleCache() + " <> " + authorStr);
88
                // preserve current titleCache
89
                taxonName.setProtectedTitleCache(true);
90
                try {
91
                    nameParser.parseAuthors(taxonName, authorStr);
92
                } catch (StringNotParsableException e) {
93
                    logger.error("    " + authorStr + " can not be parsed");
94
                }
145

  
146
            //  Author
147
            //nameParser.handleAuthors(taxonName, titleCacheStr, authorStr);
148
            //if (!titleCacheStr.equals(taxonName.getTitleCache())) {
149
            //    logger.warn(line + "titleCache has changed after setting authors, will restore original titleCacheStr");
150
            //    doRestoreTitleCacheStr = true;
151
            //}
152

  
153
            if(doRestoreTitleCacheStr){
154
                taxonName.setTitleCache(titleCacheStr, true);
95 155
            }
96 156

  
97 157
            // deduplicate
98 158
            replaceAuthorNamesAndNomRef(state, taxonName);
159

  
160
            // Annotations
161
            if(!nameAnnotations.isEmpty()){
162
                for(String text : nameAnnotations.keySet()){
163
                    taxonName.addAnnotation(Annotation.NewInstance(text, nameAnnotations.get(text), Language.DEFAULT()));
164
                }
165
                getNameService().save(taxonName);
166
            }
99 167
        }
100 168

  
101 169
        Reference sec = state.getConfig().getSecReference();
......
150 218
    /**
151 219
     * @param record
152 220
     * @param originalKey
221
     * @param doUnescapeHtmlEntities
153 222
     * @return
154 223
     */
155
    private String getValue(HashMap<String, String> record, String originalKey) {
224
    private String getValue(HashMap<String, String> record, String originalKey, boolean doUnescapeHtmlEntities) {
156 225
        String value = record.get(originalKey);
157 226
        if (! StringUtils.isBlank(value)) {
158
        	if (logger.isDebugEnabled()) { logger.debug(originalKey + ": " + value); }
227
        	if (logger.isDebugEnabled()) {
228
        	    logger.debug(originalKey + ": " + value);
229
        	}
159 230
        	value = CdmUtils.removeDuplicateWhitespace(value.trim()).toString();
160
        	return value;
231
            if(doUnescapeHtmlEntities){
232
                value = StringEscapeUtils.unescapeHtml(value);
233
            }
234
        	return value.trim();
161 235
        }else{
162 236
        	return null;
163 237
        }
......
174 248
	    boolean isSynonymOnly = false;
175 249

  
176 250
        String line = state.getCurrentLine() + ": ";
251
        logger.setLevel(Level.DEBUG);
177 252
        HashMap<String, String> record = state.getOriginalRecord();
253
        logger.debug(record.toString());
178 254

  
179 255
        Set<String> keys = record.keySet();
180 256
        for (String key: keys) {
......
183 259
            }
184 260
        }
185 261

  
262
        String reg_id = record.get(REGISTRATIONNO_PK);
186 263
        //higherTaxon
187 264
        TaxonNode higherTaxon = getHigherTaxon(record, (IAPTImportState)state);
188 265

  
......
219 296

  
220 297
        ITaxonTreeNode rootNode = getClassification(state);
221 298
        for (String htn :  higherTaxaNames) {
222
            htn = htn.trim();
299
            htn = StringUtils.capitalize(htn.trim());
223 300
            Taxon higherTaxon = state.getHigherTaxon(htn);
224 301
            if (higherTaxon != null){
225 302
                higherTaxonNode = higherTaxon.getTaxonNodes().iterator().next();
......
227 304
                BotanicalName name = makeHigherTaxonName(state, htn);
228 305
                Reference sec = state.getSecReference();
229 306
                higherTaxon = Taxon.NewInstance(name, sec);
307
                getTaxonService().save(higherTaxon);
230 308
                higherTaxonNode = rootNode.addChildTaxon(higherTaxon, sec, null);
231 309
                state.putHigherTaxon(htn, higherTaxon);
232
                rootNode = higherTaxonNode;
310
                getClassificationService().saveTreeNode(higherTaxonNode);
233 311
            }
312
            rootNode = higherTaxonNode;
234 313
        }
235 314
        return higherTaxonNode;
236 315
    }
237 316

  
238 317
    private BotanicalName makeHigherTaxonName(IAPTImportState state, String name) {
239
        // Abteilung: -phyta (bei Pflanzen), -mycota (bei Pilzen)
240
        // Unterabteilung: -phytina (bei Pflanzen), -mycotina (bei Pilzen)
241
        // Klasse: -opsida (bei Pflanzen), -phyceae (bei Algen), -mycetes (bei Pilzen)
242
        // Unterklasse: -idae (bei Pflanzen), -phycidae (bei Algen), -mycetidae (bei Pilzen)
243
        // Ordnung: -ales
244
        // Unterordnung: -ineae
245
        // Familie: -aceae
246
        // Unterfamilie: -oideae
247
        // Tribus: -eae
248
        // Subtribus: -inae
249
        Rank rank = Rank.UNKNOWN_RANK();
250
        if(name.matches("phyta$|mycota$")){
251
            rank = Rank.SECTION_BOTANY();
252
        } else if(name.matches("phytina$|mycotina$")){
253
            rank = Rank.SUBSECTION_BOTANY();
254
        } else if(name.matches("opsida$|phyceae$|mycetes$")){
255
            rank = Rank.CLASS();
256
        } else if(name.matches("idae$|phycidae$|mycetidae$")){
257
            rank = Rank.SUBCLASS();
258
        } else if(name.matches("ales$")){
259
            rank = Rank.ORDER();
260
        } else if(name.matches("ineae$")){
261
            rank = Rank.SUBORDER();
262
        } else if(name.matches("aceae$")){
263
            rank = Rank.FAMILY();
264
        } else if(name.matches("oideae$")){
265
            rank = Rank.SUBFAMILY();
266
        } else if(name.matches("eae$")){
267
            rank = Rank.TRIBE();
268
        } else if(name.matches("inae$")){
269
            rank = Rank.SUBTRIBE();
270
        }
318

  
319
        Rank rank = guessRank(name);
271 320

  
272 321
        BotanicalName taxonName = BotanicalName.NewInstance(rank);
273 322
        taxonName.addSource(makeOriginalSource(state));
......
275 324
        return taxonName;
276 325
    }
277 326

  
327
    /**
     * Guesses the rank of a higher taxon from the spelling of its name.
     *
     * Any parenthesized part is removed and the result is trimmed. The name is
     * then tested against a fixed sequence of patterns and the rank of the first
     * matching pattern is returned. The order of the tests matters: the more
     * specific endings (e.g. "phyceae", "ineae", "oideae") are tested before the
     * shorter endings they contain ("eae", and finally "ae").
     *
     * Matching uses <code>String.matches</code>, which must match the whole string
     * and is case sensitive, so the ^ and $ anchors in the patterns add nothing.
     * The alternatives ".*phycidae$" and ".*mycetidae$" are already covered by
     * ".*idae$" in the same test.
     *
     * @param name the higher taxon name; must not be null
     * @return the guessed rank, or <code>Rank.UNKNOWN_RANK()</code> if no pattern matches
     */
    private Rank guessRank(String name) {
328

  
329
        // normalize: drop everything from the first "(" to the last ")" and trim
330
        name = name.replaceAll("\\(.*\\)", "").trim();
331

  
332
        if(name.matches("^Plantae$|^Fungi$|^Musci$")){
333
           return Rank.KINGDOM();
334
        } else if(name.matches(".*incertae sedis$|^Fossil no group assigned$")){
335
           return Rank.FAMILY();
336
        } else if(name.matches(".*phyta$|.*mycota$")){
337
           return Rank.SECTION_BOTANY();
338
        } else if(name.matches(".*phytina$|.*mycotina$")){
339
           return Rank.SUBSECTION_BOTANY();
340
        } else if(name.matches(".*opsida$|.*phyceae$|.*mycetes$|.*ones$")){
341
           return Rank.CLASS();
342
        } else if(name.matches(".*idae$|.*phycidae$|.*mycetidae$")){
343
           return Rank.SUBCLASS();
344
        } else if(name.matches(".*ales$")){
345
           return Rank.ORDER();
346
        } else if(name.matches(".*ineae$")){
347
           return Rank.SUBORDER();
348
        } else if(name.matches(".*oideae$")){
349
           return Rank.SUBFAMILY();
350
        } else if(name.matches(".*eae$")){
351
           return Rank.TRIBE();
352
        } else if(name.matches(".*inae$")){
353
           return Rank.SUBTRIBE();
354
        } else if(name.matches(".*ae$")){
355
           return Rank.FAMILY();
356
        }
357
        return Rank.UNKNOWN_RANK();
358
    }
359

  
278 360

  
279 361
    /**
280 362
     * @param state
app-import/src/main/java/eu/etaxonomy/cdm/io/iapt/IAPTImportConfigurator.java
49 49

  
50 50
    /**
     * Creates the state object for an import run, passing this configurator
     * to the state's constructor.
     *
     * Two revisions of the return statement are shown in this changeset view:
     * one returns a SimpleExcelTaxonImportState, the other an IAPTImportState.
     * NOTE(review): presumably the first is the removed line and the second the
     * added one; IAPTExcelImport casts the state to IAPTImportState, which fits
     * that reading -- confirm against the unified diff.
     *
     * @return a new import state bound to this configurator
     */
    @Override
51 51
    public ImportStateBase getNewState() {
52
        return new SimpleExcelTaxonImportState<>(this);
52
        return new IAPTImportState(this);
53 53
    }
54 54

  
55 55
    @Override

Also available in: Unified diff