Project

General

Profile

Revision 9d8ab33d

ID9d8ab33d3d9c5a5f47f2e901b838b4dea4cfc207
Parent 307c8d15
Child dcad6203

Added by Patrick Plitzner over 1 year ago

Import categories and states of FloraTerms (PlantGlossary)

View differences:

app-import/src/main/java/eu/etaxonomy/cdm/io/plantglossary/PlantGlossaryActivator.java
1 1
package eu.etaxonomy.cdm.io.plantglossary;
2 2

  
3
import java.io.FileInputStream;
4
import java.io.FileNotFoundException;
5
import java.io.InputStreamReader;
3
import java.io.IOException;
4
import java.net.URI;
6 5

  
7 6
import org.apache.log4j.Logger;
8 7

  
......
17 16
	@SuppressWarnings("unused")
18 17
	private static final Logger logger = Logger.getLogger(PlantGlossaryActivator.class);
19 18

  
20
	private void doImport(ICdmDataSource cdmDestination) throws FileNotFoundException{
19
	private void doImport(ICdmDataSource cdmDestination) throws IOException{
21 20

  
22 21
	    /*
23 22
	     * Source file:
24
	     * https://github.com/biosemantics/glossaries/blob/925f2c1691ed00bf2b9a9cd7f83609cffae47145/Plant/0.11/Plant_glossary_term_category.csv
23
	     * extracted data from https://terms.tdwg.org
25 24
	     *
26
	     * Cleaning data:
27
	     *  - remove all comments in csv file
28
	     *  - fix "coetaneouser" by adding missing parameter for "remarks" -> "active"
25
	     * Cleaning data with OpenRefine:
26
	     *  - generated URI column
27
	     *  - parsed term descriptions by crawling the term HTML pages (descriptions are not available via the web interface)
29 28
	     */
30
        FileInputStream inStream = new FileInputStream("/home/pplitzner/plantglossary.csv");
31
		PlantGlossaryCsvImportConfigurator config = PlantGlossaryCsvImportConfigurator.NewInstance(new InputStreamReader(inStream), cdmDestination);
29
	    URI uri = URI.create("file:/home/pplitzner/projects/Additivity/plant_glossary_states.csv");
30
		PlantGlossaryCsvImportConfigurator config = PlantGlossaryCsvImportConfigurator.NewInstance(uri, cdmDestination);
32 31
		config.setCheck(CHECK.IMPORT_WITHOUT_CHECK);
33 32
		config.setDbSchemaValidation(DbSchemaValidation.VALIDATE);
34 33

  
......
42 41
	public static void main(String[] args) {
43 42
		PlantGlossaryActivator activator = new PlantGlossaryActivator();
44 43
		try {
45
	        ICdmDataSource dataSource = CdmDestinations.makeDestination(DatabaseTypeEnum.MySQL, "127.0.0.1", "additivity", 3306, "root", null);
44
	        ICdmDataSource dataSource = CdmDestinations.makeDestination(DatabaseTypeEnum.MySQL, "127.0.0.1", "empty", 3306, "root", null);
46 45
            activator.doImport(dataSource);
47
        } catch (FileNotFoundException e) {
46
        } catch (IOException e) {
48 47
            e.printStackTrace();
49 48
        }
50 49
	}
app-import/src/main/java/eu/etaxonomy/cdm/io/plantglossary/PlantGlossaryCategoryImport.java
1
/**
2
* Copyright (C) 2017 EDIT
3
* European Distributed Institute of Taxonomy
4
* http://www.e-taxonomy.eu
5
*
6
* The contents of this file are subject to the Mozilla Public License Version 1.1
7
* See LICENSE.TXT at the top of this package for the full license terms.
8
*/
9
package eu.etaxonomy.cdm.io.plantglossary;
10

  
11
import java.io.File;
12
import java.net.URI;
13
import java.util.Map;
14

  
15
import org.apache.log4j.Logger;
16
import org.springframework.stereotype.Component;
17

  
18
import eu.etaxonomy.cdm.common.CdmUtils;
19
import eu.etaxonomy.cdm.io.csv.in.CsvImportBase;
20
import eu.etaxonomy.cdm.model.common.Annotation;
21
import eu.etaxonomy.cdm.model.common.AnnotationType;
22
import eu.etaxonomy.cdm.model.common.IdentifiableSource;
23
import eu.etaxonomy.cdm.model.common.Language;
24
import eu.etaxonomy.cdm.model.common.OriginalSourceType;
25
import eu.etaxonomy.cdm.model.common.TermType;
26
import eu.etaxonomy.cdm.model.common.TermVocabulary;
27

  
28
/**
29
 *
30
 * @author pplitzner
31
 * @since Dec 7, 2018
32
 *
33
 */
34
@Component
35
public class PlantGlossaryCategoryImport extends CsvImportBase<PlantGlossaryCsvImportConfigurator, PlantGlossaryCsvImportState, File>{
36
    private static final long serialVersionUID = -5600766240192189822L;
37
    private static Logger logger = Logger.getLogger(PlantGlossaryCategoryImport.class);
38

  
39
    @Override
40
    protected void handleSingleLine(PlantGlossaryCsvImportState importState) {
41
        final String HEADER_LABEL = "rdfs:label";
42
        final String HEADER_DESCRIPTION = "skos:definition";
43
        final String HEADER_URI = "category_URI";
44
        final String HEADER_NOTES = "skos:notes";
45

  
46
        Map<String, String> currentRecord = importState.getCurrentRecord();
47

  
48
        String vocName = currentRecord.get(HEADER_LABEL);
49
        if(CdmUtils.isBlank(vocName)){
50
            // this line does not contain any vocabulary information
51
            return;
52
        }
53

  
54
        TermVocabulary existingVocabulary = importState.checkVocabularies(vocName, getVocabularyService());
55
        if(existingVocabulary!=null){
56
            return;
57
        }
58

  
59
        TermVocabulary<?> stateVoc = TermVocabulary.NewInstance(
60
                TermType.State,
61
                currentRecord.get(HEADER_DESCRIPTION),
62
                vocName,
63
                null,
64
                importState.getCitation().getUri());
65
        stateVoc.setUri(URI.create(currentRecord.get(HEADER_URI)));
66
        stateVoc.addAnnotation(Annotation.NewInstance(currentRecord.get(HEADER_NOTES), AnnotationType.EDITORIAL(), Language.ENGLISH()));
67

  
68
        importState.addVocabulary(stateVoc);
69

  
70
        stateVoc.addSource(IdentifiableSource.NewInstance(OriginalSourceType.Import, importState.getCitation().getTitle(), null, importState.getCitation(), null));
71

  
72
        getVocabularyService().saveOrUpdate(stateVoc);
73
    }
74

  
75
}
app-import/src/main/java/eu/etaxonomy/cdm/io/plantglossary/PlantGlossaryCsvImport.java
1
/**
2
* Copyright (C) 2017 EDIT
3
* European Distributed Institute of Taxonomy
4
* http://www.e-taxonomy.eu
5
*
6
* The contents of this file are subject to the Mozilla Public License Version 1.1
7
* See LICENSE.TXT at the top of this package for the full license terms.
8
*/
9
package eu.etaxonomy.cdm.io.plantglossary;
10

  
11
import java.io.File;
12
import java.util.Map;
13
import java.util.UUID;
14

  
15
import org.apache.log4j.Logger;
16
import org.springframework.stereotype.Component;
17

  
18
import eu.etaxonomy.cdm.io.csv.in.CsvImportBase;
19
import eu.etaxonomy.cdm.model.common.IdentifiableSource;
20
import eu.etaxonomy.cdm.model.common.OriginalSourceType;
21
import eu.etaxonomy.cdm.model.common.TermType;
22
import eu.etaxonomy.cdm.model.common.TermVocabulary;
23
import eu.etaxonomy.cdm.model.description.State;
24

  
25
/**
26
 *
27
 * @author pplitzner
28
 * @since Dec 7, 2018
29
 *
30
 */
31
@Component
32
public class PlantGlossaryCsvImport extends CsvImportBase<PlantGlossaryCsvImportConfigurator, PlantGlossaryCsvImportState, File>{
33
    private static final long serialVersionUID = -5600766240192189822L;
34
    private static Logger logger = Logger.getLogger(PlantGlossaryCsvImport.class);
35

  
36

  
37
    @Override
38
    protected void handleSingleLine(PlantGlossaryCsvImportState importState) {
39
        final String TERM_HEADER = "term";
40
        final String CATEGORY_HEADER = "category";
41
        final String HAS_SYN_HEADER = "hasSyn";
42
        final String SOURCE_HEADER = "sourceDataset";
43
        final String TERM_ID_HEADER = "termID";
44
        final String REMARK_HEADER = "remarks";
45

  
46
        Map<String, String> currentRecord = importState.getCurrentRecord();
47
        if(!currentRecord.get(REMARK_HEADER).equals("active")){
48
            String message = String.format(
49
                    "Line %s has obsolete data and was skipped", importState.getLine());
50
            logger.info(message);
51
            return;
52
        }
53

  
54
        State stateTerm = State.NewInstance(null, currentRecord.get(TERM_HEADER), null);
55
        stateTerm.setUuid(UUID.fromString(currentRecord.get(TERM_ID_HEADER)));
56

  
57
        String vocName = currentRecord.get(CATEGORY_HEADER);
58
        TermVocabulary vocabulary = importState.checkVocabularies(vocName);
59
        if(vocabulary==null){
60
            vocabulary = TermVocabulary.NewInstance(TermType.State, null, vocName, null, null);
61
            importState.addVocabulary(vocabulary);
62
        }
63
        vocabulary.addTerm(stateTerm);
64

  
65

  
66
        stateTerm.addSource(IdentifiableSource.NewInstance(OriginalSourceType.Import, importState.getCitation().getTitle(), null, importState.getCitation(), null));
67

  
68
        getVocabularyService().saveOrUpdate(vocabulary);
69
        getTermService().saveOrUpdate(stateTerm);
70
    }
71

  
72
}
app-import/src/main/java/eu/etaxonomy/cdm/io/plantglossary/PlantGlossaryCsvImportConfigurator.java
8 8
*/
9 9
package eu.etaxonomy.cdm.io.plantglossary;
10 10

  
11
import java.io.InputStreamReader;
11
import java.io.IOException;
12
import java.net.URI;
12 13

  
13 14
import eu.etaxonomy.cdm.database.ICdmDataSource;
14 15
import eu.etaxonomy.cdm.io.csv.in.CsvImportConfiguratorBase;
......
24 25

  
25 26
    private static final long serialVersionUID = 987286481306951779L;
26 27

  
27
    public static PlantGlossaryCsvImportConfigurator NewInstance(InputStreamReader file,
28
            ICdmDataSource cdmDestination) {
29
        return new PlantGlossaryCsvImportConfigurator(file, cdmDestination);
28
    public static PlantGlossaryCsvImportConfigurator NewInstance(URI source,
29
            ICdmDataSource cdmDestination) throws IOException {
30
        return new PlantGlossaryCsvImportConfigurator(source, cdmDestination);
30 31
    }
31 32

  
32 33
// ****************** CONSTRUCTOR *****************************/
33 34

  
34
    private PlantGlossaryCsvImportConfigurator(InputStreamReader file,
35
            ICdmDataSource cdmDestination){
36
        super(file, cdmDestination, null);
35
    private PlantGlossaryCsvImportConfigurator(URI source,
36
            ICdmDataSource cdmDestination) throws IOException{
37
        super(source, cdmDestination, null);
37 38
    }
38 39

  
39 40
// *************************************
......
41 42

  
42 43
    @Override
43 44
    @SuppressWarnings("unchecked")
44
    protected void makeIoClassList(){
45
        ioClassList = new Class[]{
46
            PlantGlossaryCsvImport.class,
47
        };
45
    protected void makeIoClassList() {
46
        ioClassList = new Class[] {
47
                PlantGlossaryCategoryImport.class,
48
                PlantGlossaryStateImport.class };
48 49
    }
49 50

  
50 51
    @Override
app-import/src/main/java/eu/etaxonomy/cdm/io/plantglossary/PlantGlossaryCsvImportState.java
10 10

  
11 11
import java.net.URI;
12 12
import java.net.URISyntaxException;
13
import java.util.ArrayList;
13 14
import java.util.HashSet;
15
import java.util.List;
14 16
import java.util.Set;
15 17

  
18
import eu.etaxonomy.cdm.api.service.ITermService;
19
import eu.etaxonomy.cdm.api.service.IVocabularyService;
16 20
import eu.etaxonomy.cdm.io.csv.in.CsvImportState;
17 21
import eu.etaxonomy.cdm.model.agent.Institution;
18 22
import eu.etaxonomy.cdm.model.agent.Person;
23
import eu.etaxonomy.cdm.model.agent.Team;
19 24
import eu.etaxonomy.cdm.model.common.TermVocabulary;
20 25
import eu.etaxonomy.cdm.model.common.VerbatimTimePeriod;
26
import eu.etaxonomy.cdm.model.description.State;
21 27
import eu.etaxonomy.cdm.model.reference.Reference;
22 28
import eu.etaxonomy.cdm.model.reference.ReferenceFactory;
23 29

  
......
29 35
 */
30 36
public class PlantGlossaryCsvImportState extends CsvImportState<PlantGlossaryCsvImportConfigurator> {
31 37

  
38
    private List<TermVocabulary> existingVocabularies = new ArrayList<>();
39
    private List<State> existingTerms = new ArrayList<>();
32 40
    private Set<TermVocabulary> vocabularies = new HashSet<>();
33 41
    private final Reference citation;
34 42

  
......
36 44
    protected PlantGlossaryCsvImportState(PlantGlossaryCsvImportConfigurator config) {
37 45
        super(config);
38 46
        citation = ReferenceFactory.newGeneric();
39
        citation.setTitle("fna_gloss_final_20130517");
40
        Person authorship = Person.NewInstance(null, "Cui", null, "Hong");
41
        citation.setAuthorship(authorship);
47
        citation.setTitle("FloraTerms");
48
        Team team = Team.NewInstance();
49
        team.addTeamMember(Person.NewInstance(null, "Cui", null, "Hong"));
50
        team.addTeamMember(Person.NewInstance(null, "Cole", null, "Heather"));
51
        team.addTeamMember(Person.NewInstance(null, "Endara", null, "Lorena"));
52
        team.addTeamMember(Person.NewInstance(null, "Macklin", null, "James"));
53
        team.addTeamMember(Person.NewInstance(null, "Sachs", null, "Joel"));
54
        citation.setAuthorship(team);
42 55
        VerbatimTimePeriod datePublished = VerbatimTimePeriod.NewVerbatimInstance();
43 56
        datePublished.setStartYear(2014);
44 57
        datePublished.setStartMonth(6);
45 58
        datePublished.setStartDay(13);
46 59
        citation.setDatePublished(datePublished);
47 60
        Institution institution = Institution.NewNamedInstance("OTO System");
61
        institution.addUrl(URI.create("http://biosemantics.arizona.edu/OTO/"));
48 62
        citation.setInstitution(institution);
49
        citation.setEdition("Version: 0.11");
50 63
        URI uri;
51 64
        try {
52
            uri = new URI("https://github.com/biosemantics/glossaries/blob/925f2c1691ed00bf2b9a9cd7f83609cffae47145/Plant/0.11/Plant_glossary_term_category.csv");
65
            uri = new URI("https://terms.tdwg.org/wiki/FloraTerms");
53 66
            citation.setUri(uri);
54 67
        } catch (URISyntaxException e) {
55 68
        }
......
64 77
        vocabularies.add(vocabulary);
65 78
    }
66 79

  
67
    TermVocabulary checkVocabularies(String vocName){
80
    TermVocabulary checkVocabularies(String vocName, IVocabularyService vocabularyService){
81
        if(existingVocabularies.isEmpty()){
82
            existingVocabularies = vocabularyService.list(TermVocabulary.class, null, null, null, null);
83
        }
68 84
        for (TermVocabulary termVocabulary : vocabularies) {
69 85
            if(termVocabulary.getLabel().equals(vocName)){
70 86
                return termVocabulary;
......
73 89
        return null;
74 90
    }
75 91

  
92
    public boolean isTermPresent(String termName, ITermService termService) {
93
        if(existingTerms.isEmpty()){
94
            existingTerms = termService.list(State.class, null, null, null, null);
95
        }
96
        return existingTerms.stream().map(term->term.getLabel()).anyMatch(label->label.equals(termName));
97
    }
98

  
76 99
    Reference getCitation() {
77 100
        return citation;
78 101
    }
app-import/src/main/java/eu/etaxonomy/cdm/io/plantglossary/PlantGlossaryStateImport.java
1
/**
2
* Copyright (C) 2017 EDIT
3
* European Distributed Institute of Taxonomy
4
* http://www.e-taxonomy.eu
5
*
6
* The contents of this file are subject to the Mozilla Public License Version 1.1
7
* See LICENSE.TXT at the top of this package for the full license terms.
8
*/
9
package eu.etaxonomy.cdm.io.plantglossary;
10

  
11
import java.io.File;
12
import java.net.URI;
13
import java.util.Map;
14

  
15
import org.apache.log4j.Logger;
16
import org.springframework.stereotype.Component;
17

  
18
import eu.etaxonomy.cdm.io.csv.in.CsvImportBase;
19
import eu.etaxonomy.cdm.model.common.Annotation;
20
import eu.etaxonomy.cdm.model.common.AnnotationType;
21
import eu.etaxonomy.cdm.model.common.IdentifiableSource;
22
import eu.etaxonomy.cdm.model.common.Language;
23
import eu.etaxonomy.cdm.model.common.OriginalSourceType;
24
import eu.etaxonomy.cdm.model.common.TermVocabulary;
25
import eu.etaxonomy.cdm.model.description.State;
26

  
27
/**
28
 *
29
 * @author pplitzner
30
 * @since Dec 7, 2018
31
 *
32
 */
33
@Component
34
public class PlantGlossaryStateImport extends CsvImportBase<PlantGlossaryCsvImportConfigurator, PlantGlossaryCsvImportState, File>{
35
    private static final long serialVersionUID = -5600766240192189822L;
36
    private static Logger logger = Logger.getLogger(PlantGlossaryStateImport.class);
37

  
38
    final String HEADER_LABEL = "dcterms:identifier";
39
    final String HEADER_DEFINITION = "definition";
40
    final String HEADER_CATEGORY = "vann:termGroup";
41
    final String HEADER_NOTES = "skos:example";
42
    final String SOURCE_HEADER = "sourceDataset";
43
    final String HEADER_URI = "term_URI";
44

  
45
    @Override
46
    protected void handleSingleLine(PlantGlossaryCsvImportState importState) {
47

  
48
        Map<String, String> currentRecord = importState.getCurrentRecord();
49

  
50
        String termLabel = currentRecord.get(HEADER_LABEL);
51
        //check if already present
52
        if(importState.isTermPresent(termLabel, getTermService())){
53
            return;
54
        }
55

  
56
        State stateTerm = State.NewInstance(currentRecord.get(HEADER_DEFINITION), termLabel, null);
57
        stateTerm.setUri(URI.create(currentRecord.get(HEADER_URI)));
58
        stateTerm.addAnnotation(Annotation.NewInstance(currentRecord.get(HEADER_NOTES), AnnotationType.EDITORIAL(), Language.ENGLISH()));
59

  
60
        String vocName = currentRecord.get(HEADER_CATEGORY);
61
        // TODO how should we handle multiple possible categories?
62
        // for now we just take the first one
63
        if(vocName.contains(",")){
64
            vocName = vocName.split(",")[0];
65
        }
66
        TermVocabulary vocabulary = importState.checkVocabularies(vocName, getVocabularyService());
67
        if(vocabulary==null){
68
            logger.error("No vocabulary found for term: "+stateTerm+" with vocName: "+vocName);
69
            return;
70
        }
71
        vocabulary.addTerm(stateTerm);
72

  
73
        stateTerm.addSource(IdentifiableSource.NewInstance(OriginalSourceType.Import, importState.getCitation().getTitle(), null, importState.getCitation(), null));
74

  
75
        getVocabularyService().saveOrUpdate(vocabulary);
76
        getTermService().saveOrUpdate(stateTerm);
77
    }
78

  
79
}

Also available in: Unified diff

Add picture from clipboard (Maximum size: 40 MB)