Import categories and states of FloraTerms (PlantGlossary)
author Patrick Plitzner <p.plitzner@bgbm.org>
Fri, 18 Jan 2019 12:22:52 +0000 (13:22 +0100)
committer Patrick Plitzner <p.plitzner@bgbm.org>
Fri, 18 Jan 2019 12:22:52 +0000 (13:22 +0100)
app-import/src/main/java/eu/etaxonomy/cdm/io/plantglossary/PlantGlossaryActivator.java
app-import/src/main/java/eu/etaxonomy/cdm/io/plantglossary/PlantGlossaryCategoryImport.java [new file with mode: 0644]
app-import/src/main/java/eu/etaxonomy/cdm/io/plantglossary/PlantGlossaryCsvImportConfigurator.java
app-import/src/main/java/eu/etaxonomy/cdm/io/plantglossary/PlantGlossaryCsvImportState.java
app-import/src/main/java/eu/etaxonomy/cdm/io/plantglossary/PlantGlossaryStateImport.java [moved from app-import/src/main/java/eu/etaxonomy/cdm/io/plantglossary/PlantGlossaryCsvImport.java with 50% similarity]

index 7d28e7d26cc1423c2a8f7295fe3ed7e4aad57672..40258e632e336bb198aaf197eb0d9bdc3bbd3723 100644 (file)
@@ -1,8 +1,7 @@
 package eu.etaxonomy.cdm.io.plantglossary;
 
-import java.io.FileInputStream;
-import java.io.FileNotFoundException;
-import java.io.InputStreamReader;
+import java.io.IOException;
+import java.net.URI;
 
 import org.apache.log4j.Logger;
 
@@ -17,18 +16,18 @@ public class PlantGlossaryActivator {
        @SuppressWarnings("unused")
        private static final Logger logger = Logger.getLogger(PlantGlossaryActivator.class);
 
-       private void doImport(ICdmDataSource cdmDestination) throws FileNotFoundException{
+       private void doImport(ICdmDataSource cdmDestination) throws IOException{
 
            /*
             * Source file:
-            * https://github.com/biosemantics/glossaries/blob/925f2c1691ed00bf2b9a9cd7f83609cffae47145/Plant/0.11/Plant_glossary_term_category.csv
+            * extracted data from https://terms.tdwg.org
             *
-            * Cleaning data:
-            *  - remove all comments in csv file
-            *  - fix "coetaneouser" by adding missing paramater for "remarks" -> "active"
+            * Cleaning data with OpenRefine:
+            *  - generated URI column
+            *  - parsed term descriptions by crawling the term HTML pages (descriptions are not retrieved via the web interface)
             */
-        FileInputStream inStream = new FileInputStream("/home/pplitzner/plantglossary.csv");
-               PlantGlossaryCsvImportConfigurator config = PlantGlossaryCsvImportConfigurator.NewInstance(new InputStreamReader(inStream), cdmDestination);
+           URI uri = URI.create("file:/home/pplitzner/projects/Additivity/plant_glossary_states.csv");
+               PlantGlossaryCsvImportConfigurator config = PlantGlossaryCsvImportConfigurator.NewInstance(uri, cdmDestination);
                config.setCheck(CHECK.IMPORT_WITHOUT_CHECK);
                config.setDbSchemaValidation(DbSchemaValidation.VALIDATE);
 
@@ -42,9 +41,9 @@ public class PlantGlossaryActivator {
        public static void main(String[] args) {
                PlantGlossaryActivator activator = new PlantGlossaryActivator();
                try {
-               ICdmDataSource dataSource = CdmDestinations.makeDestination(DatabaseTypeEnum.MySQL, "127.0.0.1", "additivity", 3306, "root", null);
+               ICdmDataSource dataSource = CdmDestinations.makeDestination(DatabaseTypeEnum.MySQL, "127.0.0.1", "empty", 3306, "root", null);
             activator.doImport(dataSource);
-        } catch (FileNotFoundException e) {
+        } catch (IOException e) {
             e.printStackTrace();
         }
        }
diff --git a/app-import/src/main/java/eu/etaxonomy/cdm/io/plantglossary/PlantGlossaryCategoryImport.java b/app-import/src/main/java/eu/etaxonomy/cdm/io/plantglossary/PlantGlossaryCategoryImport.java
new file mode 100644 (file)
index 0000000..1c5a046
--- /dev/null
@@ -0,0 +1,75 @@
+/**
+* Copyright (C) 2017 EDIT
+* European Distributed Institute of Taxonomy
+* http://www.e-taxonomy.eu
+*
+* The contents of this file are subject to the Mozilla Public License Version 1.1
+* See LICENSE.TXT at the top of this package for the full license terms.
+*/
+package eu.etaxonomy.cdm.io.plantglossary;
+
+import java.io.File;
+import java.net.URI;
+import java.util.Map;
+
+import org.apache.log4j.Logger;
+import org.springframework.stereotype.Component;
+
+import eu.etaxonomy.cdm.common.CdmUtils;
+import eu.etaxonomy.cdm.io.csv.in.CsvImportBase;
+import eu.etaxonomy.cdm.model.common.Annotation;
+import eu.etaxonomy.cdm.model.common.AnnotationType;
+import eu.etaxonomy.cdm.model.common.IdentifiableSource;
+import eu.etaxonomy.cdm.model.common.Language;
+import eu.etaxonomy.cdm.model.common.OriginalSourceType;
+import eu.etaxonomy.cdm.model.common.TermType;
+import eu.etaxonomy.cdm.model.common.TermVocabulary;
+
+/**
+ *
+ * @author pplitzner
+ * @since Dec 7, 2018
+ *
+ */
+@Component
+public class PlantGlossaryCategoryImport extends CsvImportBase<PlantGlossaryCsvImportConfigurator, PlantGlossaryCsvImportState, File>{
+    private static final long serialVersionUID = -5600766240192189822L;
+    private static Logger logger = Logger.getLogger(PlantGlossaryCategoryImport.class);
+
+    @Override
+    protected void handleSingleLine(PlantGlossaryCsvImportState importState) {
+        final String HEADER_LABEL = "rdfs:label";
+        final String HEADER_DESCRIPTION = "skos:definition";
+        final String HEADER_URI = "category_URI";
+        final String HEADER_NOTES = "skos:notes";
+
+        Map<String, String> currentRecord = importState.getCurrentRecord();
+
+        String vocName = currentRecord.get(HEADER_LABEL);
+        if(CdmUtils.isBlank(vocName)){
+            // this line does not contain any vocabulary information
+            return;
+        }
+
+        TermVocabulary existingVocabulary = importState.checkVocabularies(vocName, getVocabularyService());
+        if(existingVocabulary!=null){
+            return;
+        }
+
+        TermVocabulary<?> stateVoc = TermVocabulary.NewInstance(
+                TermType.State,
+                currentRecord.get(HEADER_DESCRIPTION),
+                vocName,
+                null,
+                importState.getCitation().getUri());
+        stateVoc.setUri(URI.create(currentRecord.get(HEADER_URI)));
+        stateVoc.addAnnotation(Annotation.NewInstance(currentRecord.get(HEADER_NOTES), AnnotationType.EDITORIAL(), Language.ENGLISH()));
+
+        importState.addVocabulary(stateVoc);
+
+        stateVoc.addSource(IdentifiableSource.NewInstance(OriginalSourceType.Import, importState.getCitation().getTitle(), null, importState.getCitation(), null));
+
+        getVocabularyService().saveOrUpdate(stateVoc);
+    }
+
+}
\ No newline at end of file
index 4952cfb4d8bbcb945ea8063b5d70f4f816ba1961..1c6dd30c7ab3cfa457caad16a01e467b5fcf3c1b 100644 (file)
@@ -8,7 +8,8 @@
 */
 package eu.etaxonomy.cdm.io.plantglossary;
 
-import java.io.InputStreamReader;
+import java.io.IOException;
+import java.net.URI;
 
 import eu.etaxonomy.cdm.database.ICdmDataSource;
 import eu.etaxonomy.cdm.io.csv.in.CsvImportConfiguratorBase;
@@ -24,16 +25,16 @@ public class PlantGlossaryCsvImportConfigurator
 
     private static final long serialVersionUID = 987286481306951779L;
 
-    public static PlantGlossaryCsvImportConfigurator NewInstance(InputStreamReader file,
-            ICdmDataSource cdmDestination) {
-        return new PlantGlossaryCsvImportConfigurator(file, cdmDestination);
+    public static PlantGlossaryCsvImportConfigurator NewInstance(URI source,
+            ICdmDataSource cdmDestination) throws IOException {
+        return new PlantGlossaryCsvImportConfigurator(source, cdmDestination);
     }
 
 // ****************** CONSTRUCTOR *****************************/
 
-    private PlantGlossaryCsvImportConfigurator(InputStreamReader file,
-            ICdmDataSource cdmDestination){
-        super(file, cdmDestination, null);
+    private PlantGlossaryCsvImportConfigurator(URI source,
+            ICdmDataSource cdmDestination) throws IOException{
+        super(source, cdmDestination, null);
     }
 
 // *************************************
@@ -41,10 +42,10 @@ public class PlantGlossaryCsvImportConfigurator
 
     @Override
     @SuppressWarnings("unchecked")
-    protected void makeIoClassList(){
-        ioClassList = new Class[]{
-            PlantGlossaryCsvImport.class,
-        };
+    protected void makeIoClassList() {
+        ioClassList = new Class[] {
+                PlantGlossaryCategoryImport.class,
+                PlantGlossaryStateImport.class };
     }
 
     @Override
index 2bc44e383bd5bc1de2f64a30f3e1458971adc7e4..c8b26e40b6b37070c1bdaadc2309369f0cbab4b7 100644 (file)
@@ -10,14 +10,20 @@ package eu.etaxonomy.cdm.io.plantglossary;
 
 import java.net.URI;
 import java.net.URISyntaxException;
+import java.util.ArrayList;
 import java.util.HashSet;
+import java.util.List;
 import java.util.Set;
 
+import eu.etaxonomy.cdm.api.service.ITermService;
+import eu.etaxonomy.cdm.api.service.IVocabularyService;
 import eu.etaxonomy.cdm.io.csv.in.CsvImportState;
 import eu.etaxonomy.cdm.model.agent.Institution;
 import eu.etaxonomy.cdm.model.agent.Person;
+import eu.etaxonomy.cdm.model.agent.Team;
 import eu.etaxonomy.cdm.model.common.TermVocabulary;
 import eu.etaxonomy.cdm.model.common.VerbatimTimePeriod;
+import eu.etaxonomy.cdm.model.description.State;
 import eu.etaxonomy.cdm.model.reference.Reference;
 import eu.etaxonomy.cdm.model.reference.ReferenceFactory;
 
@@ -29,6 +35,8 @@ import eu.etaxonomy.cdm.model.reference.ReferenceFactory;
  */
 public class PlantGlossaryCsvImportState extends CsvImportState<PlantGlossaryCsvImportConfigurator> {
 
+    private List<TermVocabulary> existingVocabularies = new ArrayList<>();
+    private List<State> existingTerms = new ArrayList<>();
     private Set<TermVocabulary> vocabularies = new HashSet<>();
     private final Reference citation;
 
@@ -36,20 +44,25 @@ public class PlantGlossaryCsvImportState extends CsvImportState<PlantGlossaryCsv
     protected PlantGlossaryCsvImportState(PlantGlossaryCsvImportConfigurator config) {
         super(config);
         citation = ReferenceFactory.newGeneric();
-        citation.setTitle("fna_gloss_final_20130517");
-        Person authorship = Person.NewInstance(null, "Cui", null, "Hong");
-        citation.setAuthorship(authorship);
+        citation.setTitle("FloraTerms");
+        Team team = Team.NewInstance();
+        team.addTeamMember(Person.NewInstance(null, "Cui", null, "Hong"));
+        team.addTeamMember(Person.NewInstance(null, "Cole", null, "Heather"));
+        team.addTeamMember(Person.NewInstance(null, "Endara", null, "Lorena"));
+        team.addTeamMember(Person.NewInstance(null, "Macklin", null, "James"));
+        team.addTeamMember(Person.NewInstance(null, "Sachs", null, "Joel"));
+        citation.setAuthorship(team);
         VerbatimTimePeriod datePublished = VerbatimTimePeriod.NewVerbatimInstance();
         datePublished.setStartYear(2014);
         datePublished.setStartMonth(6);
         datePublished.setStartDay(13);
         citation.setDatePublished(datePublished);
         Institution institution = Institution.NewNamedInstance("OTO System");
+        institution.addUrl(URI.create("http://biosemantics.arizona.edu/OTO/"));
         citation.setInstitution(institution);
-        citation.setEdition("Version: 0.11");
         URI uri;
         try {
-            uri = new URI("https://github.com/biosemantics/glossaries/blob/925f2c1691ed00bf2b9a9cd7f83609cffae47145/Plant/0.11/Plant_glossary_term_category.csv");
+            uri = new URI("https://terms.tdwg.org/wiki/FloraTerms");
             citation.setUri(uri);
         } catch (URISyntaxException e) {
         }
@@ -64,7 +77,10 @@ public class PlantGlossaryCsvImportState extends CsvImportState<PlantGlossaryCsv
         vocabularies.add(vocabulary);
     }
 
-    TermVocabulary checkVocabularies(String vocName){
+    TermVocabulary checkVocabularies(String vocName, IVocabularyService vocabularyService){
+        if(existingVocabularies.isEmpty()){
+            existingVocabularies = vocabularyService.list(TermVocabulary.class, null, null, null, null);
+        }
         for (TermVocabulary termVocabulary : vocabularies) {
             if(termVocabulary.getLabel().equals(vocName)){
                 return termVocabulary;
@@ -73,6 +89,13 @@ public class PlantGlossaryCsvImportState extends CsvImportState<PlantGlossaryCsv
         return null;
     }
 
+    public boolean isTermPresent(String termName, ITermService termService) {
+        if(existingTerms.isEmpty()){
+            existingTerms = termService.list(State.class, null, null, null, null);
+        }
+        return existingTerms.stream().map(term->term.getLabel()).anyMatch(label->label.equals(termName));
+    }
+
     Reference getCitation() {
         return citation;
     }
similarity index 50%
rename from app-import/src/main/java/eu/etaxonomy/cdm/io/plantglossary/PlantGlossaryCsvImport.java
rename to app-import/src/main/java/eu/etaxonomy/cdm/io/plantglossary/PlantGlossaryStateImport.java
index e78769d1374c1b911e34ae87e20b6100b4f74943..78b8867e55ba62fbd02ec90891e084eeed9061d2 100644 (file)
@@ -9,16 +9,18 @@
 package eu.etaxonomy.cdm.io.plantglossary;
 
 import java.io.File;
+import java.net.URI;
 import java.util.Map;
-import java.util.UUID;
 
 import org.apache.log4j.Logger;
 import org.springframework.stereotype.Component;
 
 import eu.etaxonomy.cdm.io.csv.in.CsvImportBase;
+import eu.etaxonomy.cdm.model.common.Annotation;
+import eu.etaxonomy.cdm.model.common.AnnotationType;
 import eu.etaxonomy.cdm.model.common.IdentifiableSource;
+import eu.etaxonomy.cdm.model.common.Language;
 import eu.etaxonomy.cdm.model.common.OriginalSourceType;
-import eu.etaxonomy.cdm.model.common.TermType;
 import eu.etaxonomy.cdm.model.common.TermVocabulary;
 import eu.etaxonomy.cdm.model.description.State;
 
@@ -29,40 +31,45 @@ import eu.etaxonomy.cdm.model.description.State;
  *
  */
 @Component
-public class PlantGlossaryCsvImport extends CsvImportBase<PlantGlossaryCsvImportConfigurator, PlantGlossaryCsvImportState, File>{
+public class PlantGlossaryStateImport extends CsvImportBase<PlantGlossaryCsvImportConfigurator, PlantGlossaryCsvImportState, File>{
     private static final long serialVersionUID = -5600766240192189822L;
-    private static Logger logger = Logger.getLogger(PlantGlossaryCsvImport.class);
+    private static Logger logger = Logger.getLogger(PlantGlossaryStateImport.class);
 
+    final String HEADER_LABEL = "dcterms:identifier";
+    final String HEADER_DEFINITION = "definition";
+    final String HEADER_CATEGORY = "vann:termGroup";
+    final String HEADER_NOTES = "skos:example";
+    final String SOURCE_HEADER = "sourceDataset";
+    final String HEADER_URI = "term_URI";
 
     @Override
     protected void handleSingleLine(PlantGlossaryCsvImportState importState) {
-        final String TERM_HEADER = "term";
-        final String CATEGORY_HEADER = "category";
-        final String HAS_SYN_HEADER = "hasSyn";
-        final String SOURCE_HEADER = "sourceDataset";
-        final String TERM_ID_HEADER = "termID";
-        final String REMARK_HEADER = "remarks";
 
         Map<String, String> currentRecord = importState.getCurrentRecord();
-        if(!currentRecord.get(REMARK_HEADER).equals("active")){
-            String message = String.format(
-                    "Line %s has obsolete data and was skipped", importState.getLine());
-            logger.info(message);
+
+        String termLabel = currentRecord.get(HEADER_LABEL);
+        //check if already present
+        if(importState.isTermPresent(termLabel, getTermService())){
             return;
         }
 
-        State stateTerm = State.NewInstance(null, currentRecord.get(TERM_HEADER), null);
-        stateTerm.setUuid(UUID.fromString(currentRecord.get(TERM_ID_HEADER)));
+        State stateTerm = State.NewInstance(currentRecord.get(HEADER_DEFINITION), termLabel, null);
+        stateTerm.setUri(URI.create(currentRecord.get(HEADER_URI)));
+        stateTerm.addAnnotation(Annotation.NewInstance(currentRecord.get(HEADER_NOTES), AnnotationType.EDITORIAL(), Language.ENGLISH()));
 
-        String vocName = currentRecord.get(CATEGORY_HEADER);
-        TermVocabulary vocabulary = importState.checkVocabularies(vocName);
+        String vocName = currentRecord.get(HEADER_CATEGORY);
+        // TODO how should we handle multiple possible categories?
+        // for now we just take the first one
+        if(vocName.contains(",")){
+            vocName = vocName.split(",")[0];
+        }
+        TermVocabulary vocabulary = importState.checkVocabularies(vocName, getVocabularyService());
         if(vocabulary==null){
-            vocabulary = TermVocabulary.NewInstance(TermType.State, null, vocName, null, null);
-            importState.addVocabulary(vocabulary);
+            logger.error("No vocabulary found for term: "+stateTerm+" with vocName: "+vocName);
+            return;
         }
         vocabulary.addTerm(stateTerm);
 
-
         stateTerm.addSource(IdentifiableSource.NewInstance(OriginalSourceType.Import, importState.getCitation().getTitle(), null, importState.getCitation(), null));
 
         getVocabularyService().saveOrUpdate(vocabulary);