package eu.etaxonomy.cdm.io.plantglossary;
-import java.io.FileInputStream;
-import java.io.FileNotFoundException;
-import java.io.InputStreamReader;
+import java.io.IOException;
+import java.net.URI;
import org.apache.log4j.Logger;
@SuppressWarnings("unused")
private static final Logger logger = Logger.getLogger(PlantGlossaryActivator.class);
- private void doImport(ICdmDataSource cdmDestination) throws FileNotFoundException{
+ private void doImport(ICdmDataSource cdmDestination) throws IOException{
/*
* Source file:
- * https://github.com/biosemantics/glossaries/blob/925f2c1691ed00bf2b9a9cd7f83609cffae47145/Plant/0.11/Plant_glossary_term_category.csv
+ * extracted data from https://terms.tdwg.org
*
- * Cleaning data:
- * - remove all comments in csv file
- * - fix "coetaneouser" by adding missing paramater for "remarks" -> "active"
+ * Cleaning data with OpenRefine:
+ * - generated URI column
+ * - parsed term descriptions by crawling the term HTML pages (descriptions are not retrievable via the web interface)
*/
- FileInputStream inStream = new FileInputStream("/home/pplitzner/plantglossary.csv");
- PlantGlossaryCsvImportConfigurator config = PlantGlossaryCsvImportConfigurator.NewInstance(new InputStreamReader(inStream), cdmDestination);
+ URI uri = URI.create("file:/home/pplitzner/projects/Additivity/plant_glossary_states.csv");
+ PlantGlossaryCsvImportConfigurator config = PlantGlossaryCsvImportConfigurator.NewInstance(uri, cdmDestination);
config.setCheck(CHECK.IMPORT_WITHOUT_CHECK);
config.setDbSchemaValidation(DbSchemaValidation.VALIDATE);
public static void main(String[] args) {
PlantGlossaryActivator activator = new PlantGlossaryActivator();
try {
- ICdmDataSource dataSource = CdmDestinations.makeDestination(DatabaseTypeEnum.MySQL, "127.0.0.1", "additivity", 3306, "root", null);
+ ICdmDataSource dataSource = CdmDestinations.makeDestination(DatabaseTypeEnum.MySQL, "127.0.0.1", "empty", 3306, "root", null);
activator.doImport(dataSource);
- } catch (FileNotFoundException e) {
+ } catch (IOException e) {
e.printStackTrace();
}
}
--- /dev/null
+/**
+* Copyright (C) 2017 EDIT
+* European Distributed Institute of Taxonomy
+* http://www.e-taxonomy.eu
+*
+* The contents of this file are subject to the Mozilla Public License Version 1.1
+* See LICENSE.TXT at the top of this package for the full license terms.
+*/
+package eu.etaxonomy.cdm.io.plantglossary;
+
+import java.io.File;
+import java.net.URI;
+import java.util.Map;
+
+import org.apache.log4j.Logger;
+import org.springframework.stereotype.Component;
+
+import eu.etaxonomy.cdm.common.CdmUtils;
+import eu.etaxonomy.cdm.io.csv.in.CsvImportBase;
+import eu.etaxonomy.cdm.model.common.Annotation;
+import eu.etaxonomy.cdm.model.common.AnnotationType;
+import eu.etaxonomy.cdm.model.common.IdentifiableSource;
+import eu.etaxonomy.cdm.model.common.Language;
+import eu.etaxonomy.cdm.model.common.OriginalSourceType;
+import eu.etaxonomy.cdm.model.common.TermType;
+import eu.etaxonomy.cdm.model.common.TermVocabulary;
+
+/**
+ * Imports plant glossary term categories (one CSV row per category) as
+ * {@link TermVocabulary} instances of {@link TermType#State}.
+ *
+ * @author pplitzner
+ * @since Dec 7, 2018
+ *
+ */
+@Component
+public class PlantGlossaryCategoryImport extends CsvImportBase<PlantGlossaryCsvImportConfigurator, PlantGlossaryCsvImportState, File>{
+    private static final long serialVersionUID = -5600766240192189822L;
+    private static Logger logger = Logger.getLogger(PlantGlossaryCategoryImport.class);
+
+    /**
+     * Handles one CSV row: creates a state vocabulary named after the
+     * rdfs:label column, annotated with skos:notes, and sourced from the
+     * import citation. Rows with a blank label and already-known
+     * vocabularies are skipped.
+     */
+    @Override
+    protected void handleSingleLine(PlantGlossaryCsvImportState importState) {
+        // CSV column names as produced by the OpenRefine extraction of terms.tdwg.org
+        final String HEADER_LABEL = "rdfs:label";
+        final String HEADER_DESCRIPTION = "skos:definition";
+        final String HEADER_URI = "category_URI";
+        final String HEADER_NOTES = "skos:notes";
+
+        Map<String, String> currentRecord = importState.getCurrentRecord();
+
+        String vocName = currentRecord.get(HEADER_LABEL);
+        if(CdmUtils.isBlank(vocName)){
+            // this line does not contain any vocabulary information
+            return;
+        }
+
+        // skip vocabularies that were already created during this import run
+        TermVocabulary existingVocabulary = importState.checkVocabularies(vocName, getVocabularyService());
+        if(existingVocabulary!=null){
+            return;
+        }
+
+        TermVocabulary<?> stateVoc = TermVocabulary.NewInstance(
+                TermType.State,
+                currentRecord.get(HEADER_DESCRIPTION),
+                vocName,
+                null,
+                importState.getCitation().getUri());
+        // NOTE(review): URI.create throws if category_URI is missing/malformed — confirm column is always populated
+        stateVoc.setUri(URI.create(currentRecord.get(HEADER_URI)));
+        // skos:notes is stored as an English editorial annotation on the vocabulary
+        stateVoc.addAnnotation(Annotation.NewInstance(currentRecord.get(HEADER_NOTES), AnnotationType.EDITORIAL(), Language.ENGLISH()));
+
+        // register in the import state so later rows (and the state import) can find it
+        importState.addVocabulary(stateVoc);
+
+        // record the import citation (FloraTerms reference) as the data source
+        stateVoc.addSource(IdentifiableSource.NewInstance(OriginalSourceType.Import, importState.getCitation().getTitle(), null, importState.getCitation(), null));
+
+        getVocabularyService().saveOrUpdate(stateVoc);
+    }
+
+}
\ No newline at end of file
*/
package eu.etaxonomy.cdm.io.plantglossary;
-import java.io.InputStreamReader;
+import java.io.IOException;
+import java.net.URI;
import eu.etaxonomy.cdm.database.ICdmDataSource;
import eu.etaxonomy.cdm.io.csv.in.CsvImportConfiguratorBase;
private static final long serialVersionUID = 987286481306951779L;
- public static PlantGlossaryCsvImportConfigurator NewInstance(InputStreamReader file,
- ICdmDataSource cdmDestination) {
- return new PlantGlossaryCsvImportConfigurator(file, cdmDestination);
+ public static PlantGlossaryCsvImportConfigurator NewInstance(URI source,
+ ICdmDataSource cdmDestination) throws IOException {
+ return new PlantGlossaryCsvImportConfigurator(source, cdmDestination);
}
// ****************** CONSTRUCTOR *****************************/
- private PlantGlossaryCsvImportConfigurator(InputStreamReader file,
- ICdmDataSource cdmDestination){
- super(file, cdmDestination, null);
+ private PlantGlossaryCsvImportConfigurator(URI source,
+ ICdmDataSource cdmDestination) throws IOException{
+ super(source, cdmDestination, null);
}
// *************************************
@Override
@SuppressWarnings("unchecked")
- protected void makeIoClassList(){
- ioClassList = new Class[]{
- PlantGlossaryCsvImport.class,
- };
+ protected void makeIoClassList() {
+ ioClassList = new Class[] {
+ PlantGlossaryCategoryImport.class,
+ PlantGlossaryStateImport.class };
}
@Override
import java.net.URI;
import java.net.URISyntaxException;
+import java.util.ArrayList;
import java.util.HashSet;
+import java.util.List;
import java.util.Set;
+import eu.etaxonomy.cdm.api.service.ITermService;
+import eu.etaxonomy.cdm.api.service.IVocabularyService;
import eu.etaxonomy.cdm.io.csv.in.CsvImportState;
import eu.etaxonomy.cdm.model.agent.Institution;
import eu.etaxonomy.cdm.model.agent.Person;
+import eu.etaxonomy.cdm.model.agent.Team;
import eu.etaxonomy.cdm.model.common.TermVocabulary;
import eu.etaxonomy.cdm.model.common.VerbatimTimePeriod;
+import eu.etaxonomy.cdm.model.description.State;
import eu.etaxonomy.cdm.model.reference.Reference;
import eu.etaxonomy.cdm.model.reference.ReferenceFactory;
*/
public class PlantGlossaryCsvImportState extends CsvImportState<PlantGlossaryCsvImportConfigurator> {
+ private List<TermVocabulary> existingVocabularies = new ArrayList<>();
+ private List<State> existingTerms = new ArrayList<>();
private Set<TermVocabulary> vocabularies = new HashSet<>();
private final Reference citation;
protected PlantGlossaryCsvImportState(PlantGlossaryCsvImportConfigurator config) {
super(config);
citation = ReferenceFactory.newGeneric();
- citation.setTitle("fna_gloss_final_20130517");
- Person authorship = Person.NewInstance(null, "Cui", null, "Hong");
- citation.setAuthorship(authorship);
+ citation.setTitle("FloraTerms");
+ Team team = Team.NewInstance();
+ team.addTeamMember(Person.NewInstance(null, "Cui", null, "Hong"));
+ team.addTeamMember(Person.NewInstance(null, "Cole", null, "Heather"));
+ team.addTeamMember(Person.NewInstance(null, "Endara", null, "Lorena"));
+ team.addTeamMember(Person.NewInstance(null, "Macklin", null, "James"));
+ team.addTeamMember(Person.NewInstance(null, "Sachs", null, "Joel"));
+ citation.setAuthorship(team);
VerbatimTimePeriod datePublished = VerbatimTimePeriod.NewVerbatimInstance();
datePublished.setStartYear(2014);
datePublished.setStartMonth(6);
datePublished.setStartDay(13);
citation.setDatePublished(datePublished);
Institution institution = Institution.NewNamedInstance("OTO System");
+ institution.addUrl(URI.create("http://biosemantics.arizona.edu/OTO/"));
citation.setInstitution(institution);
- citation.setEdition("Version: 0.11");
URI uri;
try {
- uri = new URI("https://github.com/biosemantics/glossaries/blob/925f2c1691ed00bf2b9a9cd7f83609cffae47145/Plant/0.11/Plant_glossary_term_category.csv");
+ uri = new URI("https://terms.tdwg.org/wiki/FloraTerms");
citation.setUri(uri);
} catch (URISyntaxException e) {
}
vocabularies.add(vocabulary);
}
- TermVocabulary checkVocabularies(String vocName){
+ TermVocabulary checkVocabularies(String vocName, IVocabularyService vocabularyService){
+ if(existingVocabularies.isEmpty()){
+ existingVocabularies = vocabularyService.list(TermVocabulary.class, null, null, null, null);
+ }
for (TermVocabulary termVocabulary : vocabularies) {
if(termVocabulary.getLabel().equals(vocName)){
return termVocabulary;
return null;
}
+    /**
+     * Returns <code>true</code> if a {@link State} term with the given label
+     * already exists in the database. The existing terms are fetched lazily
+     * and cached on first use.
+     *
+     * @param termName the term label to look for (must not be <code>null</code>)
+     * @param termService service used to load the existing terms
+     * @return <code>true</code> if a state with that label is already present
+     */
+    public boolean isTermPresent(String termName, ITermService termService) {
+        if(existingTerms.isEmpty()){
+            // lazy one-time fetch; NOTE(review): a truly empty database is re-queried on every call
+            existingTerms = termService.list(State.class, null, null, null, null);
+        }
+        // compare via termName.equals(...) so persisted terms with a null label cannot cause an NPE
+        return existingTerms.stream().map(term->term.getLabel()).anyMatch(label->termName.equals(label));
+    }
+
Reference getCitation() {
return citation;
}
package eu.etaxonomy.cdm.io.plantglossary;
import java.io.File;
+import java.net.URI;
import java.util.Map;
-import java.util.UUID;
import org.apache.log4j.Logger;
import org.springframework.stereotype.Component;
import eu.etaxonomy.cdm.io.csv.in.CsvImportBase;
+import eu.etaxonomy.cdm.model.common.Annotation;
+import eu.etaxonomy.cdm.model.common.AnnotationType;
import eu.etaxonomy.cdm.model.common.IdentifiableSource;
+import eu.etaxonomy.cdm.model.common.Language;
import eu.etaxonomy.cdm.model.common.OriginalSourceType;
-import eu.etaxonomy.cdm.model.common.TermType;
import eu.etaxonomy.cdm.model.common.TermVocabulary;
import eu.etaxonomy.cdm.model.description.State;
*
*/
@Component
-public class PlantGlossaryCsvImport extends CsvImportBase<PlantGlossaryCsvImportConfigurator, PlantGlossaryCsvImportState, File>{
+public class PlantGlossaryStateImport extends CsvImportBase<PlantGlossaryCsvImportConfigurator, PlantGlossaryCsvImportState, File>{
private static final long serialVersionUID = -5600766240192189822L;
- private static Logger logger = Logger.getLogger(PlantGlossaryCsvImport.class);
+ private static Logger logger = Logger.getLogger(PlantGlossaryStateImport.class);
+ final String HEADER_LABEL = "dcterms:identifier";
+ final String HEADER_DEFINITION = "definition";
+ final String HEADER_CATEGORY = "vann:termGroup";
+ final String HEADER_NOTES = "skos:example";
+ final String SOURCE_HEADER = "sourceDataset";
+ final String HEADER_URI = "term_URI";
@Override
protected void handleSingleLine(PlantGlossaryCsvImportState importState) {
- final String TERM_HEADER = "term";
- final String CATEGORY_HEADER = "category";
- final String HAS_SYN_HEADER = "hasSyn";
- final String SOURCE_HEADER = "sourceDataset";
- final String TERM_ID_HEADER = "termID";
- final String REMARK_HEADER = "remarks";
Map<String, String> currentRecord = importState.getCurrentRecord();
- if(!currentRecord.get(REMARK_HEADER).equals("active")){
- String message = String.format(
- "Line %s has obsolete data and was skipped", importState.getLine());
- logger.info(message);
+
+ String termLabel = currentRecord.get(HEADER_LABEL);
+ //check if already present
+ if(importState.isTermPresent(termLabel, getTermService())){
return;
}
- State stateTerm = State.NewInstance(null, currentRecord.get(TERM_HEADER), null);
- stateTerm.setUuid(UUID.fromString(currentRecord.get(TERM_ID_HEADER)));
+ State stateTerm = State.NewInstance(currentRecord.get(HEADER_DEFINITION), termLabel, null);
+ stateTerm.setUri(URI.create(currentRecord.get(HEADER_URI)));
+ stateTerm.addAnnotation(Annotation.NewInstance(currentRecord.get(HEADER_NOTES), AnnotationType.EDITORIAL(), Language.ENGLISH()));
- String vocName = currentRecord.get(CATEGORY_HEADER);
- TermVocabulary vocabulary = importState.checkVocabularies(vocName);
+ String vocName = currentRecord.get(HEADER_CATEGORY);
+ // TODO how should we handle multiple possible categories?
+ // for now we just take the first one
+ if(vocName.contains(",")){
+ vocName = vocName.split(",")[0];
+ }
+ TermVocabulary vocabulary = importState.checkVocabularies(vocName, getVocabularyService());
if(vocabulary==null){
- vocabulary = TermVocabulary.NewInstance(TermType.State, null, vocName, null, null);
- importState.addVocabulary(vocabulary);
+ logger.error("No vocabulary found for term: "+stateTerm+" with vocName: "+vocName);
+ return;
}
vocabulary.addTerm(stateTerm);
-
stateTerm.addSource(IdentifiableSource.NewInstance(OriginalSourceType.Import, importState.getCitation().getTitle(), null, importState.getCitation(), null));
getVocabularyService().saveOrUpdate(vocabulary);