From: Andreas Müller Date: Thu, 5 Jun 2014 14:03:27 +0000 (+0000) Subject: latest changes to TaxonX import X-Git-Url: https://dev.e-taxonomy.eu/gitweb/cdmlib-apps.git/commitdiff_plain/3760b5bd011be77248352cc8dec28eb7f24b27bc latest changes to TaxonX import --- diff --git a/app-import/src/main/java/eu/etaxonomy/cdm/app/proibiosphere/TaxonXImportLauncher.java b/app-import/src/main/java/eu/etaxonomy/cdm/app/proibiosphere/TaxonXImportLauncher.java index be7b6fa0..e2f8fed5 100644 --- a/app-import/src/main/java/eu/etaxonomy/cdm/app/proibiosphere/TaxonXImportLauncher.java +++ b/app-import/src/main/java/eu/etaxonomy/cdm/app/proibiosphere/TaxonXImportLauncher.java @@ -48,35 +48,42 @@ public class TaxonXImportLauncher { //database validation status (create, update, validate ...) static DbSchemaValidation hbm2dll = DbSchemaValidation.CREATE; +// static final ICdmDataSource cdmDestination = CdmDestinations.cdm_test_local_mysql(); static final ICdmDataSource cdmDestination = CdmDestinations.localH2(); - +// static final ICdmDataSource cdmDestination = CdmDestinations.cdm_test_local_mysql_test(); + static final CHECK check = CHECK.IMPORT_WITHOUT_CHECK; + + private enum FilterType{MODS, TAXON}; static String plaziUrl = "http://plazi.cs.umb.edu/GgServer/search?taxonomicName.isNomenclature=true&taxonomicName.exactMatch=true&indexName=0&subIndexName=taxonomicName&subIndexName=MODS&minSubResultSize=1&searchMode=index&resultFormat=xml&xsltUrl=http%3A%2F%2Fplazi.cs.umb.edu%2FGgServer%2Fresources%2FsrsWebPortalData%2FCdmSyncTreatmentList.xslt&taxonomicName.taxonomicName="; static String plaziUrlDoc = "http://plazi.cs.umb.edu/GgServer/search?taxonomicName.isNomenclature=true&taxonomicName.exactMatch=true&indexName=0&subIndexName=taxonomicName&subIndexName=MODS&minSubResultSize=1&searchMode=index&resultFormat=xml&xsltUrl=http%3A%2F%2Fplazi.cs.umb.edu%2FGgServer%2Fresources%2FsrsWebPortalData%2FCdmSyncTreatmentList.xslt&MODS.ModsDocID="; - private static String askQuestion(String question){ - Scanner scan = new Scanner(System.in); - System.out.println(question); - String index = scan.nextLine(); - return index; - } public static void main(String[] args) { -// String[] taxonList = new String[] {"Eupolybothrus","Polybothrus"}; - /*ants*/ String[] modsList = new String[] {"3924", "3743", "4375", "6757", "6752", "3481", "21401_fisher_smith_plos_2008", "2592", "4096", "6877", "6192", "8071"}; + String[] spiderModsList = new String[] {"zt03768p138","zt03750p196","zt03666p193","zt03664p068","zt03646p592","zt03507p056","zt03415p057","zt03383p038","zt03305p052","zt03228p068","zt03131p034","zt02963p068","zt02883p068","zt02814p018","zt02739p050","zt02730p043","zt02637p054","zt02593p127","zt02551p068","zt02534p036","zt02526p053","zt02427p035","zt02361p012","zt02267p068","zt02223p047","zt01826p058","zt01775p024","zt01744p040","zt01529p060","zt01004p028","zt00904","zt00872","zt00619","zt00109","DippenaarSchoeman1989Penestominae","Simon1902Cribellates","Simon1903Penestominae","Lehtinen1967CribellatePenestominae"}; + + String[] taxonList = new String[] {"Comaroma"}; //{"Eupolybothrus","Polybothrus"}, Chenopodium, Lactarius, Campylopus, Nephrolepis, Comaroma (spiders) +// /*ants Anochetus*/ String[] modsList = new String[] {"3924" /*, "3743", "4375", "6757", "6752", "3481", "21401_fisher_smith_plos_2008", "2592", "4096", "6877", "6192", "8071" */}; // String[] modsList = new String[] {"21367", "21365", "8171", "6877", "21820", "3641", "6757"}; -// debut="3743", "3628", "4022", "3994", "3603", "8070", "4001", "4071", "3948", "3481"}; +// /*auch ants*/ debut="3743", "3628", "4022", "3994", "3603", "8070", "4001", "4071", "3948", "3481"}; // suite: , };//,"3540555099"}; // modsList = new String[] {"Zapparoli-1986-Eupolybothrus-fasciatus"}; - String tnomenclature = "ICZN"; + taxonList = spiderModsList; + + FilterType filterType = FilterType.MODS; + + NomenclaturalCode tnomenclature = NomenclaturalCode.ICZN; - String defaultClassif="Ants"; + String defaultClassification="Spiders"; + boolean alwaysUseDefaultClassification = true; - Map> documents = new HashMap>(); - HashMap>documentMap = new HashMap>(); + + + + Map>documentMap = new HashMap>(); /*HOW TO HANDLE SECUNDUM REFERENCE*/ boolean reuseSecundum = askIfReuseSecundum(); @@ -85,34 +92,33 @@ public class TaxonXImportLauncher { secundum = askForSecundum(); } -// checkTreatmentPresence("taxon",taxonList, documents,documentMap); - checkTreatmentPresence("modsid",modsList, documents,documentMap); + loadTreatmentIfPresent(filterType,taxonList, documentMap); +// loadTreatmentIfPresent(FilterType.MODS,modsList, documents,documentMap); TaxonXImportConfigurator taxonxImportConfigurator =null; CdmDefaultImport taxonImport = new CdmDefaultImport(); ICdmDataSource destination = cdmDestination; - taxonxImportConfigurator = prepareTaxonXImport(destination,reuseSecundum, secundum); + taxonxImportConfigurator = prepareTaxonXImport(destination,reuseSecundum, secundum, tnomenclature, alwaysUseDefaultClassification); - taxonxImportConfigurator.setImportClassificationName(defaultClassif); + taxonxImportConfigurator.setImportClassificationName(defaultClassification); log.info("Start import from TaxonX Data"); taxonxImportConfigurator.setLastImport(false); int j=0; - for (String document:documentMap.keySet()){ + for (String document : documentMap.keySet()){ j++; if (doImportDocument(document, documentMap.get(document).size())){ int i=0; - for (URI source:documentMap.get(document)){ + for (URI source: documentMap.get(document)){ System.out.println("START "+document+" "+i+" ("+(documentMap.get(document)).size()+"): "+source.getPath()); i++; if (j==documentMap.keySet().size() && i==documentMap.get(document).size()) { taxonxImportConfigurator.setLastImport(true); } - prepareReferenceAndSource(taxonxImportConfigurator,source); - prepareNomenclature(taxonxImportConfigurator,tnomenclature); - // taxonxImportConfigurator.setTaxonReference(null); + prepareReferenceAndSource(taxonxImportConfigurator,source); + // taxonxImportConfigurator.setTaxonReference(null); taxonImport.invoke(taxonxImportConfigurator); log.info("End import from SpecimenData ("+ source.toString() + ")..."); @@ -128,27 +134,11 @@ public class TaxonXImportLauncher { } - - /** - * @param taxonxImportConfigurator - * @param tnomenclature - */ - private static void prepareNomenclature(TaxonXImportConfigurator taxonxImportConfigurator, String tnomenclature) { - // String tnomenclature = askQuestion("ICBN or ICZN ?"); - taxonxImportConfigurator.setNomenclaturalCode(NomenclaturalCode.ICNAFP); - if (tnomenclature.equalsIgnoreCase("ICBN")) { - taxonxImportConfigurator.setNomenclaturalCode(NomenclaturalCode.ICNAFP); - // taxonxImportConfigurator.setClassificationName("Chenopodiaceae"); - } - if(tnomenclature.equalsIgnoreCase("ICZN")){ - taxonxImportConfigurator.setNomenclaturalCode(NomenclaturalCode.ICZN); - // taxonxImportConfigurator.setClassificationName("Ants"); - } - if(tnomenclature.equalsIgnoreCase("ICNB")){ - taxonxImportConfigurator.setNomenclaturalCode(NomenclaturalCode.ICNB); - // taxonxImportConfigurator.setClassificationName("Bacteria"); - } - + private static String askQuestion(String question){ + Scanner scan = new Scanner(System.in); + System.out.println(question); + String index = scan.nextLine(); + return index; } /** @@ -162,8 +152,7 @@ public class TaxonXImportLauncher { String tref="PLAZI - "+source.getPath().split("/")[source.getPath().split("/").length-1]; reference.setTitleCache(tref,true); reference.setTitle(tref); - reference.generateTitle(); - + taxonxImportConfigurator.setSourceReference(reference); TaxonXImportConfigurator.setSourceRef(reference); @@ -181,122 +170,57 @@ public class TaxonXImportLauncher { * @param destination * @param reuseSecundum * @param secundum + * @param tnomenclature + * @param alwaysUseDefaultClassification * @return */ - private static TaxonXImportConfigurator prepareTaxonXImport(ICdmDataSource destination, boolean reuseSecundum, Reference secundum) { + private static TaxonXImportConfigurator prepareTaxonXImport(ICdmDataSource destination, boolean reuseSecundum, Reference secundum, NomenclaturalCode tnomenclature, boolean alwaysUseDefaultClassification) { TaxonXImportConfigurator taxonxImportConfigurator = TaxonXImportConfigurator.NewInstance(destination); - // taxonxImportConfigurator.setClassificationName(taxonxImportConfigurator.getSourceReferenceTitle()); + //taxonxImportConfigurator.setClassificationName(taxonxImportConfigurator.getSourceReferenceTitle()); taxonxImportConfigurator.setCheck(check); taxonxImportConfigurator.setDbSchemaValidation(hbm2dll); taxonxImportConfigurator.setDoAutomaticParsing(true); taxonxImportConfigurator.setInteractWithUser(true); + taxonxImportConfigurator.setNomenclaturalCode(tnomenclature); + taxonxImportConfigurator.setAlwaysUseDefaultClassification(alwaysUseDefaultClassification); taxonxImportConfigurator.setKeepOriginalSecundum(reuseSecundum); if (!reuseSecundum) { taxonxImportConfigurator.setSecundum(secundum); } - // taxonxImportConfigurator.setDoMatchTaxa(true); - // taxonxImportConfigurator.setReUseTaxon(true); + //taxonxImportConfigurator.setDoMatchTaxa(true); + // taxonxImportConfigurator.setReUseTaxon(true); return taxonxImportConfigurator; } /** - * @param importFilter + * @param filterType * @param modsList * @param documents * @param documentMap * @return */ - private static HashMap> checkTreatmentPresence(String importFilter, String[] modsList, Map> documents, HashMap> documentMap) { - URL plaziURL; - // System.out.println(plaziUrl); + private static Map> loadTreatmentIfPresent(FilterType filterType, String[] filterList, Map> documentMap) { - Map> docs = new HashMap>(); + Map> docs = new HashMap>(); try { - BufferedReader in=null; List docList; String inputLine; - String docID; - String pageStart; - String pageEnd; - String taxon; - String link; String urlstr=""; - for(String modsID : modsList){ - // plaziUrl=plaziUrl+"Eupolybothrus"; - if (importFilter.equalsIgnoreCase("modsid")) { - urlstr=plaziUrlDoc+modsID; - } - if (importFilter.equalsIgnoreCase("taxon")) { - urlstr=plaziUrl+modsID; - } -// System.out.println(url); - - plaziURL = new URL(urlstr); - in = new BufferedReader(new InputStreamReader(plaziURL.openStream())); - - - //TODO lastUpdate field - // if(!plaziNotServer){ - while ((inputLine = in.readLine()) != null) { - System.out.println(inputLine); - if (inputLine.startsWith("(); - } - docList.add(pageStart+"---"+pageEnd+"---"+taxon+"---"+link); - documents.put(docID,docList); - } - } - } - System.out.println("hop"); - - - - for (String docId:documents.keySet()){ - in = new BufferedReader(new InputStreamReader(new URL(plaziUrlDoc+docId).openStream())); - while ((inputLine = in.readLine()) != null) { - if (inputLine.startsWith("(); - } - docList.add(pageStart+"---"+pageEnd+"---"+taxon+"---"+link); - docs.put(docID,docList); - } - } - } - // if(plaziNotServer) { - // sourcesStr.add(plaziUrl); - // } - // in.close(); - } catch (MalformedURLException e1) { - // TODO Auto-generated catch block + Map> documents = fillDocumentMap(filterType, filterList, urlstr); + +// checkTreatmentAvailable(documents, docs); + docs = documents; + + } catch (Exception e1) { e1.printStackTrace(); - } catch (IOException e) { - // TODO Auto-generated catch block - e.printStackTrace(); } - // System.exit(0); - // sourcesStr.add("/home/pkelbert/Documents/Proibiosphere/ChenopodiumXML/1362148061170_Chenopodium_K_hn_U_1993_tx.xml"); //System.out.println(documents); @@ -397,11 +321,83 @@ public class TaxonXImportLauncher { } + private static void checkTreatmentAvailable(Map> documents, Map> docs) + throws IOException, MalformedURLException { + List docList; + String inputLine; + for (String docId:documents.keySet()){ + URL url = new URL(plaziUrlDoc+docId); + BufferedReader in = new BufferedReader(new InputStreamReader(url.openStream())); + while ((inputLine = in.readLine()) != null) { + if (inputLine.startsWith("(); + } + docList.add(pageStart+"---" + pageEnd + "---" + taxon + "---"+link); + docs.put(docID,docList); + } + } + } + } + + private static Map> fillDocumentMap(FilterType filterType, + String[] filterList, String urlstr) + throws MalformedURLException, IOException { + + Map> documents = new HashMap>(); + List docList; + String inputLine; + for(String filter : filterList){ + // plaziUrl=plaziUrl+"Eupolybothrus"; + if (filterType == FilterType.MODS) { + urlstr=plaziUrlDoc + filter; + }else if (filterType == FilterType.TAXON) { + urlstr=plaziUrl + filter; + } + log.info("URLstr: " + urlstr); + + URL plaziURL = new URL(urlstr); + BufferedReader in = new BufferedReader(new InputStreamReader(plaziURL.openStream())); + + + //TODO lastUpdate field + // if(!plaziNotServer){ + while ((inputLine = in.readLine()) != null) { + System.out.println(inputLine); + if (inputLine.startsWith("(); + } + docList.add(pageStart+"---" + pageEnd + "---"+taxon+"---"+link); + documents.put(docID,docList); + } + } + } + System.out.println("documents created"); + + return documents; + } + /** * @param document * @return */ private static boolean doImportDocument(String document, int nbtreatments) { + if (nbtreatments>400) { return false; }