Latest changes to TaxonX import
author Andreas Müller <a.mueller@bgbm.org>
Thu, 5 Jun 2014 14:03:27 +0000 (14:03 +0000)
committer Andreas Müller <a.mueller@bgbm.org>
Thu, 5 Jun 2014 14:03:27 +0000 (14:03 +0000)
app-import/src/main/java/eu/etaxonomy/cdm/app/proibiosphere/TaxonXImportLauncher.java

index be7b6fa0ff940ceb289532ba07fe3d00ac97f076..e2f8fed54bb592c5e7d2487c9bb2477d7cc2fc65 100644
@@ -48,35 +48,42 @@ public class TaxonXImportLauncher {
 
     //database validation status (create, update, validate ...)
     static DbSchemaValidation hbm2dll = DbSchemaValidation.CREATE;
+//    static final ICdmDataSource cdmDestination = CdmDestinations.cdm_test_local_mysql();
     static final ICdmDataSource cdmDestination = CdmDestinations.localH2();
-
+//  static final ICdmDataSource cdmDestination = CdmDestinations.cdm_test_local_mysql_test();
     static final CHECK check = CHECK.IMPORT_WITHOUT_CHECK;
+    
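+    //how the Plazi treatments are filtered: by MODS document id or by taxon name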
+    private enum FilterType {MODS, TAXON}
 
 
     static String plaziUrl = "http://plazi.cs.umb.edu/GgServer/search?taxonomicName.isNomenclature=true&taxonomicName.exactMatch=true&indexName=0&subIndexName=taxonomicName&subIndexName=MODS&minSubResultSize=1&searchMode=index&resultFormat=xml&xsltUrl=http%3A%2F%2Fplazi.cs.umb.edu%2FGgServer%2Fresources%2FsrsWebPortalData%2FCdmSyncTreatmentList.xslt&taxonomicName.taxonomicName=";
     static String plaziUrlDoc = "http://plazi.cs.umb.edu/GgServer/search?taxonomicName.isNomenclature=true&taxonomicName.exactMatch=true&indexName=0&subIndexName=taxonomicName&subIndexName=MODS&minSubResultSize=1&searchMode=index&resultFormat=xml&xsltUrl=http%3A%2F%2Fplazi.cs.umb.edu%2FGgServer%2Fresources%2FsrsWebPortalData%2FCdmSyncTreatmentList.xslt&MODS.ModsDocID=";
 
 
-    private static String askQuestion(String question){
-        Scanner scan = new Scanner(System.in);
-        System.out.println(question);
-        String index = scan.nextLine();
-        return index;
-    }
 
     public static void main(String[] args) {
-//        String[] taxonList = new String[] {"Eupolybothrus","Polybothrus"};
-       /*ants*/ String[] modsList = new String[] {"3924", "3743", "4375", "6757", "6752", "3481", "21401_fisher_smith_plos_2008", "2592", "4096", "6877", "6192", "8071"};
+       String[] spiderModsList = new String[] {"zt03768p138","zt03750p196","zt03666p193","zt03664p068","zt03646p592","zt03507p056","zt03415p057","zt03383p038","zt03305p052","zt03228p068","zt03131p034","zt02963p068","zt02883p068","zt02814p018","zt02739p050","zt02730p043","zt02637p054","zt02593p127","zt02551p068","zt02534p036","zt02526p053","zt02427p035","zt02361p012","zt02267p068","zt02223p047","zt01826p058","zt01775p024","zt01744p040","zt01529p060","zt01004p028","zt00904","zt00872","zt00619","zt00109","DippenaarSchoeman1989Penestominae","Simon1902Cribellates","Simon1903Penestominae","Lehtinen1967CribellatePenestominae"};
+       
+       String[] taxonList = new String[]  {"Comaroma"}; //{"Eupolybothrus","Polybothrus"}, Chenopodium, Lactarius, Campylopus, Nephrolepis, Comaroma (spiders)
+//       /*ants Anochetus*/ String[] modsList = new String[] {"3924" /*, "3743", "4375", "6757", "6752", "3481", "21401_fisher_smith_plos_2008", "2592", "4096", "6877", "6192", "8071"  */};
 //        String[] modsList = new String[] {"21367", "21365", "8171", "6877", "21820", "3641", "6757"};
-//                debut="3743", "3628", "4022", "3994", "3603", "8070", "4001", "4071", "3948", "3481"};
+//        /*also ants*/        debut="3743", "3628", "4022", "3994", "3603", "8070", "4001", "4071", "3948", "3481"};
 //        continued: , };//,"3540555099"};
 //        modsList = new String[] {"Zapparoli-1986-Eupolybothrus-fasciatus"};
-        String tnomenclature = "ICZN";
+       taxonList = spiderModsList;
+       
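+       //despite its name, taxonList now holds MODS document ids (the spider list above), so filter by MODS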
+       FilterType filterType = FilterType.MODS;
+        
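+       //spiders (like the commented-out ant sets) are animals, hence the zoological code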
+       NomenclaturalCode tnomenclature = NomenclaturalCode.ICZN;
 
-        String defaultClassif="Ants";
+        String defaultClassification="Spiders";
+        boolean alwaysUseDefaultClassification = true;
 
-        Map<String,List<String>> documents = new HashMap<String,List<String>>();
-        HashMap<String,List<URI>>documentMap = new HashMap<String, List<URI>>();
+        
+        
+        
+        Map<String,List<URI>>documentMap = new HashMap<String, List<URI>>();
 
         /*HOW TO HANDLE SECUNDUM REFERENCE*/
         boolean reuseSecundum = askIfReuseSecundum();
@@ -85,34 +92,33 @@ public class TaxonXImportLauncher {
             secundum = askForSecundum();
         }
 
-//        checkTreatmentPresence("taxon",taxonList, documents,documentMap);
-        checkTreatmentPresence("modsid",modsList, documents,documentMap);
+        loadTreatmentIfPresent(filterType,taxonList, documentMap);
+//        loadTreatmentIfPresent(FilterType.MODS,modsList, documents,documentMap);
 
         TaxonXImportConfigurator taxonxImportConfigurator =null;
         CdmDefaultImport<TaxonXImportConfigurator> taxonImport = new CdmDefaultImport<TaxonXImportConfigurator>();
 
         ICdmDataSource destination = cdmDestination;
-        taxonxImportConfigurator = prepareTaxonXImport(destination,reuseSecundum, secundum);
+        taxonxImportConfigurator = prepareTaxonXImport(destination,reuseSecundum, secundum, tnomenclature, alwaysUseDefaultClassification);
 
-        taxonxImportConfigurator.setImportClassificationName(defaultClassif);
+        taxonxImportConfigurator.setImportClassificationName(defaultClassification);
         log.info("Start import from  TaxonX Data");
 
         taxonxImportConfigurator.setLastImport(false);
 
         int j=0;
-        for (String document:documentMap.keySet()){
+        for (String document : documentMap.keySet()){
             j++;
             if (doImportDocument(document, documentMap.get(document).size())){
                 int i=0;
-                for (URI source:documentMap.get(document)){
+                for (URI source: documentMap.get(document)){
                     System.out.println("START "+document+" "+i+" ("+(documentMap.get(document)).size()+"): "+source.getPath());
                     i++;
                     if (j==documentMap.keySet().size() && i==documentMap.get(document).size()) {
                         taxonxImportConfigurator.setLastImport(true);
                     }
-                        prepareReferenceAndSource(taxonxImportConfigurator,source);
-                    prepareNomenclature(taxonxImportConfigurator,tnomenclature);
-                    //   taxonxImportConfigurator.setTaxonReference(null);
+                    prepareReferenceAndSource(taxonxImportConfigurator,source);
+                     //   taxonxImportConfigurator.setTaxonReference(null);
                     taxonImport.invoke(taxonxImportConfigurator);
                     log.info("End import from SpecimenData ("+ source.toString() + ")...");
 
@@ -128,27 +134,11 @@ public class TaxonXImportLauncher {
     }
 
 
-
-    /**
-     * @param taxonxImportConfigurator
-     * @param tnomenclature
-     */
-    private static void prepareNomenclature(TaxonXImportConfigurator taxonxImportConfigurator, String tnomenclature) {
-        //            String tnomenclature = askQuestion("ICBN or ICZN ?");
-        taxonxImportConfigurator.setNomenclaturalCode(NomenclaturalCode.ICNAFP);
-        if (tnomenclature.equalsIgnoreCase("ICBN")) {
-            taxonxImportConfigurator.setNomenclaturalCode(NomenclaturalCode.ICNAFP);
-            //                taxonxImportConfigurator.setClassificationName("Chenopodiaceae");
-        }
-        if(tnomenclature.equalsIgnoreCase("ICZN")){
-            taxonxImportConfigurator.setNomenclaturalCode(NomenclaturalCode.ICZN);
-            //                taxonxImportConfigurator.setClassificationName("Ants");
-        }
-        if(tnomenclature.equalsIgnoreCase("ICNB")){
-            taxonxImportConfigurator.setNomenclaturalCode(NomenclaturalCode.ICNB);
-            //                taxonxImportConfigurator.setClassificationName("Bacteria");
-        }
-
+    private static String askQuestion(String question){
+        Scanner scan = new Scanner(System.in);
+        System.out.println(question);
+        String index = scan.nextLine();
+        return index;
     }
 
     /**
@@ -162,8 +152,7 @@ public class TaxonXImportLauncher {
         String tref="PLAZI - "+source.getPath().split("/")[source.getPath().split("/").length-1];
         reference.setTitleCache(tref,true);
         reference.setTitle(tref);
-        reference.generateTitle();
-
+        
         taxonxImportConfigurator.setSourceReference(reference);
         TaxonXImportConfigurator.setSourceRef(reference);
 
@@ -181,122 +170,57 @@ public class TaxonXImportLauncher {
      * @param destination
      * @param reuseSecundum
      * @param secundum
+     * @param tnomenclature 
+     * @param alwaysUseDefaultClassification 
      * @return
      */
-    private static TaxonXImportConfigurator prepareTaxonXImport(ICdmDataSource destination, boolean reuseSecundum, Reference<?> secundum) {
+    private static TaxonXImportConfigurator prepareTaxonXImport(ICdmDataSource destination, boolean reuseSecundum, Reference<?> secundum, NomenclaturalCode tnomenclature, boolean alwaysUseDefaultClassification) {
         TaxonXImportConfigurator taxonxImportConfigurator = TaxonXImportConfigurator.NewInstance(destination);
 
-        //        taxonxImportConfigurator.setClassificationName(taxonxImportConfigurator.getSourceReferenceTitle());
+        //taxonxImportConfigurator.setClassificationName(taxonxImportConfigurator.getSourceReferenceTitle());
         taxonxImportConfigurator.setCheck(check);
         taxonxImportConfigurator.setDbSchemaValidation(hbm2dll);
         taxonxImportConfigurator.setDoAutomaticParsing(true);
 
         taxonxImportConfigurator.setInteractWithUser(true);
+        taxonxImportConfigurator.setNomenclaturalCode(tnomenclature);
 
+        taxonxImportConfigurator.setAlwaysUseDefaultClassification(alwaysUseDefaultClassification);
 
         taxonxImportConfigurator.setKeepOriginalSecundum(reuseSecundum);
         if (!reuseSecundum) {
             taxonxImportConfigurator.setSecundum(secundum);
         }
 
-        //        taxonxImportConfigurator.setDoMatchTaxa(true);
-        //        taxonxImportConfigurator.setReUseTaxon(true);
+        //taxonxImportConfigurator.setDoMatchTaxa(true);
+        // taxonxImportConfigurator.setReUseTaxon(true);
         return taxonxImportConfigurator;
     }
 
     /**
-     * @param importFilter
-     * @param modsList
-     * @param documents
+     * @param filterType
+     * @param filterList
      * @param documentMap
      * @return
      */
-    private static HashMap<String, List<URI>> checkTreatmentPresence(String importFilter, String[] modsList, Map<String, List<String>> documents, HashMap<String, List<URI>> documentMap) {
-        URL plaziURL;
-        //        System.out.println(plaziUrl);
+    private static Map<String, List<URI>> loadTreatmentIfPresent(FilterType filterType, String[] filterList, Map<String, List<URI>> documentMap) {
 
-        Map<String, List<String>> docs = new HashMap<String, List<String>>();
+       Map<String, List<String>> docs = new HashMap<String, List<String>>();
         try {
-            BufferedReader in=null;
             List<String> docList;
             String inputLine;
-            String docID;
-            String pageStart;
-            String pageEnd;
-            String taxon;
-            String link;
             String urlstr="";
 
-            for(String modsID : modsList){
-                //        plaziUrl=plaziUrl+"Eupolybothrus";
-                if (importFilter.equalsIgnoreCase("modsid")) {
-                    urlstr=plaziUrlDoc+modsID;
-                }
-                if (importFilter.equalsIgnoreCase("taxon")) {
-                    urlstr=plaziUrl+modsID;
-                }
-//                System.out.println(url);
-
-                plaziURL = new URL(urlstr);
-                in = new BufferedReader(new InputStreamReader(plaziURL.openStream()));
-
-
-                //TODO lastUpdate field
-                //            if(!plaziNotServer){
-                while ((inputLine = in.readLine()) != null) {
-                    System.out.println(inputLine);
-                    if (inputLine.startsWith("<treatment ")){
-                        taxon = inputLine.split("taxon=\"")[1].split("\"")[0];
-                        docID=inputLine.split("docId=\"")[1].split("\"")[0];
-                        System.out.println("docID: "+docID);
-                        link=inputLine.split("link=\"")[1].split("\"")[0];
-                        pageStart = inputLine.split("startPage=\"")[1].split("\"")[0];
-                        pageEnd = inputLine.split("endPage=\"")[1].split("\"")[0];
-                        docList = documents.get(docID);
-                        if (docList == null) {
-                            docList = new ArrayList<String>();
-                        }
-                        docList.add(pageStart+"---"+pageEnd+"---"+taxon+"---"+link);
-                        documents.put(docID,docList);
-                    }
-                }
-            }
-            System.out.println("hop");
-
-
-
-            for (String docId:documents.keySet()){
-                in = new BufferedReader(new InputStreamReader(new URL(plaziUrlDoc+docId).openStream()));
-                while ((inputLine = in.readLine()) != null) {
-                    if (inputLine.startsWith("<treatment ")){
-                        taxon = inputLine.split("taxon=\"")[1].split("\"")[0];
-                        docID=inputLine.split("docId=\"")[1].split("\"")[0];
-                        link=inputLine.split("link=\"")[1].split("\"")[0];
-                        pageStart = inputLine.split("startPage=\"")[1].split("\"")[0];
-                        pageEnd = inputLine.split("endPage=\"")[1].split("\"")[0];
-                        docList = documents.get(docID);
-                        if (docList == null) {
-                            docList = new ArrayList<String>();
-                        }
-                        docList.add(pageStart+"---"+pageEnd+"---"+taxon+"---"+link);
-                        docs.put(docID,docList);
-                    }
-                }
-            }
-            //            if(plaziNotServer) {
-            //                sourcesStr.add(plaziUrl);
-            //            }
-            //            in.close();
-        } catch (MalformedURLException e1) {
-            // TODO Auto-generated catch block
+            Map<String,List<String>> documents =  fillDocumentMap(filterType, filterList, urlstr);
+
+//            checkTreatmentAvailable(documents, docs);
+            docs = documents;
+
+        } catch (Exception e1) {
             e1.printStackTrace();
-        } catch (IOException e) {
-            // TODO Auto-generated catch block
-            e.printStackTrace();
         }
 
-        //        System.exit(0);
-
         //        sourcesStr.add("/home/pkelbert/Documents/Proibiosphere/ChenopodiumXML/1362148061170_Chenopodium_K_hn_U_1993_tx.xml");
 
         //System.out.println(documents);
@@ -397,11 +321,83 @@ public class TaxonXImportLauncher {
 
     }
 
+       private static void checkTreatmentAvailable(Map<String, List<String>> documents, Map<String, List<String>> docs)
+                       throws IOException, MalformedURLException {
+               List<String> docList;
+               String inputLine;
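+               //re-query Plazi for every collected document id and copy its treatment entries (page range, taxon, link) into docs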
+               for (String docId:documents.keySet()){
+                       URL url = new URL(plaziUrlDoc+docId);
+                       BufferedReader in = new BufferedReader(new InputStreamReader(url.openStream()));
+                   while ((inputLine = in.readLine()) != null) {
+                       if (inputLine.startsWith("<treatment ")){
+                           String taxon = inputLine.split("taxon=\"")[1].split("\"")[0];
+                           String docID=inputLine.split("docId=\"")[1].split("\"")[0];
+                           String link=inputLine.split("link=\"")[1].split("\"")[0];
+                           String pageStart = inputLine.split("startPage=\"")[1].split("\"")[0];
+                           String pageEnd = inputLine.split("endPage=\"")[1].split("\"")[0];
+                           docList = documents.get(docID);
+                           if (docList == null) {
+                               docList = new ArrayList<String>();
+                           }
+                           docList.add(pageStart+"---" + pageEnd + "---" + taxon + "---"+link);
+                           docs.put(docID,docList);
+                       }
+                   }
+               }
+       }
+
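+       /**
+        * Builds the Plazi search URL for each filter entry (MODS document id or taxon name),
+        * reads the returned treatment list and groups the treatment entries by document id.
+        */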
+       private static Map<String, List<String>> fillDocumentMap(FilterType filterType,
+                       String[] filterList, String urlstr) 
+                                       throws MalformedURLException, IOException {
+               
+               Map<String, List<String>> documents = new HashMap<String, List<String>>();
+               List<String> docList;
+               String inputLine;
+               for(String filter : filterList){
+                   //        plaziUrl=plaziUrl+"Eupolybothrus";
+                   if (filterType == FilterType.MODS) {
+                       urlstr=plaziUrlDoc + filter;
+                   }else if (filterType == FilterType.TAXON) {
+                       urlstr=plaziUrl + filter;
+                   }
+                   log.info("URLstr: " + urlstr);
+
+                   URL plaziURL = new URL(urlstr);
+                   BufferedReader in = new BufferedReader(new InputStreamReader(plaziURL.openStream()));
+
+
+                   //TODO lastUpdate field
+                   //            if(!plaziNotServer){
+                   while ((inputLine = in.readLine()) != null) {
+                       System.out.println(inputLine);
+                       if (inputLine.startsWith("<treatment ")){
+                           String taxon = inputLine.split("taxon=\"")[1].split("\"")[0];
+                           String docID=inputLine.split("docId=\"")[1].split("\"")[0];
+                           System.out.println("docID: "+docID);
+                           
+                           String link=inputLine.split("link=\"")[1].split("\"")[0];
+                           String pageStart = inputLine.split("startPage=\"")[1].split("\"")[0];
+                           String pageEnd = inputLine.split("endPage=\"")[1].split("\"")[0];
+                           docList = documents.get(docID);
+                           if (docList == null) {
+                               docList = new ArrayList<String>();
+                           }
+                           docList.add(pageStart+"---" + pageEnd + "---"+taxon+"---"+link);
+                           documents.put(docID,docList);
+                       }
+                   }
+               }
+               System.out.println("documents created");
+               
+               return documents;
+       }
+
     /**
      * @param document
      * @return
      */
     private static boolean doImportDocument(String document, int nbtreatments) {
+
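+        //documents with more than 400 treatments are never imported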
         if (nbtreatments>400) {
             return false;
         }