Latest changes to TaxonX import
author Andreas Müller <a.mueller@bgbm.org>
Thu, 5 Jun 2014 14:03:27 +0000 (14:03 +0000)
committer Andreas Müller <a.mueller@bgbm.org>
Thu, 5 Jun 2014 14:03:27 +0000 (14:03 +0000)
app-import/src/main/java/eu/etaxonomy/cdm/app/proibiosphere/TaxonXImportLauncher.java

index be7b6fa0ff940ceb289532ba07fe3d00ac97f076..e2f8fed54bb592c5e7d2487c9bb2477d7cc2fc65 100644
@@ -48,35 +48,42 @@ public class TaxonXImportLauncher {
 
     //database validation status (create, update, validate ...)
     static DbSchemaValidation hbm2dll = DbSchemaValidation.CREATE;
+//    static final ICdmDataSource cdmDestination = CdmDestinations.cdm_test_local_mysql();
     static final ICdmDataSource cdmDestination = CdmDestinations.localH2();
-
+//  static final ICdmDataSource cdmDestination = CdmDestinations.cdm_test_local_mysql_test();
     static final CHECK check = CHECK.IMPORT_WITHOUT_CHECK;
+    
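+    //how the Plazi treatments are filtered: by MODS document id or by taxon name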
+    private enum FilterType {MODS, TAXON}
 
 
     static String plaziUrl = "http://plazi.cs.umb.edu/GgServer/search?taxonomicName.isNomenclature=true&taxonomicName.exactMatch=true&indexName=0&subIndexName=taxonomicName&subIndexName=MODS&minSubResultSize=1&searchMode=index&resultFormat=xml&xsltUrl=http%3A%2F%2Fplazi.cs.umb.edu%2FGgServer%2Fresources%2FsrsWebPortalData%2FCdmSyncTreatmentList.xslt&taxonomicName.taxonomicName=";
     static String plaziUrlDoc = "http://plazi.cs.umb.edu/GgServer/search?taxonomicName.isNomenclature=true&taxonomicName.exactMatch=true&indexName=0&subIndexName=taxonomicName&subIndexName=MODS&minSubResultSize=1&searchMode=index&resultFormat=xml&xsltUrl=http%3A%2F%2Fplazi.cs.umb.edu%2FGgServer%2Fresources%2FsrsWebPortalData%2FCdmSyncTreatmentList.xslt&MODS.ModsDocID=";
 
 
-    private static String askQuestion(String question){
-        Scanner scan = new Scanner(System.in);
-        System.out.println(question);
-        String index = scan.nextLine();
-        return index;
-    }
 
     public static void main(String[] args) {
-//        String[] taxonList = new String[] {"Eupolybothrus","Polybothrus"};
-       /*ants*/ String[] modsList = new String[] {"3924", "3743", "4375", "6757", "6752", "3481", "21401_fisher_smith_plos_2008", "2592", "4096", "6877", "6192", "8071"};
+       String[] spiderModsList = new String[] {"zt03768p138","zt03750p196","zt03666p193","zt03664p068","zt03646p592","zt03507p056","zt03415p057","zt03383p038","zt03305p052","zt03228p068","zt03131p034","zt02963p068","zt02883p068","zt02814p018","zt02739p050","zt02730p043","zt02637p054","zt02593p127","zt02551p068","zt02534p036","zt02526p053","zt02427p035","zt02361p012","zt02267p068","zt02223p047","zt01826p058","zt01775p024","zt01744p040","zt01529p060","zt01004p028","zt00904","zt00872","zt00619","zt00109","DippenaarSchoeman1989Penestominae","Simon1902Cribellates","Simon1903Penestominae","Lehtinen1967CribellatePenestominae"};
+       
+       String[] taxonList = new String[]  {"Comaroma"}; //{"Eupolybothrus","Polybothrus"}, Chenopodium, Lactarius, Campylopus, Nephrolepis, Comaroma (spiders)
+//       /*ants Anochetus*/ String[] modsList = new String[] {"3924" /*, "3743", "4375", "6757", "6752", "3481", "21401_fisher_smith_plos_2008", "2592", "4096", "6877", "6192", "8071"  */};
 //        String[] modsList = new String[] {"21367", "21365", "8171", "6877", "21820", "3641", "6757"};
-//                debut="3743", "3628", "4022", "3994", "3603", "8070", "4001", "4071", "3948", "3481"};
+//        /*also ants*/        debut="3743", "3628", "4022", "3994", "3603", "8070", "4001", "4071", "3948", "3481"};
 //        continued: , };//,"3540555099"};
 //        modsList = new String[] {"Zapparoli-1986-Eupolybothrus-fasciatus"};
-        String tnomenclature = "ICZN";
+       taxonList = spiderModsList;
+       
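+       //despite its name, taxonList now holds MODS document ids (the spider list above), so filter by MODS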
+       FilterType filterType = FilterType.MODS;
+        
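+       //spiders (like the commented-out ant sets) are animals, hence the zoological code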
+       NomenclaturalCode tnomenclature = NomenclaturalCode.ICZN;
 
-        String defaultClassif="Ants";
+        String defaultClassification="Spiders";
+        boolean alwaysUseDefaultClassification = true;
 
-        Map<String,List<String>> documents = new HashMap<String,List<String>>();
-        HashMap<String,List<URI>>documentMap = new HashMap<String, List<URI>>();
+        
+        
+        
+        Map<String,List<URI>>documentMap = new HashMap<String, List<URI>>();
 
         /*HOW TO HANDLE SECUNDUM REFERENCE*/
         boolean reuseSecundum = askIfReuseSecundum();
@@ -85,34 +92,33 @@ public class TaxonXImportLauncher {
             secundum = askForSecundum();
         }
 
-//        checkTreatmentPresence("taxon",taxonList, documents,documentMap);
-        checkTreatmentPresence("modsid",modsList, documents,documentMap);
+        loadTreatmentIfPresent(filterType,taxonList, documentMap);
+//        loadTreatmentIfPresent(FilterType.MODS,modsList, documents,documentMap);
 
         TaxonXImportConfigurator taxonxImportConfigurator =null;
         CdmDefaultImport<TaxonXImportConfigurator> taxonImport = new CdmDefaultImport<TaxonXImportConfigurator>();
 
         ICdmDataSource destination = cdmDestination;
-        taxonxImportConfigurator = prepareTaxonXImport(destination,reuseSecundum, secundum);
+        taxonxImportConfigurator = prepareTaxonXImport(destination,reuseSecundum, secundum, tnomenclature, alwaysUseDefaultClassification);
 
-        taxonxImportConfigurator.setImportClassificationName(defaultClassif);
+        taxonxImportConfigurator.setImportClassificationName(defaultClassification);
         log.info("Start import from  TaxonX Data");
 
         taxonxImportConfigurator.setLastImport(false);
 
         int j=0;
-        for (String document:documentMap.keySet()){
+        for (String document : documentMap.keySet()){
             j++;
             if (doImportDocument(document, documentMap.get(document).size())){
                 int i=0;
-                for (URI source:documentMap.get(document)){
+                for (URI source: documentMap.get(document)){
                     System.out.println("START "+document+" "+i+" ("+(documentMap.get(document)).size()+"): "+source.getPath());
                     i++;
                     if (j==documentMap.keySet().size() && i==documentMap.get(document).size()) {
                         taxonxImportConfigurator.setLastImport(true);
                     }
-                        prepareReferenceAndSource(taxonxImportConfigurator,source);
-                    prepareNomenclature(taxonxImportConfigurator,tnomenclature);
-                    //   taxonxImportConfigurator.setTaxonReference(null);
+                    prepareReferenceAndSource(taxonxImportConfigurator,source);
+                     //   taxonxImportConfigurator.setTaxonReference(null);
                     taxonImport.invoke(taxonxImportConfigurator);
                     log.info("End import from SpecimenData ("+ source.toString() + ")...");
 
@@ -128,27 +134,11 @@ public class TaxonXImportLauncher {
     }
 
 
-
-    /**
-     * @param taxonxImportConfigurator
-     * @param tnomenclature
-     */
-    private static void prepareNomenclature(TaxonXImportConfigurator taxonxImportConfigurator, String tnomenclature) {
-        //            String tnomenclature = askQuestion("ICBN or ICZN ?");
-        taxonxImportConfigurator.setNomenclaturalCode(NomenclaturalCode.ICNAFP);
-        if (tnomenclature.equalsIgnoreCase("ICBN")) {
-            taxonxImportConfigurator.setNomenclaturalCode(NomenclaturalCode.ICNAFP);
-            //                taxonxImportConfigurator.setClassificationName("Chenopodiaceae");
-        }
-        if(tnomenclature.equalsIgnoreCase("ICZN")){
-            taxonxImportConfigurator.setNomenclaturalCode(NomenclaturalCode.ICZN);
-            //                taxonxImportConfigurator.setClassificationName("Ants");
-        }
-        if(tnomenclature.equalsIgnoreCase("ICNB")){
-            taxonxImportConfigurator.setNomenclaturalCode(NomenclaturalCode.ICNB);
-            //                taxonxImportConfigurator.setClassificationName("Bacteria");
-        }
-
+    private static String askQuestion(String question){
+        Scanner scan = new Scanner(System.in);
+        System.out.println(question);
+        String index = scan.nextLine();
+        return index;
     }
 
     /**
@@ -162,8 +152,7 @@ public class TaxonXImportLauncher {
         String tref="PLAZI - "+source.getPath().split("/")[source.getPath().split("/").length-1];
         reference.setTitleCache(tref,true);
         reference.setTitle(tref);
-        reference.generateTitle();
-
+        
         taxonxImportConfigurator.setSourceReference(reference);
         TaxonXImportConfigurator.setSourceRef(reference);
 
@@ -181,122 +170,57 @@ public class TaxonXImportLauncher {
      * @param destination
      * @param reuseSecundum
      * @param secundum
+     * @param tnomenclature 
+     * @param alwaysUseDefaultClassification 
      * @return
      */
-    private static TaxonXImportConfigurator prepareTaxonXImport(ICdmDataSource destination, boolean reuseSecundum, Reference<?> secundum) {
+    private static TaxonXImportConfigurator prepareTaxonXImport(ICdmDataSource destination, boolean reuseSecundum, Reference<?> secundum, NomenclaturalCode tnomenclature, boolean alwaysUseDefaultClassification) {
         TaxonXImportConfigurator taxonxImportConfigurator = TaxonXImportConfigurator.NewInstance(destination);
 
-        //        taxonxImportConfigurator.setClassificationName(taxonxImportConfigurator.getSourceReferenceTitle());
+        //taxonxImportConfigurator.setClassificationName(taxonxImportConfigurator.getSourceReferenceTitle());
         taxonxImportConfigurator.setCheck(check);
         taxonxImportConfigurator.setDbSchemaValidation(hbm2dll);
         taxonxImportConfigurator.setDoAutomaticParsing(true);
 
         taxonxImportConfigurator.setInteractWithUser(true);
+        taxonxImportConfigurator.setNomenclaturalCode(tnomenclature);
 
+        taxonxImportConfigurator.setAlwaysUseDefaultClassification(alwaysUseDefaultClassification);
 
         taxonxImportConfigurator.setKeepOriginalSecundum(reuseSecundum);
         if (!reuseSecundum) {
             taxonxImportConfigurator.setSecundum(secundum);
         }
 
-        //        taxonxImportConfigurator.setDoMatchTaxa(true);
-        //        taxonxImportConfigurator.setReUseTaxon(true);
+        //taxonxImportConfigurator.setDoMatchTaxa(true);
+        // taxonxImportConfigurator.setReUseTaxon(true);
         return taxonxImportConfigurator;
     }
 
     /**
-     * @param importFilter
-     * @param modsList
-     * @param documents
+     * @param filterType
+     * @param filterList
      * @param documentMap
      * @return
      */
-    private static HashMap<String, List<URI>> checkTreatmentPresence(String importFilter, String[] modsList, Map<String, List<String>> documents, HashMap<String, List<URI>> documentMap) {
-        URL plaziURL;
-        //        System.out.println(plaziUrl);
+    private static Map<String, List<URI>> loadTreatmentIfPresent(FilterType filterType, String[] filterList, Map<String, List<URI>> documentMap) {
 
-        Map<String, List<String>> docs = new HashMap<String, List<String>>();
+       Map<String, List<String>> docs = new HashMap<String, List<String>>();
         try {
-            BufferedReader in=null;
             List<String> docList;
             String inputLine;
-            String docID;
-            String pageStart;
-            String pageEnd;
-            String taxon;
-            String link;
             String urlstr="";
 
-            for(String modsID : modsList){
-                //        plaziUrl=plaziUrl+"Eupolybothrus";
-                if (importFilter.equalsIgnoreCase("modsid")) {
-                    urlstr=plaziUrlDoc+modsID;
-                }
-                if (importFilter.equalsIgnoreCase("taxon")) {
-                    urlstr=plaziUrl+modsID;
-                }
-//                System.out.println(url);
-
-                plaziURL = new URL(urlstr);
-                in = new BufferedReader(new InputStreamReader(plaziURL.openStream()));
-
-
-                //TODO lastUpdate field
-                //            if(!plaziNotServer){
-                while ((inputLine = in.readLine()) != null) {
-                    System.out.println(inputLine);
-                    if (inputLine.startsWith("<treatment ")){
-                        taxon = inputLine.split("taxon=\"")[1].split("\"")[0];
-                        docID=inputLine.split("docId=\"")[1].split("\"")[0];
-                        System.out.println("docID: "+docID);
-                        link=inputLine.split("link=\"")[1].split("\"")[0];
-                        pageStart = inputLine.split("startPage=\"")[1].split("\"")[0];
-                        pageEnd = inputLine.split("endPage=\"")[1].split("\"")[0];
-                        docList = documents.get(docID);
-                        if (docList == null) {
-                            docList = new ArrayList<String>();
-                        }
-                        docList.add(pageStart+"---"+pageEnd+"---"+taxon+"---"+link);
-                        documents.put(docID,docList);
-                    }
-                }
-            }
-            System.out.println("hop");
-
-
-
-            for (String docId:documents.keySet()){
-                in = new BufferedReader(new InputStreamReader(new URL(plaziUrlDoc+docId).openStream()));
-                while ((inputLine = in.readLine()) != null) {
-                    if (inputLine.startsWith("<treatment ")){
-                        taxon = inputLine.split("taxon=\"")[1].split("\"")[0];
-                        docID=inputLine.split("docId=\"")[1].split("\"")[0];
-                        link=inputLine.split("link=\"")[1].split("\"")[0];
-                        pageStart = inputLine.split("startPage=\"")[1].split("\"")[0];
-                        pageEnd = inputLine.split("endPage=\"")[1].split("\"")[0];
-                        docList = documents.get(docID);
-                        if (docList == null) {
-                            docList = new ArrayList<String>();
-                        }
-                        docList.add(pageStart+"---"+pageEnd+"---"+taxon+"---"+link);
-                        docs.put(docID,docList);
-                    }
-                }
-            }
-            //            if(plaziNotServer) {
-            //                sourcesStr.add(plaziUrl);
-            //            }
-            //            in.close();
-        } catch (MalformedURLException e1) {
-            // TODO Auto-generated catch block
+            Map<String,List<String>> documents =  fillDocumentMap(filterType, filterList, urlstr);
+
+//            checkTreatmentAvailable(documents, docs);
+            docs = documents;
+
+        } catch (Exception e1) {
             e1.printStackTrace();
-        } catch (IOException e) {
-            // TODO Auto-generated catch block
-            e.printStackTrace();
         }
 
-        //        System.exit(0);
-
         //        sourcesStr.add("/home/pkelbert/Documents/Proibiosphere/ChenopodiumXML/1362148061170_Chenopodium_K_hn_U_1993_tx.xml");
 
         //System.out.println(documents);
@@ -397,11 +321,83 @@ public class TaxonXImportLauncher {
 
     }
 
+       private static void checkTreatmentAvailable(Map<String, List<String>> documents, Map<String, List<String>> docs)
+                       throws IOException, MalformedURLException {
+               List<String> docList;
+               String inputLine;
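+               //re-query Plazi for every collected document id and copy its treatment entries (page range, taxon, link) into docs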
+               for (String docId:documents.keySet()){
+                       URL url = new URL(plaziUrlDoc+docId);
+                       BufferedReader in = new BufferedReader(new InputStreamReader(url.openStream()));
+                   while ((inputLine = in.readLine()) != null) {
+                       if (inputLine.startsWith("<treatment ")){
+                           String taxon = inputLine.split("taxon=\"")[1].split("\"")[0];
+                           String docID=inputLine.split("docId=\"")[1].split("\"")[0];
+                           String link=inputLine.split("link=\"")[1].split("\"")[0];
+                           String pageStart = inputLine.split("startPage=\"")[1].split("\"")[0];
+                           String pageEnd = inputLine.split("endPage=\"")[1].split("\"")[0];
+                           docList = documents.get(docID);
+                           if (docList == null) {
+                               docList = new ArrayList<String>();
+                           }
+                           docList.add(pageStart+"---" + pageEnd + "---" + taxon + "---"+link);
+                           docs.put(docID,docList);
+                       }
+                   }
+               }
+       }
+
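+       /**
+        * Builds the Plazi search URL for each filter entry (MODS document id or taxon name),
+        * reads the returned treatment list and groups the treatment entries by document id.
+        */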
+       private static Map<String, List<String>> fillDocumentMap(FilterType filterType,
+                       String[] filterList, String urlstr) 
+                                       throws MalformedURLException, IOException {
+               
+               Map<String, List<String>> documents = new HashMap<String, List<String>>();
+               List<String> docList;
+               String inputLine;
+               for(String filter : filterList){
+                   //        plaziUrl=plaziUrl+"Eupolybothrus";
+                   if (filterType == FilterType.MODS) {
+                       urlstr=plaziUrlDoc + filter;
+                   }else if (filterType == FilterType.TAXON) {
+                       urlstr=plaziUrl + filter;
+                   }
+                   log.info("URLstr: " + urlstr);
+
+                   URL plaziURL = new URL(urlstr);
+                   BufferedReader in = new BufferedReader(new InputStreamReader(plaziURL.openStream()));
+
+
+                   //TODO lastUpdate field
+                   //            if(!plaziNotServer){
+                   while ((inputLine = in.readLine()) != null) {
+                       System.out.println(inputLine);
+                       if (inputLine.startsWith("<treatment ")){
+                           String taxon = inputLine.split("taxon=\"")[1].split("\"")[0];
+                           String docID=inputLine.split("docId=\"")[1].split("\"")[0];
+                           System.out.println("docID: "+docID);
+                           
+                           String link=inputLine.split("link=\"")[1].split("\"")[0];
+                           String pageStart = inputLine.split("startPage=\"")[1].split("\"")[0];
+                           String pageEnd = inputLine.split("endPage=\"")[1].split("\"")[0];
+                           docList = documents.get(docID);
+                           if (docList == null) {
+                               docList = new ArrayList<String>();
+                           }
+                           docList.add(pageStart+"---" + pageEnd + "---"+taxon+"---"+link);
+                           documents.put(docID,docList);
+                       }
+                   }
+               }
+               System.out.println("documents created");
+               
+               return documents;
+       }
+
     /**
      * @param document
      * @return
      */
     private static boolean doImportDocument(String document, int nbtreatments) {
+
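+        //documents with more than 400 treatments are never imported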
         if (nbtreatments>400) {
             return false;
         }