2 * Copyright (C) 2007 EDIT
3 * European Distributed Institute of Taxonomy
4 * http://www.e-taxonomy.eu
6 * The contents of this file are subject to the Mozilla Public License Version 1.1
7 * See LICENSE.TXT at the top of this package for the full license terms.
10 package eu
.etaxonomy
.cdm
.app
.proibiosphere
;
11 import java
.awt
.Dimension
;
12 import java
.io
.BufferedReader
;
13 import java
.io
.IOException
;
14 import java
.io
.InputStreamReader
;
15 import java
.net
.MalformedURLException
;
17 import java
.net
.URISyntaxException
;
19 import java
.util
.ArrayList
;
20 import java
.util
.Collections
;
21 import java
.util
.HashMap
;
22 import java
.util
.HashSet
;
23 import java
.util
.List
;
25 import java
.util
.Scanner
;
27 import javax
.swing
.JOptionPane
;
28 import javax
.swing
.JScrollPane
;
29 import javax
.swing
.JTextArea
;
31 import org
.apache
.log4j
.Logger
;
33 import eu
.etaxonomy
.cdm
.app
.common
.CdmDestinations
;
34 import eu
.etaxonomy
.cdm
.database
.DbSchemaValidation
;
35 import eu
.etaxonomy
.cdm
.database
.ICdmDataSource
;
36 import eu
.etaxonomy
.cdm
.io
.common
.CdmDefaultImport
;
37 import eu
.etaxonomy
.cdm
.io
.common
.IImportConfigurator
.CHECK
;
38 import eu
.etaxonomy
.cdm
.io
.taxonx2013
.TaxonXImportConfigurator
;
39 import eu
.etaxonomy
.cdm
.model
.name
.NomenclaturalCode
;
40 import eu
.etaxonomy
.cdm
.model
.reference
.Reference
;
41 import eu
.etaxonomy
.cdm
.model
.reference
.ReferenceFactory
;
45 public class TaxonXImportLauncher
{
46 private static final Logger log
= Logger
.getLogger(TaxonXImportLauncher
.class);
47 // private static final Logger log = Logger.getLogger(CdmEntityDaoBase.class);
49 //database validation status (create, update, validate ...)
50 static DbSchemaValidation hbm2dll
= DbSchemaValidation
.CREATE
;
51 // static final ICdmDataSource cdmDestination = CdmDestinations.cdm_test_local_mysql();
52 static final ICdmDataSource cdmDestination
= CdmDestinations
.localH2();
53 // static final ICdmDataSource cdmDestination = CdmDestinations.cdm_test_local_mysql_test();
55 static final CHECK check
= CHECK
.IMPORT_WITHOUT_CHECK
;
57 private enum FilterType
{MODS
, TAXON
};
60 static String plaziUrl
= "http://plazi.cs.umb.edu/GgServer/search?taxonomicName.isNomenclature=true&taxonomicName.exactMatch=true&indexName=0&subIndexName=taxonomicName&subIndexName=MODS&minSubResultSize=1&searchMode=index&resultFormat=xml&xsltUrl=http%3A%2F%2Fplazi.cs.umb.edu%2FGgServer%2Fresources%2FsrsWebPortalData%2FCdmSyncTreatmentList.xslt&taxonomicName.taxonomicName=";
61 static String plaziUrlDoc
= "http://plazi.cs.umb.edu/GgServer/search?taxonomicName.isNomenclature=true&taxonomicName.exactMatch=true&indexName=0&subIndexName=taxonomicName&subIndexName=MODS&minSubResultSize=1&searchMode=index&resultFormat=xml&xsltUrl=http%3A%2F%2Fplazi.cs.umb.edu%2FGgServer%2Fresources%2FsrsWebPortalData%2FCdmSyncTreatmentList.xslt&MODS.ModsDocID=";
/**
 * Command-line entry point of the launcher.
 *
 * Visible flow: picks the hard-coded list of MODS document ids, asks the user how the
 * secundum reference is to be handled, resolves the ids to treatment URIs through
 * loadTreatmentIfPresent, builds one TaxonXImportConfigurator and invokes the import once
 * per treatment URI of every document accepted by doImportDocument.
 *
 * NOTE(review): the counters i and j used in the loops, as well as several closing braces,
 * are not present in this text of the file — confirm the loop bookkeeping against the
 * complete source.
 *
 * @param args command-line arguments; not read by the visible code
 */
65 public static void main(String
[] args
) {
// Hard-coded MODS document ids; presumably the spider publications (the default classification below is "Spiders").
66 String
[] spiderModsList
= new String
[] {"zt03768p138","zt03750p196","zt03666p193","zt03664p068","zt03646p592","zt03507p056","zt03415p057","zt03383p038","zt03305p052","zt03228p068","zt03131p034","zt02963p068","zt02883p068","zt02814p018","zt02739p050","zt02730p043","zt02637p054","zt02593p127","zt02551p068","zt02534p036","zt02526p053","zt02427p035","zt02361p012","zt02267p068","zt02223p047","zt01826p058","zt01775p024","zt01744p040","zt01529p060","zt01004p028","zt00904","zt00872","zt00619","zt00109","DippenaarSchoeman1989Penestominae","Simon1902Cribellates","Simon1903Penestominae","Lehtinen1967CribellatePenestominae"};
68 String
[] taxonList
= new String
[] {"Comaroma"}; //{"Eupolybothrus","Polybothrus"}, Chenopodium, Lactarius, Campylopus, Nephrolepis, Comaroma (spiders)
69 // /*ants Anochetus*/ String[] modsList = new String[] {"3924" /*, "3743", "4375", "6757", "6752", "3481", "21401_fisher_smith_plos_2008", "2592", "4096", "6877", "6192", "8071" */};
70 // String[] modsList = new String[] {"21367", "21365", "8171", "6877", "21820", "3641", "6757"};
71 // /*also ants*/ start="3743", "3628", "4022", "3994", "3603", "8070", "4001", "4071", "3948", "3481"};
72 // continuation: , };//,"3540555099"};
73 // modsList = new String[] {"Zapparoli-1986-Eupolybothrus-fasciatus"};
// The spider document list replaces the taxon list assigned just above.
74 taxonList
= spiderModsList
;
// The entries of taxonList are therefore treated as MODS document ids.
76 FilterType filterType
= FilterType
.MODS
;
78 NomenclaturalCode tnomenclature
= NomenclaturalCode
.ICZN
;
80 String defaultClassification
="Spiders";
81 boolean alwaysUseDefaultClassification
= true;
// Document id -> treatment URIs; filled by loadTreatmentIfPresent.
86 Map
<String
,List
<URI
>>documentMap
= new HashMap
<String
, List
<URI
>>();
88 /*HOW TO HANDLE SECUNDUM REFERENCE*/
89 boolean reuseSecundum
= askIfReuseSecundum();
90 Reference
<?
> secundum
= null;
// NOTE(review): as shown here the secundum is asked for regardless of reuseSecundum;
// prepareTaxonXImport only applies it when reuseSecundum is false.
92 secundum
= askForSecundum();
95 loadTreatmentIfPresent(filterType
,taxonList
, documentMap
);
96 // loadTreatmentIfPresent(FilterType.MODS,modsList, documents,documentMap);
98 TaxonXImportConfigurator taxonxImportConfigurator
=null;
99 CdmDefaultImport
<TaxonXImportConfigurator
> taxonImport
= new CdmDefaultImport
<TaxonXImportConfigurator
>();
101 ICdmDataSource destination
= cdmDestination
;
102 taxonxImportConfigurator
= prepareTaxonXImport(destination
,reuseSecundum
, secundum
, tnomenclature
, alwaysUseDefaultClassification
);
104 taxonxImportConfigurator
.setImportClassificationName(defaultClassification
);
105 log
.info("Start import from TaxonX Data");
// Cleared here; set to true inside the loop once j equals the number of documents
// and i equals the number of treatments of the current document.
107 taxonxImportConfigurator
.setLastImport(false);
110 for (String document
: documentMap
.keySet()){
// doImportDocument decides per document, given its id and treatment count.
112 if (doImportDocument(document
, documentMap
.get(document
).size())){
114 for (URI source
: documentMap
.get(document
)){
115 System
.out
.println("START "+document
+" "+i
+" ("+(documentMap
.get(document
)).size()+"): "+source
.getPath());
117 if (j
==documentMap
.keySet().size() && i
==documentMap
.get(document
).size()) {
118 taxonxImportConfigurator
.setLastImport(true);
// The same configurator is re-pointed at the current treatment before each invocation.
120 prepareReferenceAndSource(taxonxImportConfigurator
,source
);
121 // taxonxImportConfigurator.setTaxonReference(null);
122 taxonImport
.invoke(taxonxImportConfigurator
);
123 log
.info("End import from SpecimenData ("+ source
.toString() + ")...");
126 // ICdmApplicationConfiguration app = taxonImport.getCdmAppController();
127 // int count = app.getAgentService().deduplicate(Person.class, null, null);
128 // logger.warn("Deduplicated " + count + " persons.");
129 // count = app.getReferenceService().deduplicate(Reference.class, null, null);
130 // logger.warn("Deduplicated " + count + " references.");
137 private static String
askQuestion(String question
){
138 Scanner scan
= new Scanner(System
.in
);
139 System
.out
.println(question
);
140 String index
= scan
.nextLine();
145 * @param taxonxImportConfigurator
149 private static void prepareReferenceAndSource(TaxonXImportConfigurator taxonxImportConfigurator
, URI source
) {
150 Reference
<?
> reference
= ReferenceFactory
.newGeneric();
151 // String tref = askQuestion("Import source? (ie Plazi document ID)");
152 String tref
="PLAZI - "+source
.getPath().split("/")[source
.getPath().split("/").length
-1];
153 reference
.setTitleCache(tref
,true);
154 reference
.setTitle(tref
);
156 taxonxImportConfigurator
.setSourceReference(reference
);
157 TaxonXImportConfigurator
.setSourceRef(reference
);
159 Reference
<?
> referenceUrl
= ReferenceFactory
.newWebPage();
160 referenceUrl
.setTitleCache(source
.toString(), true);
161 referenceUrl
.setTitle(source
.toString());
162 reference
.setUri(source
);
163 referenceUrl
.generateTitle();
165 taxonxImportConfigurator
.addOriginalSource(referenceUrl
);
166 taxonxImportConfigurator
.setSource(source
);
171 * @param reuseSecundum
173 * @param tnomenclature
174 * @param alwaysUseDefaultClassification
177 private static TaxonXImportConfigurator
prepareTaxonXImport(ICdmDataSource destination
, boolean reuseSecundum
, Reference
<?
> secundum
, NomenclaturalCode tnomenclature
, boolean alwaysUseDefaultClassification
) {
178 TaxonXImportConfigurator taxonxImportConfigurator
= TaxonXImportConfigurator
.NewInstance(destination
);
180 //taxonxImportConfigurator.setClassificationName(taxonxImportConfigurator.getSourceReferenceTitle());
181 taxonxImportConfigurator
.setCheck(check
);
182 taxonxImportConfigurator
.setDbSchemaValidation(hbm2dll
);
183 taxonxImportConfigurator
.setDoAutomaticParsing(true);
185 taxonxImportConfigurator
.setInteractWithUser(true);
186 taxonxImportConfigurator
.setNomenclaturalCode(tnomenclature
);
188 taxonxImportConfigurator
.setAlwaysUseDefaultClassification(alwaysUseDefaultClassification
);
190 taxonxImportConfigurator
.setKeepOriginalSecundum(reuseSecundum
);
191 if (!reuseSecundum
) {
192 taxonxImportConfigurator
.setSecundum(secundum
);
195 //taxonxImportConfigurator.setDoMatchTaxa(true);
196 // taxonxImportConfigurator.setReUseTaxon(true);
197 return taxonxImportConfigurator
;
/**
 * Resolves the given filters to treatment URIs and adds them to documentMap, grouped by
 * document id and ordered by start page.
 *
 * Visible steps: fillDocumentMap queries Plazi for the filters; for every document id in
 * docs the treatment entries ("startPage---endPage---taxon---link") are de-duplicated and
 * bucketed by start page, and the links are appended as URIs to the list stored in
 * documentMap under that document id.
 *
 * NOTE(review): in this text of the file urlstr is never declared, nothing copies the
 * result of fillDocumentMap ("documents") into "docs", and the try / return / closing-brace
 * lines are missing — confirm against the complete source.
 *
 * @param filterType whether the filters are MODS document ids or taxon names
 * @param filterList filters passed on to fillDocumentMap
 * @param documentMap document id -> treatment URIs; extended in place
 * @return presumably documentMap (the return statement is not visible here)
 */
207 private static Map
<String
, List
<URI
>> loadTreatmentIfPresent(FilterType filterType
, String
[] filterList
, Map
<String
, List
<URI
>> documentMap
) {
// Document id -> treatment entries; this is the map iterated further down.
209 Map
<String
, List
<String
>> docs
= new HashMap
<String
, List
<String
>>();
211 List
<String
> docList
;
215 Map
<String
,List
<String
>> documents
= fillDocumentMap(filterType
, filterList
, urlstr
);
217 // checkTreatmentAvailable(documents, docs);
// Any failure while querying Plazi is only printed.
220 } catch (Exception e1
) {
221 e1
.printStackTrace();
224 // sourcesStr.add("/home/pkelbert/Documents/Proibiosphere/ChenopodiumXML/1362148061170_Chenopodium_K_hn_U_1993_tx.xml");
226 //System.out.println(documents);
227 for (String docId
: docs
.keySet()){
// The round trip through a HashSet removes duplicate treatment entries.
228 List
<String
> treatments
= new ArrayList
<String
>(new HashSet
<String
>(docs
.get(docId
)));
// Start page -> links of the treatments beginning on that page.
230 Map
<Integer
, List
<String
>> startPages
= new HashMap
<Integer
, List
<String
>>();
231 for (String treatment
:treatments
) {
// Field 0 of an entry is the start page; Integer.valueOf throws NumberFormatException if it is not numeric.
232 List
<String
>tmplist
= startPages
.get(Integer
.valueOf(treatment
.split("---")[0]));
233 if (tmplist
== null) {
234 tmplist
= new ArrayList
<String
>();
// Field 3 of an entry is the treatment link.
236 tmplist
.add(treatment
.split("---")[3]);
237 startPages
.put(Integer
.valueOf(treatment
.split("---")[0]),tmplist
);
// Sorted start pages give the order in which the links are appended below.
239 List
<Integer
> pages
= new ArrayList
<Integer
>();
240 pages
.addAll(startPages
.keySet());
242 Collections
.sort(pages
);
245 log
.info("Document "+docId
+" should have "+treatments
.size()+" treatments");
246 //don't test if all the treatments are really online, it should be working without problems now
248 // if(treatments.size()<150){
250 // for (String source:treatments){
251 // DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
252 // DocumentBuilder builder;
256 // builder = factory.newDocumentBuilder();
257 // url = new URL(source.split("---")[3]);
258 // Object o = url.getContent();
259 // InputStream is = (InputStream) o;
260 // Document document = builder.parse(is);
262 // }catch(Exception e){
263 // // e.printStackTrace();
267 // log.info("Document "+docId+" has "+cnt+" treatments available");
269 // if(treatments.size() != cnt)
271 // File file = new File("/home/pkelbert/Bureau/urlTaxonXToDoLater.txt");
272 // FileWriter writer;
274 // writer = new FileWriter(file ,true);
275 // writer.write(docId+"\n");
278 // } catch (IOException e1) {
279 // // TODO Auto-generated catch block
280 // e1.printStackTrace();
// Extend the list already stored for this document, or start a new one.
285 List
<URI
> uritmp
= documentMap
.get(docId
);
286 if (uritmp
== null) {
287 uritmp
= new ArrayList
<URI
>();
289 for (int page
:pages
) {
290 for (String treatment
: startPages
.get(page
)) {
292 uritmp
.add(new URL(treatment
).toURI());
// NOTE(review): the bodies of both catch blocks are not visible here.
293 } catch (MalformedURLException e
) {
294 // TODO Auto-generated catch block
296 } catch (URISyntaxException e
) {
297 // TODO Auto-generated catch block
302 documentMap
.put(docId
, uritmp
);
309 ////// log.info("NB SOURCES : "+sourcesStr.size());
310 // List<URI> sourcesStr = new ArrayList<URI>();
312 //// documentMap = new HashMap<String, List<URI>>();
313 // sourcesStr.add(new URI("http://plazi.cs.umb.edu/GgServer/cdmSync/8F5B3EA099D371BC41CC5DDBFEDCFBED"));
314 // documentMap.put("singlesource", sourcesStr);
315 // } catch (URISyntaxException e) {
316 // // TODO Auto-generated catch block
317 // e.printStackTrace();
324 private static void checkTreatmentAvailable(Map
<String
, List
<String
>> documents
, Map
<String
, List
<String
>> docs
)
325 throws IOException
, MalformedURLException
{
326 List
<String
> docList
;
328 for (String docId
:documents
.keySet()){
329 URL url
= new URL(plaziUrlDoc
+docId
);
330 BufferedReader in
= new BufferedReader(new InputStreamReader(url
.openStream()));
331 while ((inputLine
= in
.readLine()) != null) {
332 if (inputLine
.startsWith("<treatment ")){
333 String taxon
= inputLine
.split("taxon=\"")[1].split("\"")[0];
334 String docID
=inputLine
.split("docId=\"")[1].split("\"")[0];
335 String link
=inputLine
.split("link=\"")[1].split("\"")[0];
336 String pageStart
= inputLine
.split("startPage=\"")[1].split("\"")[0];
337 String pageEnd
= inputLine
.split("endPage=\"")[1].split("\"")[0];
338 docList
= documents
.get(docID
);
339 if (docList
== null) {
340 docList
= new ArrayList
<String
>();
342 docList
.add(pageStart
+"---" + pageEnd
+ "---" + taxon
+ "---"+link
);
343 docs
.put(docID
,docList
);
/**
 * Queries the Plazi search once per filter and collects the treatments it lists.
 *
 * Every answer line starting with "<treatment " yields one entry of the form
 * "startPage---endPage---taxon---link", stored under the document id found on that line.
 * Each answer line and each document id is also echoed on standard output.
 *
 * NOTE(review): inputLine is not declared in this text of the file, no close() of the
 * reader is visible (possible stream leak), and the return statement is missing —
 * presumably the method returns "documents"; confirm against the complete source.
 *
 * @param filterType MODS to query by document id, TAXON to query by taxon name
 * @param filterList document ids or taxon names, one query each
 * @param urlstr overwritten for both MODS and TAXON; only used as passed in for any other filterType value
 * @return document id -> treatment entries (see the note above)
 * @throws MalformedURLException if a query URL cannot be built
 * @throws IOException if a Plazi answer cannot be read
 */
349 private static Map
<String
, List
<String
>> fillDocumentMap(FilterType filterType
,
350 String
[] filterList
, String urlstr
)
351 throws MalformedURLException
, IOException
{
353 Map
<String
, List
<String
>> documents
= new HashMap
<String
, List
<String
>>();
354 List
<String
> docList
;
356 for(String filter
: filterList
){
357 // plaziUrl=plaziUrl+"Eupolybothrus";
// The query URL is the prefix matching the filter type followed by the filter.
358 if (filterType
== FilterType
.MODS
) {
359 urlstr
=plaziUrlDoc
+ filter
;
360 }else if (filterType
== FilterType
.TAXON
) {
361 urlstr
=plaziUrl
+ filter
;
363 log
.info("URLstr: " + urlstr
);
365 URL plaziURL
= new URL(urlstr
);
// The reader uses the platform default charset.
366 BufferedReader in
= new BufferedReader(new InputStreamReader(plaziURL
.openStream()));
369 //TODO lastUpdate field
370 // if(!plaziNotServer){
371 while ((inputLine
= in
.readLine()) != null) {
372 System
.out
.println(inputLine
);
373 if (inputLine
.startsWith("<treatment ")){
// Attribute values are cut out of the raw line; a missing attribute raises ArrayIndexOutOfBoundsException.
374 String taxon
= inputLine
.split("taxon=\"")[1].split("\"")[0];
375 String docID
=inputLine
.split("docId=\"")[1].split("\"")[0];
376 System
.out
.println("docID: "+docID
);
378 String link
=inputLine
.split("link=\"")[1].split("\"")[0];
379 String pageStart
= inputLine
.split("startPage=\"")[1].split("\"")[0];
380 String pageEnd
= inputLine
.split("endPage=\"")[1].split("\"")[0];
// Append to the list already stored for this document id, or start a new one.
381 docList
= documents
.get(docID
);
382 if (docList
== null) {
383 docList
= new ArrayList
<String
>();
385 docList
.add(pageStart
+"---" + pageEnd
+ "---"+taxon
+"---"+link
);
386 documents
.put(docID
,docList
);
390 System
.out
.println("documents created");
/**
 * Decides whether one document is imported.
 *
 * Visible checks, in order: more than 400 treatments; document id equal (ignoring case)
 * to "1314-2828-2", "21367" or "1314-2828-1"; finally a Swing confirmation dialog that
 * shows the id and the treatment count.
 *
 * NOTE(review): the statements inside the four guards and the mapping from the dialog
 * result s to the returned boolean are not present in this text of the file — presumably
 * the guards skip the document and "Yes" accepts it; confirm against the complete source.
 *
 * @param document document id, as used for the keys of the document map
 * @param nbtreatments number of treatments found for the document
 * @return whether the document should be imported
 */
399 private static boolean doImportDocument(String document
, int nbtreatments
) {
// Size guard for very large documents.
401 if (nbtreatments
>400) {
// Guards for three specific document ids.
404 if (document
.equalsIgnoreCase("1314-2828-2")) { //this is a mix of several publications..
407 if (document
.equalsIgnoreCase("21367")) { //600treatments for ants..
410 if (document
.equalsIgnoreCase("1314-2828-1")) { //900treatments for eupoly..
415 // List<String> docDone = Arrays.asList(new String[]{"3540555099", "0910-2878-5652", "5012-9059-4108",
416 // "3784-0748-2261","3-201-00728-5", "FloNuttDuWin1838", "FlNordica_chenop","2580-1363-7530",
417 // "1842460692","5161-7797-8064","FlCaboVerde_Chen","2819-9661-8339","2626-3794-9273"});//,
418 // // "8776-7797-8303"});
419 // if (docDone.contains(document)) {
// Everything else is put to the user in a confirmation dialog.
423 JTextArea textArea = new JTextArea("Should this document be imported ("+nbtreatments+")? \n'"+document+"'");
424 JScrollPane scrollPane = new JScrollPane(textArea);
425 textArea.setLineWrap(true);
426 textArea.setWrapStyleWord(true);
427 scrollPane.setPreferredSize( new Dimension( 700, 70 ) );
429 // JFrame frame = new JFrame("I have a question");
430 // frame.setDefaultCloseOperation(JFrame.EXIT_ON_CLOSE);
431 int s = JOptionPane.showConfirmDialog(null, scrollPane);
443 private static boolean askIfReuseSecundum() {
444 // logger.info("getFullReference for "+ name);
445 JTextArea textArea
= new JTextArea("Reuse the secundum present in the current classification? " +
446 "\n Click Yes to reuse it, click No or Cancel to create a new one.\nA default secundum will be created if needed.");
447 JScrollPane scrollPane
= new JScrollPane(textArea
);
448 textArea
.setLineWrap(true);
449 textArea
.setWrapStyleWord(true);
450 scrollPane
.setPreferredSize( new Dimension( 700, 70 ) );
452 // JFrame frame = new JFrame("I have a question");
453 // frame.setDefaultCloseOperation(JFrame.EXIT_ON_CLOSE);
454 int s
= JOptionPane
.showConfirmDialog(null, scrollPane
);
465 private static Reference
<?
> askForSecundum() {
466 // logger.info("getFullReference for "+ name);
467 JTextArea textArea
= new JTextArea("Enter the secundum name");
468 JScrollPane scrollPane
= new JScrollPane(textArea
);
469 textArea
.setLineWrap(true);
470 textArea
.setWrapStyleWord(true);
471 scrollPane
.setPreferredSize( new Dimension( 700, 100 ) );
473 // JFrame frame = new JFrame("I have a question");
474 // frame.setDefaultCloseOperation(JFrame.EXIT_ON_CLOSE);
475 String s
= (String
) JOptionPane
.showInputDialog(
479 JOptionPane
.PLAIN_MESSAGE
,
483 Reference
<?
> ref
= ReferenceFactory
.newGeneric();