e2f8fed54bb592c5e7d2487c9bb2477d7cc2fc65
[cdmlib-apps.git] / app-import / src / main / java / eu / etaxonomy / cdm / app / proibiosphere / TaxonXImportLauncher.java
1 /**
2 * Copyright (C) 2007 EDIT
3 * European Distributed Institute of Taxonomy
4 * http://www.e-taxonomy.eu
5 *
6 * The contents of this file are subject to the Mozilla Public License Version 1.1
7 * See LICENSE.TXT at the top of this package for the full license terms.
8 */
9
10 package eu.etaxonomy.cdm.app.proibiosphere;
11 import java.awt.Dimension;
12 import java.io.BufferedReader;
13 import java.io.IOException;
14 import java.io.InputStreamReader;
15 import java.net.MalformedURLException;
16 import java.net.URI;
17 import java.net.URISyntaxException;
18 import java.net.URL;
19 import java.util.ArrayList;
20 import java.util.Collections;
21 import java.util.HashMap;
22 import java.util.HashSet;
23 import java.util.List;
24 import java.util.Map;
25 import java.util.Scanner;
26
27 import javax.swing.JOptionPane;
28 import javax.swing.JScrollPane;
29 import javax.swing.JTextArea;
30
31 import org.apache.log4j.Logger;
32
33 import eu.etaxonomy.cdm.app.common.CdmDestinations;
34 import eu.etaxonomy.cdm.database.DbSchemaValidation;
35 import eu.etaxonomy.cdm.database.ICdmDataSource;
36 import eu.etaxonomy.cdm.io.common.CdmDefaultImport;
37 import eu.etaxonomy.cdm.io.common.IImportConfigurator.CHECK;
38 import eu.etaxonomy.cdm.io.taxonx2013.TaxonXImportConfigurator;
39 import eu.etaxonomy.cdm.model.name.NomenclaturalCode;
40 import eu.etaxonomy.cdm.model.reference.Reference;
41 import eu.etaxonomy.cdm.model.reference.ReferenceFactory;
42
43
44
45 public class TaxonXImportLauncher {
46 private static final Logger log = Logger.getLogger(TaxonXImportLauncher.class);
47 // private static final Logger log = Logger.getLogger(CdmEntityDaoBase.class);
48
49 //database validation status (create, update, validate ...)
50 static DbSchemaValidation hbm2dll = DbSchemaValidation.CREATE;
51 // static final ICdmDataSource cdmDestination = CdmDestinations.cdm_test_local_mysql();
52 static final ICdmDataSource cdmDestination = CdmDestinations.localH2();
53 // static final ICdmDataSource cdmDestination = CdmDestinations.cdm_test_local_mysql_test();
54
55 static final CHECK check = CHECK.IMPORT_WITHOUT_CHECK;
56
57 private enum FilterType{MODS, TAXON};
58
59
60 static String plaziUrl = "http://plazi.cs.umb.edu/GgServer/search?taxonomicName.isNomenclature=true&taxonomicName.exactMatch=true&indexName=0&subIndexName=taxonomicName&subIndexName=MODS&minSubResultSize=1&searchMode=index&resultFormat=xml&xsltUrl=http%3A%2F%2Fplazi.cs.umb.edu%2FGgServer%2Fresources%2FsrsWebPortalData%2FCdmSyncTreatmentList.xslt&taxonomicName.taxonomicName=";
61 static String plaziUrlDoc = "http://plazi.cs.umb.edu/GgServer/search?taxonomicName.isNomenclature=true&taxonomicName.exactMatch=true&indexName=0&subIndexName=taxonomicName&subIndexName=MODS&minSubResultSize=1&searchMode=index&resultFormat=xml&xsltUrl=http%3A%2F%2Fplazi.cs.umb.edu%2FGgServer%2Fresources%2FsrsWebPortalData%2FCdmSyncTreatmentList.xslt&MODS.ModsDocID=";
62
63
64
65 public static void main(String[] args) {
66 String[] spiderModsList = new String[] {"zt03768p138","zt03750p196","zt03666p193","zt03664p068","zt03646p592","zt03507p056","zt03415p057","zt03383p038","zt03305p052","zt03228p068","zt03131p034","zt02963p068","zt02883p068","zt02814p018","zt02739p050","zt02730p043","zt02637p054","zt02593p127","zt02551p068","zt02534p036","zt02526p053","zt02427p035","zt02361p012","zt02267p068","zt02223p047","zt01826p058","zt01775p024","zt01744p040","zt01529p060","zt01004p028","zt00904","zt00872","zt00619","zt00109","DippenaarSchoeman1989Penestominae","Simon1902Cribellates","Simon1903Penestominae","Lehtinen1967CribellatePenestominae"};
67
68 String[] taxonList = new String[] {"Comaroma"}; //{"Eupolybothrus","Polybothrus"}, Chenopodium, Lactarius, Campylopus, Nephrolepis, Comaroma (spiders)
69 // /*ants Anochetus*/ String[] modsList = new String[] {"3924" /*, "3743", "4375", "6757", "6752", "3481", "21401_fisher_smith_plos_2008", "2592", "4096", "6877", "6192", "8071" */};
70 // String[] modsList = new String[] {"21367", "21365", "8171", "6877", "21820", "3641", "6757"};
71 // /*auch ants*/ debut="3743", "3628", "4022", "3994", "3603", "8070", "4001", "4071", "3948", "3481"};
72 // suite: , };//,"3540555099"};
73 // modsList = new String[] {"Zapparoli-1986-Eupolybothrus-fasciatus"};
74 taxonList = spiderModsList;
75
76 FilterType filterType = FilterType.MODS;
77
78 NomenclaturalCode tnomenclature = NomenclaturalCode.ICZN;
79
80 String defaultClassification="Spiders";
81 boolean alwaysUseDefaultClassification = true;
82
83
84
85
86 Map<String,List<URI>>documentMap = new HashMap<String, List<URI>>();
87
88 /*HOW TO HANDLE SECUNDUM REFERENCE*/
89 boolean reuseSecundum = askIfReuseSecundum();
90 Reference<?> secundum = null;
91 if (!reuseSecundum) {
92 secundum = askForSecundum();
93 }
94
95 loadTreatmentIfPresent(filterType,taxonList, documentMap);
96 // loadTreatmentIfPresent(FilterType.MODS,modsList, documents,documentMap);
97
98 TaxonXImportConfigurator taxonxImportConfigurator =null;
99 CdmDefaultImport<TaxonXImportConfigurator> taxonImport = new CdmDefaultImport<TaxonXImportConfigurator>();
100
101 ICdmDataSource destination = cdmDestination;
102 taxonxImportConfigurator = prepareTaxonXImport(destination,reuseSecundum, secundum, tnomenclature, alwaysUseDefaultClassification);
103
104 taxonxImportConfigurator.setImportClassificationName(defaultClassification);
105 log.info("Start import from TaxonX Data");
106
107 taxonxImportConfigurator.setLastImport(false);
108
109 int j=0;
110 for (String document : documentMap.keySet()){
111 j++;
112 if (doImportDocument(document, documentMap.get(document).size())){
113 int i=0;
114 for (URI source: documentMap.get(document)){
115 System.out.println("START "+document+" "+i+" ("+(documentMap.get(document)).size()+"): "+source.getPath());
116 i++;
117 if (j==documentMap.keySet().size() && i==documentMap.get(document).size()) {
118 taxonxImportConfigurator.setLastImport(true);
119 }
120 prepareReferenceAndSource(taxonxImportConfigurator,source);
121 // taxonxImportConfigurator.setTaxonReference(null);
122 taxonImport.invoke(taxonxImportConfigurator);
123 log.info("End import from SpecimenData ("+ source.toString() + ")...");
124
125 // //deduplicate
126 // ICdmApplicationConfiguration app = taxonImport.getCdmAppController();
127 // int count = app.getAgentService().deduplicate(Person.class, null, null);
128 // logger.warn("Deduplicated " + count + " persons.");
129 // count = app.getReferenceService().deduplicate(Reference.class, null, null);
130 // logger.warn("Deduplicated " + count + " references.");
131 }
132 }
133 }
134 }
135
136
137 private static String askQuestion(String question){
138 Scanner scan = new Scanner(System.in);
139 System.out.println(question);
140 String index = scan.nextLine();
141 return index;
142 }
143
144 /**
145 * @param taxonxImportConfigurator
146 * @param source
147 *
148 */
149 private static void prepareReferenceAndSource(TaxonXImportConfigurator taxonxImportConfigurator, URI source) {
150 Reference<?> reference = ReferenceFactory.newGeneric();
151 // String tref = askQuestion("Import source? (ie Plazi document ID)");
152 String tref="PLAZI - "+source.getPath().split("/")[source.getPath().split("/").length-1];
153 reference.setTitleCache(tref,true);
154 reference.setTitle(tref);
155
156 taxonxImportConfigurator.setSourceReference(reference);
157 TaxonXImportConfigurator.setSourceRef(reference);
158
159 Reference<?> referenceUrl = ReferenceFactory.newWebPage();
160 referenceUrl.setTitleCache(source.toString(), true);
161 referenceUrl.setTitle(source.toString());
162 reference.setUri(source);
163 referenceUrl.generateTitle();
164
165 taxonxImportConfigurator.addOriginalSource(referenceUrl);
166 taxonxImportConfigurator.setSource(source);
167 }
168
169 /**
170 * @param destination
171 * @param reuseSecundum
172 * @param secundum
173 * @param tnomenclature
174 * @param alwaysUseDefaultClassification
175 * @return
176 */
177 private static TaxonXImportConfigurator prepareTaxonXImport(ICdmDataSource destination, boolean reuseSecundum, Reference<?> secundum, NomenclaturalCode tnomenclature, boolean alwaysUseDefaultClassification) {
178 TaxonXImportConfigurator taxonxImportConfigurator = TaxonXImportConfigurator.NewInstance(destination);
179
180 //taxonxImportConfigurator.setClassificationName(taxonxImportConfigurator.getSourceReferenceTitle());
181 taxonxImportConfigurator.setCheck(check);
182 taxonxImportConfigurator.setDbSchemaValidation(hbm2dll);
183 taxonxImportConfigurator.setDoAutomaticParsing(true);
184
185 taxonxImportConfigurator.setInteractWithUser(true);
186 taxonxImportConfigurator.setNomenclaturalCode(tnomenclature);
187
188 taxonxImportConfigurator.setAlwaysUseDefaultClassification(alwaysUseDefaultClassification);
189
190 taxonxImportConfigurator.setKeepOriginalSecundum(reuseSecundum);
191 if (!reuseSecundum) {
192 taxonxImportConfigurator.setSecundum(secundum);
193 }
194
195 //taxonxImportConfigurator.setDoMatchTaxa(true);
196 // taxonxImportConfigurator.setReUseTaxon(true);
197 return taxonxImportConfigurator;
198 }
199
200 /**
201 * @param filterType
202 * @param modsList
203 * @param documents
204 * @param documentMap
205 * @return
206 */
207 private static Map<String, List<URI>> loadTreatmentIfPresent(FilterType filterType, String[] filterList, Map<String, List<URI>> documentMap) {
208
209 Map<String, List<String>> docs = new HashMap<String, List<String>>();
210 try {
211 List<String> docList;
212 String inputLine;
213 String urlstr="";
214
215 Map<String,List<String>> documents = fillDocumentMap(filterType, filterList, urlstr);
216
217 // checkTreatmentAvailable(documents, docs);
218 docs = documents;
219
220 } catch (Exception e1) {
221 e1.printStackTrace();
222 }
223
224 // sourcesStr.add("/home/pkelbert/Documents/Proibiosphere/ChenopodiumXML/1362148061170_Chenopodium_K_hn_U_1993_tx.xml");
225
226 //System.out.println(documents);
227 for (String docId : docs.keySet()){
228 List<String> treatments = new ArrayList<String>(new HashSet<String>(docs.get(docId)));
229
230 Map<Integer, List<String>> startPages = new HashMap<Integer, List<String>>();
231 for (String treatment:treatments) {
232 List<String>tmplist = startPages.get(Integer.valueOf(treatment.split("---")[0]));
233 if (tmplist == null) {
234 tmplist = new ArrayList<String>();
235 }
236 tmplist.add(treatment.split("---")[3]);
237 startPages.put(Integer.valueOf(treatment.split("---")[0]),tmplist);
238 }
239 List<Integer> pages = new ArrayList<Integer>();
240 pages.addAll(startPages.keySet());
241
242 Collections.sort(pages);
243 // log.info(pages);
244
245 log.info("Document "+docId+" should have "+treatments.size()+" treatments");
246 //don't test if all the treatments are really online, it should be working without problems now
247 // int cnt=0;
248 // if(treatments.size()<150){
249 //
250 // for (String source:treatments){
251 // DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
252 // DocumentBuilder builder;
253 // URL url;
254 //
255 // try {
256 // builder = factory.newDocumentBuilder();
257 // url = new URL(source.split("---")[3]);
258 // Object o = url.getContent();
259 // InputStream is = (InputStream) o;
260 // Document document = builder.parse(is);
261 // cnt++;
262 // }catch(Exception e){
263 // // e.printStackTrace();
264 // log.warn(e);
265 // }
266 // }
267 // log.info("Document "+docId+" has "+cnt+" treatments available");
268 // }
269 // if(treatments.size() != cnt)
270 // {
271 // File file = new File("/home/pkelbert/Bureau/urlTaxonXToDoLater.txt");
272 // FileWriter writer;
273 // try {
274 // writer = new FileWriter(file ,true);
275 // writer.write(docId+"\n");
276 // writer.flush();
277 // writer.close();
278 // } catch (IOException e1) {
279 // // TODO Auto-generated catch block
280 // e1.printStackTrace();
281 // }
282 //
283 // }
284 // else{
285 List<URI> uritmp = documentMap.get(docId);
286 if (uritmp == null) {
287 uritmp = new ArrayList<URI>();
288 }
289 for (int page:pages) {
290 for (String treatment: startPages.get(page)) {
291 try {
292 uritmp.add(new URL(treatment).toURI());
293 } catch (MalformedURLException e) {
294 // TODO Auto-generated catch block
295 e.printStackTrace();
296 } catch (URISyntaxException e) {
297 // TODO Auto-generated catch block
298 e.printStackTrace();
299 }
300 }
301 }
302 documentMap.put(docId, uritmp);
303 }
304
305
306
307
308 // }
309 ////// log.info("NB SOURCES : "+sourcesStr.size());
310 // List<URI> sourcesStr = new ArrayList<URI>();
311 // try {
312 //// documentMap = new HashMap<String, List<URI>>();
313 // sourcesStr.add(new URI("http://plazi.cs.umb.edu/GgServer/cdmSync/8F5B3EA099D371BC41CC5DDBFEDCFBED"));
314 // documentMap.put("singlesource", sourcesStr);
315 // } catch (URISyntaxException e) {
316 // // TODO Auto-generated catch block
317 // e.printStackTrace();
318 // }
319
320 return documentMap;
321
322 }
323
324 private static void checkTreatmentAvailable(Map<String, List<String>> documents, Map<String, List<String>> docs)
325 throws IOException, MalformedURLException {
326 List<String> docList;
327 String inputLine;
328 for (String docId:documents.keySet()){
329 URL url = new URL(plaziUrlDoc+docId);
330 BufferedReader in = new BufferedReader(new InputStreamReader(url.openStream()));
331 while ((inputLine = in.readLine()) != null) {
332 if (inputLine.startsWith("<treatment ")){
333 String taxon = inputLine.split("taxon=\"")[1].split("\"")[0];
334 String docID=inputLine.split("docId=\"")[1].split("\"")[0];
335 String link=inputLine.split("link=\"")[1].split("\"")[0];
336 String pageStart = inputLine.split("startPage=\"")[1].split("\"")[0];
337 String pageEnd = inputLine.split("endPage=\"")[1].split("\"")[0];
338 docList = documents.get(docID);
339 if (docList == null) {
340 docList = new ArrayList<String>();
341 }
342 docList.add(pageStart+"---" + pageEnd + "---" + taxon + "---"+link);
343 docs.put(docID,docList);
344 }
345 }
346 }
347 }
348
349 private static Map<String, List<String>> fillDocumentMap(FilterType filterType,
350 String[] filterList, String urlstr)
351 throws MalformedURLException, IOException {
352
353 Map<String, List<String>> documents = new HashMap<String, List<String>>();
354 List<String> docList;
355 String inputLine;
356 for(String filter : filterList){
357 // plaziUrl=plaziUrl+"Eupolybothrus";
358 if (filterType == FilterType.MODS) {
359 urlstr=plaziUrlDoc + filter;
360 }else if (filterType == FilterType.TAXON) {
361 urlstr=plaziUrl + filter;
362 }
363 log.info("URLstr: " + urlstr);
364
365 URL plaziURL = new URL(urlstr);
366 BufferedReader in = new BufferedReader(new InputStreamReader(plaziURL.openStream()));
367
368
369 //TODO lastUpdate field
370 // if(!plaziNotServer){
371 while ((inputLine = in.readLine()) != null) {
372 System.out.println(inputLine);
373 if (inputLine.startsWith("<treatment ")){
374 String taxon = inputLine.split("taxon=\"")[1].split("\"")[0];
375 String docID=inputLine.split("docId=\"")[1].split("\"")[0];
376 System.out.println("docID: "+docID);
377
378 String link=inputLine.split("link=\"")[1].split("\"")[0];
379 String pageStart = inputLine.split("startPage=\"")[1].split("\"")[0];
380 String pageEnd = inputLine.split("endPage=\"")[1].split("\"")[0];
381 docList = documents.get(docID);
382 if (docList == null) {
383 docList = new ArrayList<String>();
384 }
385 docList.add(pageStart+"---" + pageEnd + "---"+taxon+"---"+link);
386 documents.put(docID,docList);
387 }
388 }
389 }
390 System.out.println("documents created");
391
392 return documents;
393 }
394
395 /**
396 * @param document
397 * @return
398 */
399 private static boolean doImportDocument(String document, int nbtreatments) {
400
401 if (nbtreatments>400) {
402 return false;
403 }
404 if (document.equalsIgnoreCase("1314-2828-2")) { //this is a mix of several publications..
405 return false;
406 }
407 if (document.equalsIgnoreCase("21367")) { //600treatments for ants..
408 return false;
409 }
410 if (document.equalsIgnoreCase("1314-2828-1")) { //900treatments for eupoly..
411 return false;
412 }
413 return true;
414 /*
415 // List<String> docDone = Arrays.asList(new String[]{"3540555099", "0910-2878-5652", "5012-9059-4108",
416 // "3784-0748-2261","3-201-00728-5", "FloNuttDuWin1838", "FlNordica_chenop","2580-1363-7530",
417 // "1842460692","5161-7797-8064","FlCaboVerde_Chen","2819-9661-8339","2626-3794-9273"});//,
418 // // "8776-7797-8303"});
419 // if (docDone.contains(document)) {
420 // return false;
421 // }
422
423 JTextArea textArea = new JTextArea("Should this document be imported ("+nbtreatments+")? \n'"+document+"'");
424 JScrollPane scrollPane = new JScrollPane(textArea);
425 textArea.setLineWrap(true);
426 textArea.setWrapStyleWord(true);
427 scrollPane.setPreferredSize( new Dimension( 700, 70 ) );
428
429 // JFrame frame = new JFrame("I have a question");
430 // frame.setDefaultCloseOperation(JFrame.EXIT_ON_CLOSE);
431 int s = JOptionPane.showConfirmDialog(null, scrollPane);
432 if (s==0) {
433 return true;
434 } else {
435 return false;
436 }
437 */
438 }
439
440 /**
441 * @return
442 */
443 private static boolean askIfReuseSecundum() {
444 // logger.info("getFullReference for "+ name);
445 JTextArea textArea = new JTextArea("Reuse the secundum present in the current classification? " +
446 "\n Click Yes to reuse it, click No or Cancel to create a new one.\nA default secundum will be created if needed.");
447 JScrollPane scrollPane = new JScrollPane(textArea);
448 textArea.setLineWrap(true);
449 textArea.setWrapStyleWord(true);
450 scrollPane.setPreferredSize( new Dimension( 700, 70 ) );
451
452 // JFrame frame = new JFrame("I have a question");
453 // frame.setDefaultCloseOperation(JFrame.EXIT_ON_CLOSE);
454 int s = JOptionPane.showConfirmDialog(null, scrollPane);
455 if (s==0) {
456 return true;
457 } else {
458 return false;
459 }
460 }
461
462 /**
463 * @return
464 */
465 private static Reference<?> askForSecundum() {
466 // logger.info("getFullReference for "+ name);
467 JTextArea textArea = new JTextArea("Enter the secundum name");
468 JScrollPane scrollPane = new JScrollPane(textArea);
469 textArea.setLineWrap(true);
470 textArea.setWrapStyleWord(true);
471 scrollPane.setPreferredSize( new Dimension( 700, 100 ) );
472
473 // JFrame frame = new JFrame("I have a question");
474 // frame.setDefaultCloseOperation(JFrame.EXIT_ON_CLOSE);
475 String s = (String) JOptionPane.showInputDialog(
476 null,
477 scrollPane,
478 "",
479 JOptionPane.PLAIN_MESSAGE,
480 null,
481 null,
482 null);
483 Reference<?> ref = ReferenceFactory.newGeneric();
484 ref.setTitle(s);
485 return ref;
486 }
487
488
489 }