1
|
/**
|
2
|
* Copyright (C) 2007 EDIT
|
3
|
* European Distributed Institute of Taxonomy
|
4
|
* http://www.e-taxonomy.eu
|
5
|
*
|
6
|
* The contents of this file are subject to the Mozilla Public License Version 1.1
|
7
|
* See LICENSE.TXT at the top of this package for the full license terms.
|
8
|
*/
|
9
|
|
10
|
package eu.etaxonomy.cdm.app.proibiosphere;
|
11
|
import java.awt.Dimension;
|
12
|
import java.io.BufferedReader;
|
13
|
import java.io.File;
|
14
|
import java.io.FileWriter;
|
15
|
import java.io.IOException;
|
16
|
import java.io.InputStream;
|
17
|
import java.io.InputStreamReader;
|
18
|
import java.net.MalformedURLException;
|
19
|
import java.net.URI;
|
20
|
import java.net.URISyntaxException;
|
21
|
import java.net.URL;
|
22
|
import java.util.ArrayList;
|
23
|
import java.util.Collections;
|
24
|
import java.util.HashMap;
|
25
|
import java.util.HashSet;
|
26
|
import java.util.List;
|
27
|
import java.util.Map;
|
28
|
import java.util.Scanner;
|
29
|
|
30
|
import javax.swing.JOptionPane;
|
31
|
import javax.swing.JScrollPane;
|
32
|
import javax.swing.JTextArea;
|
33
|
import javax.xml.parsers.DocumentBuilder;
|
34
|
import javax.xml.parsers.DocumentBuilderFactory;
|
35
|
|
36
|
import org.apache.log4j.Logger;
|
37
|
import org.w3c.dom.Document;
|
38
|
|
39
|
import eu.etaxonomy.cdm.app.common.CdmDestinations;
|
40
|
import eu.etaxonomy.cdm.database.DbSchemaValidation;
|
41
|
import eu.etaxonomy.cdm.database.ICdmDataSource;
|
42
|
import eu.etaxonomy.cdm.io.common.CdmDefaultImport;
|
43
|
import eu.etaxonomy.cdm.io.common.IImportConfigurator.CHECK;
|
44
|
import eu.etaxonomy.cdm.io.taxonx2013.TaxonXImportConfigurator;
|
45
|
import eu.etaxonomy.cdm.model.name.NomenclaturalCode;
|
46
|
import eu.etaxonomy.cdm.model.reference.Reference;
|
47
|
import eu.etaxonomy.cdm.model.reference.ReferenceFactory;
|
48
|
|
49
|
|
50
|
|
51
|
public class TaxonXImportLauncher {
|
52
|
private static final Logger log = Logger.getLogger(TaxonXImportLauncher.class);
|
53
|
// private static final Logger log = Logger.getLogger(CdmEntityDaoBase.class);
|
54
|
|
55
|
//database validation status (create, update, validate ...)
|
56
|
static DbSchemaValidation hbm2dll = DbSchemaValidation.CREATE;
|
57
|
static final ICdmDataSource cdmDestination = CdmDestinations.mon_cdm();
|
58
|
|
59
|
static final CHECK check = CHECK.IMPORT_WITHOUT_CHECK;
|
60
|
|
61
|
|
62
|
static String plaziUrl = "http://plazi.cs.umb.edu/GgServer/search?taxonomicName.isNomenclature=true&taxonomicName.exactMatch=true&indexName=0&subIndexName=taxonomicName&subIndexName=MODS&minSubResultSize=1&searchMode=index&resultFormat=xml&xsltUrl=http%3A%2F%2Fplazi.cs.umb.edu%2FGgServer%2Fresources%2FsrsWebPortalData%2FCdmSyncTreatmentList.xslt&taxonomicName.taxonomicName=";
|
63
|
static String plaziUrlDoc = "http://plazi.cs.umb.edu/GgServer/search?taxonomicName.isNomenclature=true&taxonomicName.exactMatch=true&indexName=0&subIndexName=taxonomicName&subIndexName=MODS&minSubResultSize=1&searchMode=index&resultFormat=xml&xsltUrl=http%3A%2F%2Fplazi.cs.umb.edu%2FGgServer%2Fresources%2FsrsWebPortalData%2FCdmSyncTreatmentList.xslt&MODS.ModsDocID=";
|
64
|
|
65
|
|
66
|
private static String askQuestion(String question){
|
67
|
Scanner scan = new Scanner(System.in);
|
68
|
System.out.println(question);
|
69
|
String index = scan.nextLine();
|
70
|
return index;
|
71
|
}
|
72
|
|
73
|
public static void main(String[] args) {
|
74
|
String[] taxonList = new String[] {"Polybothrus","Eupolybothrus"};
|
75
|
// /*ants*/ String[] modsList = new String[] {"3924", "3743", "4375","6757","6752","3481","21401_fisher_smith_plos_2008","2592","4096","6877","6192","8071"};
|
76
|
// String[] modsList = new String[] {"FloNuttDuWin1838"};
|
77
|
// modsList = new String[] {"Zapparoli-1986-Eupolybothrus-fasciatus"};
|
78
|
String tnomenclature = "ICZN";
|
79
|
|
80
|
String defaultClassif="Eupolybothrus and Polybothrus";
|
81
|
|
82
|
Map<String,List<String>> documents = new HashMap<String,List<String>>();
|
83
|
HashMap<String,List<URI>>documentMap = new HashMap<String, List<URI>>();
|
84
|
|
85
|
/*HOW TO HANDLE SECUNDUM REFERENCE*/
|
86
|
boolean reuseSecundum = askIfReuseSecundum();
|
87
|
Reference<?> secundum = null;
|
88
|
if (!reuseSecundum) {
|
89
|
secundum = askForSecundum();
|
90
|
}
|
91
|
|
92
|
checkTreatmentPresence("taxon",taxonList, documents,documentMap);
|
93
|
// checkTreatmentPresence("modsid",modsList, documents,documentMap);
|
94
|
|
95
|
TaxonXImportConfigurator taxonxImportConfigurator =null;
|
96
|
CdmDefaultImport<TaxonXImportConfigurator> taxonImport = new CdmDefaultImport<TaxonXImportConfigurator>();
|
97
|
|
98
|
ICdmDataSource destination = cdmDestination;
|
99
|
taxonxImportConfigurator = prepareTaxonXImport(destination,reuseSecundum, secundum);
|
100
|
|
101
|
taxonxImportConfigurator.setImportClassificationName(defaultClassif);
|
102
|
log.info("Start import from TaxonX Data");
|
103
|
|
104
|
taxonxImportConfigurator.setLastImport(false);
|
105
|
|
106
|
int j=0;
|
107
|
for (String document:documentMap.keySet()){
|
108
|
j++;
|
109
|
if (doImportDocument(document, documentMap.get(document).size())){
|
110
|
int i=0;
|
111
|
for (URI source:documentMap.get(document)){
|
112
|
System.out.println("START "+i+" ("+(documentMap.get(document)).size()+"): "+source.getPath());
|
113
|
i++;
|
114
|
if (j==documentMap.keySet().size() && i==documentMap.get(document).size()) {
|
115
|
taxonxImportConfigurator.setLastImport(true);
|
116
|
}
|
117
|
prepareReferenceAndSource(taxonxImportConfigurator,source);
|
118
|
prepareNomenclature(taxonxImportConfigurator,tnomenclature);
|
119
|
// taxonxImportConfigurator.setTaxonReference(null);
|
120
|
taxonImport.invoke(taxonxImportConfigurator);
|
121
|
log.info("End import from SpecimenData ("+ source.toString() + ")...");
|
122
|
|
123
|
// //deduplicate
|
124
|
// ICdmApplicationConfiguration app = taxonImport.getCdmAppController();
|
125
|
// int count = app.getAgentService().deduplicate(Person.class, null, null);
|
126
|
// logger.warn("Deduplicated " + count + " persons.");
|
127
|
// count = app.getReferenceService().deduplicate(Reference.class, null, null);
|
128
|
// logger.warn("Deduplicated " + count + " references.");
|
129
|
}
|
130
|
}
|
131
|
}
|
132
|
}
|
133
|
|
134
|
|
135
|
|
136
|
/**
|
137
|
* @param taxonxImportConfigurator
|
138
|
* @param tnomenclature
|
139
|
*/
|
140
|
private static void prepareNomenclature(TaxonXImportConfigurator taxonxImportConfigurator, String tnomenclature) {
|
141
|
// String tnomenclature = askQuestion("ICBN or ICZN ?");
|
142
|
taxonxImportConfigurator.setNomenclaturalCode(NomenclaturalCode.ICNAFP);
|
143
|
if (tnomenclature.equalsIgnoreCase("ICBN")) {
|
144
|
taxonxImportConfigurator.setNomenclaturalCode(NomenclaturalCode.ICNAFP);
|
145
|
// taxonxImportConfigurator.setClassificationName("Chenopodiaceae");
|
146
|
}
|
147
|
if(tnomenclature.equalsIgnoreCase("ICZN")){
|
148
|
taxonxImportConfigurator.setNomenclaturalCode(NomenclaturalCode.ICZN);
|
149
|
// taxonxImportConfigurator.setClassificationName("Ants");
|
150
|
}
|
151
|
if(tnomenclature.equalsIgnoreCase("ICNB")){
|
152
|
taxonxImportConfigurator.setNomenclaturalCode(NomenclaturalCode.ICNB);
|
153
|
// taxonxImportConfigurator.setClassificationName("Bacteria");
|
154
|
}
|
155
|
|
156
|
}
|
157
|
|
158
|
/**
|
159
|
* @param taxonxImportConfigurator
|
160
|
* @param source
|
161
|
*
|
162
|
*/
|
163
|
private static void prepareReferenceAndSource(TaxonXImportConfigurator taxonxImportConfigurator, URI source) {
|
164
|
Reference<?> reference = ReferenceFactory.newGeneric();
|
165
|
// String tref = askQuestion("Import source? (ie Plazi document ID)");
|
166
|
String tref="PLAZI - "+source.getPath().split("/")[source.getPath().split("/").length-1];
|
167
|
reference.setTitleCache(tref,true);
|
168
|
reference.setTitle(tref);
|
169
|
reference.generateTitle();
|
170
|
|
171
|
taxonxImportConfigurator.setSourceReference(reference);
|
172
|
TaxonXImportConfigurator.setSourceRef(reference);
|
173
|
|
174
|
Reference<?> referenceUrl = ReferenceFactory.newWebPage();
|
175
|
referenceUrl.setTitleCache(source.toString(), true);
|
176
|
referenceUrl.setTitle(source.toString());
|
177
|
reference.setUri(source);
|
178
|
referenceUrl.generateTitle();
|
179
|
|
180
|
taxonxImportConfigurator.addOriginalSource(referenceUrl);
|
181
|
taxonxImportConfigurator.setSource(source);
|
182
|
}
|
183
|
|
184
|
/**
|
185
|
* @param destination
|
186
|
* @param reuseSecundum
|
187
|
* @param secundum
|
188
|
* @return
|
189
|
*/
|
190
|
private static TaxonXImportConfigurator prepareTaxonXImport(ICdmDataSource destination, boolean reuseSecundum, Reference<?> secundum) {
|
191
|
TaxonXImportConfigurator taxonxImportConfigurator = TaxonXImportConfigurator.NewInstance(destination);
|
192
|
|
193
|
// taxonxImportConfigurator.setClassificationName(taxonxImportConfigurator.getSourceReferenceTitle());
|
194
|
taxonxImportConfigurator.setCheck(check);
|
195
|
taxonxImportConfigurator.setDbSchemaValidation(hbm2dll);
|
196
|
taxonxImportConfigurator.setDoAutomaticParsing(true);
|
197
|
|
198
|
taxonxImportConfigurator.setInteractWithUser(true);
|
199
|
|
200
|
|
201
|
taxonxImportConfigurator.setKeepOriginalSecundum(reuseSecundum);
|
202
|
if (!reuseSecundum) {
|
203
|
taxonxImportConfigurator.setSecundum(secundum);
|
204
|
}
|
205
|
|
206
|
// taxonxImportConfigurator.setDoMatchTaxa(true);
|
207
|
// taxonxImportConfigurator.setReUseTaxon(true);
|
208
|
return taxonxImportConfigurator;
|
209
|
}
|
210
|
|
211
|
/**
|
212
|
* @param importFilter
|
213
|
* @param modsList
|
214
|
* @param documents
|
215
|
* @param documentMap
|
216
|
* @return
|
217
|
*/
|
218
|
private static HashMap<String, List<URI>> checkTreatmentPresence(String importFilter, String[] modsList, Map<String, List<String>> documents, HashMap<String, List<URI>> documentMap) {
|
219
|
URL plaziURL;
|
220
|
// System.out.println(plaziUrl);
|
221
|
|
222
|
Map<String, List<String>> docs = new HashMap<String, List<String>>();
|
223
|
try {
|
224
|
BufferedReader in=null;
|
225
|
List<String> docList;
|
226
|
String inputLine;
|
227
|
String docID;
|
228
|
String pageStart;
|
229
|
String pageEnd;
|
230
|
String taxon;
|
231
|
String link;
|
232
|
String urlstr="";
|
233
|
|
234
|
for(String modsID : modsList){
|
235
|
// plaziUrl=plaziUrl+"Eupolybothrus";
|
236
|
if (importFilter.equalsIgnoreCase("modsid")) {
|
237
|
urlstr=plaziUrlDoc+modsID;
|
238
|
}
|
239
|
if (importFilter.equalsIgnoreCase("taxon")) {
|
240
|
urlstr=plaziUrl+modsID;
|
241
|
}
|
242
|
// System.out.println(url);
|
243
|
|
244
|
plaziURL = new URL(urlstr);
|
245
|
in = new BufferedReader(new InputStreamReader(plaziURL.openStream()));
|
246
|
|
247
|
|
248
|
//TODO lastUpdate field
|
249
|
// if(!plaziNotServer){
|
250
|
while ((inputLine = in.readLine()) != null) {
|
251
|
System.out.println(inputLine);
|
252
|
if (inputLine.startsWith("<treatment ")){
|
253
|
taxon = inputLine.split("taxon=\"")[1].split("\"")[0];
|
254
|
docID=inputLine.split("docId=\"")[1].split("\"")[0];
|
255
|
System.out.println("docID: "+docID);
|
256
|
link=inputLine.split("link=\"")[1].split("\"")[0];
|
257
|
pageStart = inputLine.split("startPage=\"")[1].split("\"")[0];
|
258
|
pageEnd = inputLine.split("endPage=\"")[1].split("\"")[0];
|
259
|
docList = documents.get(docID);
|
260
|
if (docList == null) {
|
261
|
docList = new ArrayList<String>();
|
262
|
}
|
263
|
docList.add(pageStart+"---"+pageEnd+"---"+taxon+"---"+link);
|
264
|
documents.put(docID,docList);
|
265
|
}
|
266
|
}
|
267
|
}
|
268
|
System.out.println("hop");
|
269
|
|
270
|
|
271
|
|
272
|
for (String docId:documents.keySet()){
|
273
|
in = new BufferedReader(new InputStreamReader(new URL(plaziUrlDoc+docId).openStream()));
|
274
|
while ((inputLine = in.readLine()) != null) {
|
275
|
if (inputLine.startsWith("<treatment ")){
|
276
|
taxon = inputLine.split("taxon=\"")[1].split("\"")[0];
|
277
|
docID=inputLine.split("docId=\"")[1].split("\"")[0];
|
278
|
link=inputLine.split("link=\"")[1].split("\"")[0];
|
279
|
pageStart = inputLine.split("startPage=\"")[1].split("\"")[0];
|
280
|
pageEnd = inputLine.split("endPage=\"")[1].split("\"")[0];
|
281
|
docList = documents.get(docID);
|
282
|
if (docList == null) {
|
283
|
docList = new ArrayList<String>();
|
284
|
}
|
285
|
docList.add(pageStart+"---"+pageEnd+"---"+taxon+"---"+link);
|
286
|
docs.put(docID,docList);
|
287
|
}
|
288
|
}
|
289
|
}
|
290
|
// if(plaziNotServer) {
|
291
|
// sourcesStr.add(plaziUrl);
|
292
|
// }
|
293
|
// in.close();
|
294
|
} catch (MalformedURLException e1) {
|
295
|
// TODO Auto-generated catch block
|
296
|
e1.printStackTrace();
|
297
|
} catch (IOException e) {
|
298
|
// TODO Auto-generated catch block
|
299
|
e.printStackTrace();
|
300
|
}
|
301
|
|
302
|
// System.exit(0);
|
303
|
|
304
|
// sourcesStr.add("/home/pkelbert/Documents/Proibiosphere/ChenopodiumXML/1362148061170_Chenopodium_K_hn_U_1993_tx.xml");
|
305
|
|
306
|
//System.out.println(documents);
|
307
|
for (String docId : docs.keySet()){
|
308
|
List<String> treatments = new ArrayList<String>(new HashSet<String>(docs.get(docId)));
|
309
|
|
310
|
Map<Integer, List<String>> startPages = new HashMap<Integer, List<String>>();
|
311
|
for (String treatment:treatments) {
|
312
|
List<String>tmplist = startPages.get(Integer.valueOf(treatment.split("---")[0]));
|
313
|
if (tmplist == null) {
|
314
|
tmplist = new ArrayList<String>();
|
315
|
}
|
316
|
tmplist.add(treatment.split("---")[3]);
|
317
|
startPages.put(Integer.valueOf(treatment.split("---")[0]),tmplist);
|
318
|
}
|
319
|
List<Integer> pages = new ArrayList<Integer>();
|
320
|
pages.addAll(startPages.keySet());
|
321
|
|
322
|
Collections.sort(pages);
|
323
|
// log.info(pages);
|
324
|
|
325
|
log.info("Document "+docId+" should have "+treatments.size()+" treatments");
|
326
|
int cnt=0;
|
327
|
if(treatments.size()<150){
|
328
|
|
329
|
for (String source:treatments){
|
330
|
DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
|
331
|
DocumentBuilder builder;
|
332
|
URL url;
|
333
|
|
334
|
try {
|
335
|
builder = factory.newDocumentBuilder();
|
336
|
url = new URL(source.split("---")[3]);
|
337
|
Object o = url.getContent();
|
338
|
InputStream is = (InputStream) o;
|
339
|
Document document = builder.parse(is);
|
340
|
cnt++;
|
341
|
}catch(Exception e){
|
342
|
// e.printStackTrace();
|
343
|
log.warn(e);
|
344
|
}
|
345
|
}
|
346
|
log.info("Document "+docId+" has "+cnt+" treatments available");
|
347
|
}
|
348
|
if(treatments.size() != cnt)
|
349
|
{
|
350
|
File file = new File("/home/pkelbert/Bureau/urlTaxonXToDoLater.txt");
|
351
|
FileWriter writer;
|
352
|
try {
|
353
|
writer = new FileWriter(file ,true);
|
354
|
writer.write(docId+"\n");
|
355
|
writer.flush();
|
356
|
writer.close();
|
357
|
} catch (IOException e1) {
|
358
|
// TODO Auto-generated catch block
|
359
|
e1.printStackTrace();
|
360
|
}
|
361
|
|
362
|
}
|
363
|
else{
|
364
|
List<URI> uritmp = documentMap.get(docId);
|
365
|
if (uritmp == null) {
|
366
|
uritmp = new ArrayList<URI>();
|
367
|
}
|
368
|
for (int page:pages) {
|
369
|
for (String treatment: startPages.get(page)) {
|
370
|
try {
|
371
|
uritmp.add(new URL(treatment).toURI());
|
372
|
} catch (MalformedURLException e) {
|
373
|
// TODO Auto-generated catch block
|
374
|
e.printStackTrace();
|
375
|
} catch (URISyntaxException e) {
|
376
|
// TODO Auto-generated catch block
|
377
|
e.printStackTrace();
|
378
|
}
|
379
|
}
|
380
|
}
|
381
|
documentMap.put(docId, uritmp);
|
382
|
}
|
383
|
|
384
|
|
385
|
|
386
|
|
387
|
}
|
388
|
////// log.info("NB SOURCES : "+sourcesStr.size());
|
389
|
// List<URI> sourcesStr = new ArrayList<URI>();
|
390
|
// try {
|
391
|
//// documentMap = new HashMap<String, List<URI>>();
|
392
|
// sourcesStr.add(new URI("http://plazi.cs.umb.edu/GgServer/cdmSync/8F5B3EA099D371BC41CC5DDBFEDCFBED"));
|
393
|
// documentMap.put("singlesource", sourcesStr);
|
394
|
// } catch (URISyntaxException e) {
|
395
|
// // TODO Auto-generated catch block
|
396
|
// e.printStackTrace();
|
397
|
// }
|
398
|
|
399
|
return documentMap;
|
400
|
|
401
|
}
|
402
|
|
403
|
/**
|
404
|
* @param document
|
405
|
* @return
|
406
|
*/
|
407
|
private static boolean doImportDocument(String document, int nbtreatments) {
|
408
|
return true;
|
409
|
/*
|
410
|
// List<String> docDone = Arrays.asList(new String[]{"3540555099", "0910-2878-5652", "5012-9059-4108",
|
411
|
// "3784-0748-2261","3-201-00728-5", "FloNuttDuWin1838", "FlNordica_chenop","2580-1363-7530",
|
412
|
// "1842460692","5161-7797-8064","FlCaboVerde_Chen","2819-9661-8339","2626-3794-9273"});//,
|
413
|
// // "8776-7797-8303"});
|
414
|
// if (docDone.contains(document)) {
|
415
|
// return false;
|
416
|
// }
|
417
|
|
418
|
JTextArea textArea = new JTextArea("Should this document be imported ("+nbtreatments+")? \n'"+document+"'");
|
419
|
JScrollPane scrollPane = new JScrollPane(textArea);
|
420
|
textArea.setLineWrap(true);
|
421
|
textArea.setWrapStyleWord(true);
|
422
|
scrollPane.setPreferredSize( new Dimension( 700, 70 ) );
|
423
|
|
424
|
// JFrame frame = new JFrame("I have a question");
|
425
|
// frame.setDefaultCloseOperation(JFrame.EXIT_ON_CLOSE);
|
426
|
int s = JOptionPane.showConfirmDialog(null, scrollPane);
|
427
|
if (s==0) {
|
428
|
return true;
|
429
|
} else {
|
430
|
return false;
|
431
|
}
|
432
|
*/
|
433
|
}
|
434
|
|
435
|
/**
|
436
|
* @return
|
437
|
*/
|
438
|
private static boolean askIfReuseSecundum() {
|
439
|
// logger.info("getFullReference for "+ name);
|
440
|
JTextArea textArea = new JTextArea("Reuse the secundum present in the current classification? " +
|
441
|
"\n Click Yes to reuse it, click No or Cancel to create a new one.\nA default secundum will be created if needed.");
|
442
|
JScrollPane scrollPane = new JScrollPane(textArea);
|
443
|
textArea.setLineWrap(true);
|
444
|
textArea.setWrapStyleWord(true);
|
445
|
scrollPane.setPreferredSize( new Dimension( 700, 70 ) );
|
446
|
|
447
|
// JFrame frame = new JFrame("I have a question");
|
448
|
// frame.setDefaultCloseOperation(JFrame.EXIT_ON_CLOSE);
|
449
|
int s = JOptionPane.showConfirmDialog(null, scrollPane);
|
450
|
if (s==0) {
|
451
|
return true;
|
452
|
} else {
|
453
|
return false;
|
454
|
}
|
455
|
}
|
456
|
|
457
|
/**
|
458
|
* @return
|
459
|
*/
|
460
|
private static Reference<?> askForSecundum() {
|
461
|
// logger.info("getFullReference for "+ name);
|
462
|
JTextArea textArea = new JTextArea("Enter the secundum name");
|
463
|
JScrollPane scrollPane = new JScrollPane(textArea);
|
464
|
textArea.setLineWrap(true);
|
465
|
textArea.setWrapStyleWord(true);
|
466
|
scrollPane.setPreferredSize( new Dimension( 700, 100 ) );
|
467
|
|
468
|
// JFrame frame = new JFrame("I have a question");
|
469
|
// frame.setDefaultCloseOperation(JFrame.EXIT_ON_CLOSE);
|
470
|
String s = (String) JOptionPane.showInputDialog(
|
471
|
null,
|
472
|
scrollPane,
|
473
|
"",
|
474
|
JOptionPane.PLAIN_MESSAGE,
|
475
|
null,
|
476
|
null,
|
477
|
null);
|
478
|
Reference<?> ref = ReferenceFactory.newGeneric();
|
479
|
ref.setTitle(s);
|
480
|
return ref;
|
481
|
}
|
482
|
|
483
|
|
484
|
}
|