1
|
/**
|
2
|
* Copyright (C) 2007 EDIT
|
3
|
* European Distributed Institute of Taxonomy
|
4
|
* http://www.e-taxonomy.eu
|
5
|
*
|
6
|
* The contents of this file are subject to the Mozilla Public License Version 1.1
|
7
|
* See LICENSE.TXT at the top of this package for the full license terms.
|
8
|
*/
|
9
|
|
10
|
package eu.etaxonomy.cdm.app.proibiosphere;
|
11
|
import java.awt.Dimension;
|
12
|
import java.io.BufferedReader;
|
13
|
import java.io.IOException;
|
14
|
import java.io.InputStreamReader;
|
15
|
import java.net.MalformedURLException;
|
16
|
import java.net.URI;
|
17
|
import java.net.URISyntaxException;
|
18
|
import java.net.URL;
|
19
|
import java.util.ArrayList;
|
20
|
import java.util.Collections;
|
21
|
import java.util.HashMap;
|
22
|
import java.util.HashSet;
|
23
|
import java.util.List;
|
24
|
import java.util.Map;
|
25
|
import java.util.Scanner;
|
26
|
|
27
|
import javax.swing.JOptionPane;
|
28
|
import javax.swing.JScrollPane;
|
29
|
import javax.swing.JTextArea;
|
30
|
|
31
|
import org.apache.log4j.Logger;
|
32
|
|
33
|
import eu.etaxonomy.cdm.app.common.CdmDestinations;
|
34
|
import eu.etaxonomy.cdm.database.DbSchemaValidation;
|
35
|
import eu.etaxonomy.cdm.database.ICdmDataSource;
|
36
|
import eu.etaxonomy.cdm.io.common.CdmDefaultImport;
|
37
|
import eu.etaxonomy.cdm.io.common.IImportConfigurator.CHECK;
|
38
|
import eu.etaxonomy.cdm.io.taxonx2013.TaxonXImportConfigurator;
|
39
|
import eu.etaxonomy.cdm.model.name.NomenclaturalCode;
|
40
|
import eu.etaxonomy.cdm.model.reference.Reference;
|
41
|
import eu.etaxonomy.cdm.model.reference.ReferenceFactory;
|
42
|
|
43
|
|
44
|
|
45
|
public class TaxonXImportLauncher {
|
46
|
private static final Logger log = Logger.getLogger(TaxonXImportLauncher.class);
|
47
|
// private static final Logger log = Logger.getLogger(CdmEntityDaoBase.class);
|
48
|
|
49
|
//database validation status (create, update, validate ...)
|
50
|
static DbSchemaValidation hbm2dll = DbSchemaValidation.CREATE;
|
51
|
// static final ICdmDataSource cdmDestination = CdmDestinations.cdm_test_local_mysql();
|
52
|
// static final ICdmDataSource cdmDestination = CdmDestinations.localH2();
|
53
|
static final ICdmDataSource cdmDestination = CdmDestinations.cdm_test_local_mysql_test();
|
54
|
// static final ICdmDataSource cdmDestination = CdmDestinations.cdm_production_piB("piB_nephrolepis");
|
55
|
// static final ICdmDataSource cdmDestination = CdmDestinations.cdm_local_piB("guianas");
|
56
|
|
57
|
static final CHECK check = CHECK.IMPORT_WITHOUT_CHECK;
|
58
|
|
59
|
private enum FilterType{MODS, TAXON};
|
60
|
|
61
|
|
62
|
static String plaziUrlTaxName = "http://plazi.cs.umb.edu/GgServer/search?taxonomicName.isNomenclature=true&taxonomicName.exactMatch=true&indexName=0&subIndexName=taxonomicName&subIndexName=MODS&minSubResultSize=1&searchMode=index&resultFormat=xml&xsltUrl=http%3A%2F%2Fplazi.cs.umb.edu%2FGgServer%2Fresources%2FsrsWebPortalData%2FCdmSyncTreatmentList.xslt&taxonomicName.taxonomicName=";
|
63
|
static String plaziUrlModsDoc = "http://plazi.cs.umb.edu/GgServer/search?taxonomicName.isNomenclature=true&taxonomicName.exactMatch=true&indexName=0&subIndexName=taxonomicName&subIndexName=MODS&minSubResultSize=1&searchMode=index&resultFormat=xml&xsltUrl=http%3A%2F%2Fplazi.cs.umb.edu%2FGgServer%2Fresources%2FsrsWebPortalData%2FCdmSyncTreatmentList.xslt&MODS.ModsDocID=";
|
64
|
|
65
|
|
66
|
|
67
|
public static void main(String[] args) {
|
68
|
String[] spiderModsList = new String[] {"zt03768p138","zt03750p196","zt03666p193","zt03664p068","zt03646p592","zt03507p056","zt03415p057","zt03383p038","zt03305p052","zt03228p068","zt03131p034","zt02963p068","zt02883p068","zt02814p018","zt02739p050","zt02730p043","zt02637p054","zt02593p127","zt02551p068","zt02534p036","zt02526p053","zt02427p035","zt02361p012","zt02267p068","zt02223p047","zt01826p058","zt01775p024","zt01744p040","zt01529p060","zt01004p028","zt00904","zt00872","zt00619","zt00109","DippenaarSchoeman1989Penestominae","Simon1902Cribellates","Simon1903Penestominae","Lehtinen1967CribellatePenestominae"};
|
69
|
|
70
|
String[] taxonList = new String[] {"Campylopus"}; //{"Eupolybothrus","Polybothrus"}, Chenopodium, Lactarius, Campylopus, Nephrolepis, Comaroma (spiders)
|
71
|
// /*ants Anochetus*/ String[] modsList = new String[] {"3924" /*, "3743", "4375", "6757", "6752", "3481", "21401_fisher_smith_plos_2008", "2592", "4096", "6877", "6192", "8071" */};
|
72
|
// String[] modsList = new String[] {"21367", "21365", "8171", "6877", "21820", "3641", "6757"};
|
73
|
// /*auch ants*/ debut="3743", "3628", "4022", "3994", "3603", "8070", "4001", "4071", "3948", "3481"};
|
74
|
// suite: , };//,"3540555099"};
|
75
|
// modsList = new String[] {"Zapparoli-1986-Eupolybothrus-fasciatus"};
|
76
|
// taxonList = spiderModsList;
|
77
|
|
78
|
FilterType filterType = FilterType.TAXON;
|
79
|
|
80
|
NomenclaturalCode tnomenclature = NomenclaturalCode.ICNAFP;
|
81
|
|
82
|
String defaultClassification= null;// "Nephrolepis";
|
83
|
boolean alwaysUseDefaultClassification = false;
|
84
|
|
85
|
boolean useOldUnparsedSynonymExtraction = false;
|
86
|
|
87
|
|
88
|
|
89
|
|
90
|
Map<String,List<URI>>documentMap = new HashMap<String, List<URI>>();
|
91
|
|
92
|
/*HOW TO HANDLE SECUNDUM REFERENCE*/
|
93
|
boolean reuseSecundum = askIfReuseSecundum();
|
94
|
Reference secundum = null;
|
95
|
if (!reuseSecundum) {
|
96
|
secundum = askForSecundum();
|
97
|
}
|
98
|
|
99
|
loadTreatmentIfPresent(filterType,taxonList, documentMap);
|
100
|
// loadTreatmentIfPresent(FilterType.MODS, modsList, documents,documentMap);
|
101
|
|
102
|
CdmDefaultImport<TaxonXImportConfigurator> taxonImport = new CdmDefaultImport<TaxonXImportConfigurator>();
|
103
|
|
104
|
ICdmDataSource destination = cdmDestination;
|
105
|
TaxonXImportConfigurator config = prepareTaxonXImport(destination,reuseSecundum, secundum, tnomenclature, alwaysUseDefaultClassification);
|
106
|
config.setUseOldUnparsedSynonymExtraction(useOldUnparsedSynonymExtraction);
|
107
|
|
108
|
config.setImportClassificationName(defaultClassification);
|
109
|
log.info("Start import from TaxonX Data");
|
110
|
|
111
|
config.setLastImport(false);
|
112
|
|
113
|
int j=0;
|
114
|
for (String document : documentMap.keySet()){
|
115
|
j++;
|
116
|
if (doImportDocument(document, documentMap.get(document).size())){
|
117
|
int i=0;
|
118
|
for (URI source: documentMap.get(document)){
|
119
|
System.out.println("START "+document+" "+i+" ("+(documentMap.get(document)).size()+"): "+source.getPath());
|
120
|
i++;
|
121
|
if (j==documentMap.keySet().size() && i==documentMap.get(document).size()) {
|
122
|
config.setLastImport(true);
|
123
|
}
|
124
|
prepareReferenceAndSource(config,source);
|
125
|
// taxonxImportConfigurator.setTaxonReference(null);
|
126
|
taxonImport.invoke(config);
|
127
|
log.info("End import from SpecimenData ("+ source.toString() + ")...");
|
128
|
|
129
|
// //deduplicate
|
130
|
// ICdmRepository app = taxonImport.getCdmAppController();
|
131
|
// int count = app.getAgentService().deduplicate(Person.class, null, null);
|
132
|
// logger.warn("Deduplicated " + count + " persons.");
|
133
|
// count = app.getReferenceService().deduplicate(Reference.class, null, null);
|
134
|
// logger.warn("Deduplicated " + count + " references.");
|
135
|
}
|
136
|
}
|
137
|
}
|
138
|
}
|
139
|
|
140
|
|
141
|
private static String askQuestion(String question){
|
142
|
Scanner scan = new Scanner(System.in);
|
143
|
System.out.println(question);
|
144
|
String index = scan.nextLine();
|
145
|
return index;
|
146
|
}
|
147
|
|
148
|
/**
|
149
|
* @param taxonxImportConfigurator
|
150
|
* @param source
|
151
|
*
|
152
|
*/
|
153
|
private static void prepareReferenceAndSource(TaxonXImportConfigurator taxonxImportConfigurator, URI source) {
|
154
|
Reference reference = ReferenceFactory.newGeneric();
|
155
|
// String tref = askQuestion("Import source? (ie Plazi document ID)");
|
156
|
String tref="PLAZI - "+source.getPath().split("/")[source.getPath().split("/").length-1];
|
157
|
reference.setTitleCache(tref,true);
|
158
|
reference.setTitle(tref);
|
159
|
|
160
|
taxonxImportConfigurator.setSourceReference(reference);
|
161
|
TaxonXImportConfigurator.setSourceRef(reference);
|
162
|
|
163
|
Reference referenceUrl = ReferenceFactory.newWebPage();
|
164
|
referenceUrl.setTitleCache(source.toString(), true);
|
165
|
referenceUrl.setTitle(source.toString());
|
166
|
reference.setUri(source);
|
167
|
referenceUrl.generateTitle();
|
168
|
|
169
|
taxonxImportConfigurator.addOriginalSource(referenceUrl);
|
170
|
taxonxImportConfigurator.setSource(source);
|
171
|
}
|
172
|
|
173
|
/**
|
174
|
* @param destination
|
175
|
* @param reuseSecundum
|
176
|
* @param secundum
|
177
|
* @param tnomenclature
|
178
|
* @param alwaysUseDefaultClassification
|
179
|
* @return
|
180
|
*/
|
181
|
private static TaxonXImportConfigurator prepareTaxonXImport(ICdmDataSource destination, boolean reuseSecundum, Reference secundum, NomenclaturalCode tnomenclature, boolean alwaysUseDefaultClassification) {
|
182
|
TaxonXImportConfigurator taxonxImportConfigurator = TaxonXImportConfigurator.NewInstance(destination);
|
183
|
|
184
|
//taxonxImportConfigurator.setClassificationName(taxonxImportConfigurator.getSourceReferenceTitle());
|
185
|
taxonxImportConfigurator.setCheck(check);
|
186
|
taxonxImportConfigurator.setDbSchemaValidation(hbm2dll);
|
187
|
taxonxImportConfigurator.setDoAutomaticParsing(true);
|
188
|
|
189
|
taxonxImportConfigurator.setInteractWithUser(true);
|
190
|
taxonxImportConfigurator.setNomenclaturalCode(tnomenclature);
|
191
|
|
192
|
taxonxImportConfigurator.setAlwaysUseDefaultClassification(alwaysUseDefaultClassification);
|
193
|
|
194
|
taxonxImportConfigurator.setKeepOriginalSecundum(reuseSecundum);
|
195
|
if (!reuseSecundum) {
|
196
|
taxonxImportConfigurator.setSecundum(secundum);
|
197
|
}
|
198
|
|
199
|
//taxonxImportConfigurator.setDoMatchTaxa(true);
|
200
|
// taxonxImportConfigurator.setReUseTaxon(true);
|
201
|
return taxonxImportConfigurator;
|
202
|
}
|
203
|
|
204
|
/**
|
205
|
* @param filterType
|
206
|
* @param modsList
|
207
|
* @param documents
|
208
|
* @param documentMap
|
209
|
* @return
|
210
|
*/
|
211
|
private static Map<String, List<URI>> loadTreatmentIfPresent(FilterType filterType, String[] filterList, Map<String, List<URI>> documentMap) {
|
212
|
|
213
|
Map<String, List<String>> docs = new HashMap<String, List<String>>();
|
214
|
try {
|
215
|
List<String> docList;
|
216
|
String inputLine;
|
217
|
String urlstr="";
|
218
|
|
219
|
Map<String,List<String>> documents = fillDocumentMap(filterType, filterList, urlstr);
|
220
|
|
221
|
// checkTreatmentAvailable(documents, docs);
|
222
|
docs = documents;
|
223
|
|
224
|
} catch (Exception e1) {
|
225
|
e1.printStackTrace();
|
226
|
}
|
227
|
|
228
|
//System.out.println(documents);
|
229
|
for (String docId : docs.keySet()){
|
230
|
List<String> treatments = new ArrayList<String>(new HashSet<String>(docs.get(docId)));
|
231
|
|
232
|
Map<Integer, List<String>> startPages = new HashMap<Integer, List<String>>();
|
233
|
for (String treatment:treatments) {
|
234
|
List<String>tmplist = startPages.get(Integer.valueOf(treatment.split("---")[0]));
|
235
|
if (tmplist == null) {
|
236
|
tmplist = new ArrayList<String>();
|
237
|
}
|
238
|
tmplist.add(treatment.split("---")[3]);
|
239
|
startPages.put(Integer.valueOf(treatment.split("---")[0]),tmplist);
|
240
|
}
|
241
|
List<Integer> pages = new ArrayList<Integer>();
|
242
|
pages.addAll(startPages.keySet());
|
243
|
|
244
|
Collections.sort(pages);
|
245
|
// log.info(pages);
|
246
|
|
247
|
log.info("Document "+docId+" should have "+treatments.size()+" treatments");
|
248
|
List<URI> uritmp = documentMap.get(docId);
|
249
|
if (uritmp == null) {
|
250
|
uritmp = new ArrayList<URI>();
|
251
|
}
|
252
|
for (int page:pages) {
|
253
|
for (String treatment: startPages.get(page)) {
|
254
|
try {
|
255
|
uritmp.add(new URL(treatment).toURI());
|
256
|
} catch (MalformedURLException e) {
|
257
|
// TODO Auto-generated catch block
|
258
|
e.printStackTrace();
|
259
|
} catch (URISyntaxException e) {
|
260
|
// TODO Auto-generated catch block
|
261
|
e.printStackTrace();
|
262
|
}
|
263
|
}
|
264
|
}
|
265
|
documentMap.put(docId, uritmp);
|
266
|
}
|
267
|
|
268
|
|
269
|
|
270
|
|
271
|
|
272
|
|
273
|
return documentMap;
|
274
|
|
275
|
}
|
276
|
|
277
|
private static void checkTreatmentAvailable(Map<String, List<String>> documents, Map<String, List<String>> docs)
|
278
|
throws IOException, MalformedURLException {
|
279
|
List<String> docList;
|
280
|
String inputLine;
|
281
|
for (String docId:documents.keySet()){
|
282
|
URL url = new URL(plaziUrlModsDoc+docId);
|
283
|
BufferedReader in = new BufferedReader(new InputStreamReader(url.openStream()));
|
284
|
while ((inputLine = in.readLine()) != null) {
|
285
|
if (inputLine.startsWith("<treatment ")){
|
286
|
String taxon = inputLine.split("taxon=\"")[1].split("\"")[0];
|
287
|
String docID=inputLine.split("docId=\"")[1].split("\"")[0];
|
288
|
String link=inputLine.split("link=\"")[1].split("\"")[0];
|
289
|
String pageStart = inputLine.split("startPage=\"")[1].split("\"")[0];
|
290
|
String pageEnd = inputLine.split("endPage=\"")[1].split("\"")[0];
|
291
|
docList = documents.get(docID);
|
292
|
if (docList == null) {
|
293
|
docList = new ArrayList<String>();
|
294
|
}
|
295
|
docList.add(pageStart+"---" + pageEnd + "---" + taxon + "---"+link);
|
296
|
docs.put(docID,docList);
|
297
|
}
|
298
|
}
|
299
|
}
|
300
|
}
|
301
|
|
302
|
private static Map<String, List<String>> fillDocumentMap(FilterType filterType,
|
303
|
String[] filterList, String urlstr)
|
304
|
throws MalformedURLException, IOException {
|
305
|
|
306
|
Map<String, List<String>> documents = new HashMap<String, List<String>>();
|
307
|
List<String> docList;
|
308
|
String inputLine;
|
309
|
for(String filter : filterList){
|
310
|
// plaziUrl=plaziUrl+"Eupolybothrus";
|
311
|
if (filterType == FilterType.MODS) {
|
312
|
urlstr=plaziUrlModsDoc + filter;
|
313
|
}else if (filterType == FilterType.TAXON) {
|
314
|
urlstr=plaziUrlTaxName + filter;
|
315
|
}
|
316
|
log.info("URLstr: " + urlstr);
|
317
|
|
318
|
URL plaziURL = new URL(urlstr);
|
319
|
BufferedReader in = new BufferedReader(new InputStreamReader(plaziURL.openStream()));
|
320
|
|
321
|
|
322
|
//TODO lastUpdate field
|
323
|
// if(!plaziNotServer){
|
324
|
while ((inputLine = in.readLine()) != null) {
|
325
|
System.out.println(inputLine);
|
326
|
if (inputLine.startsWith("<treatment ")){
|
327
|
String taxon = inputLine.split("taxon=\"")[1].split("\"")[0];
|
328
|
String docID=inputLine.split("docId=\"")[1].split("\"")[0];
|
329
|
System.out.println("docID: "+docID);
|
330
|
|
331
|
String link=inputLine.split("link=\"")[1].split("\"")[0];
|
332
|
String pageStart = inputLine.split("startPage=\"")[1].split("\"")[0];
|
333
|
String pageEnd = inputLine.split("endPage=\"")[1].split("\"")[0];
|
334
|
docList = documents.get(docID);
|
335
|
if (docList == null) {
|
336
|
docList = new ArrayList<String>();
|
337
|
}
|
338
|
docList.add(pageStart+"---" + pageEnd + "---"+taxon+"---"+link);
|
339
|
documents.put(docID,docList);
|
340
|
}
|
341
|
}
|
342
|
}
|
343
|
System.out.println("documents created");
|
344
|
|
345
|
return documents;
|
346
|
}
|
347
|
|
348
|
/**
|
349
|
* @param document
|
350
|
* @return
|
351
|
*/
|
352
|
private static boolean doImportDocument(String document, int nbtreatments) {
|
353
|
|
354
|
if (nbtreatments>400) {
|
355
|
return false;
|
356
|
}
|
357
|
if (document.equalsIgnoreCase("1314-2828-2")) { //this is a mix of several publications..
|
358
|
return false;
|
359
|
}
|
360
|
if (document.equalsIgnoreCase("21367")) { //600treatments for ants..
|
361
|
return false;
|
362
|
}
|
363
|
if (document.equalsIgnoreCase("1314-2828-1")) { //900treatments for eupoly..
|
364
|
return false;
|
365
|
}
|
366
|
return true;
|
367
|
/*
|
368
|
// List<String> docDone = Arrays.asList(new String[]{"3540555099", "0910-2878-5652", "5012-9059-4108",
|
369
|
// "3784-0748-2261","3-201-00728-5", "FloNuttDuWin1838", "FlNordica_chenop","2580-1363-7530",
|
370
|
// "1842460692","5161-7797-8064","FlCaboVerde_Chen","2819-9661-8339","2626-3794-9273"});//,
|
371
|
// // "8776-7797-8303"});
|
372
|
// if (docDone.contains(document)) {
|
373
|
// return false;
|
374
|
// }
|
375
|
|
376
|
JTextArea textArea = new JTextArea("Should this document be imported ("+nbtreatments+")? \n'"+document+"'");
|
377
|
JScrollPane scrollPane = new JScrollPane(textArea);
|
378
|
textArea.setLineWrap(true);
|
379
|
textArea.setWrapStyleWord(true);
|
380
|
scrollPane.setPreferredSize( new Dimension( 700, 70 ) );
|
381
|
|
382
|
// JFrame frame = new JFrame("I have a question");
|
383
|
// frame.setDefaultCloseOperation(JFrame.EXIT_ON_CLOSE);
|
384
|
int s = JOptionPane.showConfirmDialog(null, scrollPane);
|
385
|
if (s==0) {
|
386
|
return true;
|
387
|
} else {
|
388
|
return false;
|
389
|
}
|
390
|
*/
|
391
|
}
|
392
|
|
393
|
/**
|
394
|
* @return
|
395
|
*/
|
396
|
private static boolean askIfReuseSecundum() {
|
397
|
// logger.info("getFullReference for "+ name);
|
398
|
JTextArea textArea = new JTextArea("Reuse the secundum present in the current classification? " +
|
399
|
"\n Click Yes to reuse it, click No or Cancel to create a new one.\nA default secundum will be created if needed.");
|
400
|
JScrollPane scrollPane = new JScrollPane(textArea);
|
401
|
textArea.setLineWrap(true);
|
402
|
textArea.setWrapStyleWord(true);
|
403
|
scrollPane.setPreferredSize( new Dimension( 700, 70 ) );
|
404
|
|
405
|
// JFrame frame = new JFrame("I have a question");
|
406
|
// frame.setDefaultCloseOperation(JFrame.EXIT_ON_CLOSE);
|
407
|
int s = JOptionPane.showConfirmDialog(null, scrollPane);
|
408
|
if (s==0) {
|
409
|
return true;
|
410
|
} else {
|
411
|
return false;
|
412
|
}
|
413
|
}
|
414
|
|
415
|
/**
|
416
|
* @return
|
417
|
*/
|
418
|
private static Reference askForSecundum() {
|
419
|
// logger.info("getFullReference for "+ name);
|
420
|
JTextArea textArea = new JTextArea("Enter the secundum name");
|
421
|
JScrollPane scrollPane = new JScrollPane(textArea);
|
422
|
textArea.setLineWrap(true);
|
423
|
textArea.setWrapStyleWord(true);
|
424
|
scrollPane.setPreferredSize( new Dimension( 700, 100 ) );
|
425
|
|
426
|
// JFrame frame = new JFrame("I have a question");
|
427
|
// frame.setDefaultCloseOperation(JFrame.EXIT_ON_CLOSE);
|
428
|
String s = (String) JOptionPane.showInputDialog(
|
429
|
null,
|
430
|
scrollPane,
|
431
|
"",
|
432
|
JOptionPane.PLAIN_MESSAGE,
|
433
|
null,
|
434
|
null,
|
435
|
null);
|
436
|
Reference ref = ReferenceFactory.newGeneric();
|
437
|
ref.setTitle(s);
|
438
|
return ref;
|
439
|
}
|
440
|
|
441
|
|
442
|
}
|