1
|
/**
|
2
|
* Copyright (C) 2007 EDIT
|
3
|
* European Distributed Institute of Taxonomy
|
4
|
* http://www.e-taxonomy.eu
|
5
|
*
|
6
|
* The contents of this file are subject to the Mozilla Public License Version 1.1
|
7
|
* See LICENSE.TXT at the top of this package for the full license terms.
|
8
|
*/
|
9
|
|
10
|
package eu.etaxonomy.cdm.app.proibiosphere;
|
11
|
import java.awt.Dimension;
|
12
|
import java.io.BufferedReader;
|
13
|
import java.io.File;
|
14
|
import java.io.FileWriter;
|
15
|
import java.io.IOException;
|
16
|
import java.io.InputStream;
|
17
|
import java.io.InputStreamReader;
|
18
|
import java.net.MalformedURLException;
|
19
|
import java.net.URI;
|
20
|
import java.net.URISyntaxException;
|
21
|
import java.net.URL;
|
22
|
import java.util.ArrayList;
|
23
|
import java.util.Collections;
|
24
|
import java.util.HashMap;
|
25
|
import java.util.HashSet;
|
26
|
import java.util.List;
|
27
|
import java.util.Map;
|
28
|
import java.util.Scanner;
|
29
|
|
30
|
import javax.swing.JOptionPane;
|
31
|
import javax.swing.JScrollPane;
|
32
|
import javax.swing.JTextArea;
|
33
|
import javax.xml.parsers.DocumentBuilder;
|
34
|
import javax.xml.parsers.DocumentBuilderFactory;
|
35
|
|
36
|
import org.apache.log4j.Logger;
|
37
|
import org.w3c.dom.Document;
|
38
|
|
39
|
import eu.etaxonomy.cdm.app.common.CdmDestinations;
|
40
|
import eu.etaxonomy.cdm.database.DbSchemaValidation;
|
41
|
import eu.etaxonomy.cdm.database.ICdmDataSource;
|
42
|
import eu.etaxonomy.cdm.io.common.CdmDefaultImport;
|
43
|
import eu.etaxonomy.cdm.io.common.IImportConfigurator.CHECK;
|
44
|
import eu.etaxonomy.cdm.io.taxonx2013.TaxonXImportConfigurator;
|
45
|
import eu.etaxonomy.cdm.model.name.NomenclaturalCode;
|
46
|
import eu.etaxonomy.cdm.model.reference.Reference;
|
47
|
import eu.etaxonomy.cdm.model.reference.ReferenceFactory;
|
48
|
|
49
|
|
50
|
|
51
|
public class TaxonXImportLauncher {
|
52
|
private static final Logger log = Logger.getLogger(TaxonXImportLauncher.class);
|
53
|
// private static final Logger log = Logger.getLogger(CdmEntityDaoBase.class);
|
54
|
|
55
|
//database validation status (create, update, validate ...)
|
56
|
static DbSchemaValidation hbm2dll = DbSchemaValidation.VALIDATE;
|
57
|
static final ICdmDataSource cdmDestination = CdmDestinations.proibiosphere_local();
|
58
|
|
59
|
static final CHECK check = CHECK.IMPORT_WITHOUT_CHECK;
|
60
|
|
61
|
|
62
|
private static String askQuestion(String question){
|
63
|
Scanner scan = new Scanner(System.in);
|
64
|
System.out.println(question);
|
65
|
String index = scan.nextLine();
|
66
|
return index;
|
67
|
}
|
68
|
|
69
|
public static void main(String[] args) {
|
70
|
|
71
|
String plaziUrl = "http://plazi.cs.umb.edu/GgServer/search?taxonomicName.isNomenclature=true&taxonomicName.exactMatch=true&indexName=0&subIndexName=taxonomicName&subIndexName=MODS&minSubResultSize=1&searchMode=index&resultFormat=xml&xsltUrl=http%3A%2F%2Fplazi.cs.umb.edu%2FGgServer%2FsrsWebPortalData%2FCdmSyncTreatmentList.xslt&taxonomicName.taxonomicName=";
|
72
|
String plaziUrlDoc = "http://plazi.cs.umb.edu/GgServer/search?taxonomicName.isNomenclature=true&taxonomicName.exactMatch=true&indexName=0&subIndexName=taxonomicName&subIndexName=MODS&minSubResultSize=1&searchMode=index&resultFormat=xml&xsltUrl=http%3A%2F%2Fplazi.cs.umb.edu%2FGgServer%2FsrsWebPortalData%2FCdmSyncTreatmentList.xslt&MODS.ModsDocID=";
|
73
|
// String plaziUrl = "http://plazi.cs.umb.edu/GgServer/xslt/E01DD5BE427421156E0C0BAC56389E0D?xsltUrl=http%3A%2F%2Fplazi.cs.umb.edu%2FGgServer%2FsrsWebPortalData%2FLinkers%2FXmlDocumentLinkerData%2Fgg2taxonx.xsl";
|
74
|
List<String> sourcesStr = new ArrayList<String>();
|
75
|
boolean plaziNotServer=false;
|
76
|
|
77
|
Map<String,List<String>> documents = new HashMap<String,List<String>>();
|
78
|
// plaziUrl=plaziUrl+"Chenopodium";
|
79
|
plaziUrl=plaziUrlDoc+"0910-2878-5652";
|
80
|
|
81
|
/*HOW TO HANDLE SECUNDUM REFERENCE*/
|
82
|
boolean reuseSecundum = askIfReuseSecundum();
|
83
|
Reference<?> secundum = null;
|
84
|
if (!reuseSecundum) {
|
85
|
secundum = askForSecundum();
|
86
|
}
|
87
|
|
88
|
String tnomenclature = "ICBN";
|
89
|
URL plaziURL;
|
90
|
try {
|
91
|
plaziURL = new URL(plaziUrl);
|
92
|
BufferedReader in = new BufferedReader(new InputStreamReader(plaziURL.openStream()));
|
93
|
|
94
|
List<String> docList;
|
95
|
String inputLine;
|
96
|
String docID;
|
97
|
String pageStart;
|
98
|
String pageEnd;
|
99
|
String taxon;
|
100
|
String link;
|
101
|
//TODO lastUpdate field
|
102
|
if(!plaziNotServer){
|
103
|
while ((inputLine = in.readLine()) != null) {
|
104
|
if (inputLine.startsWith("<treatment ")){
|
105
|
taxon = inputLine.split("taxon=\"")[1].split("\"")[0];
|
106
|
docID=inputLine.split("docId=\"")[1].split("\"")[0];
|
107
|
link=inputLine.split("link=\"")[1].split("\"")[0];
|
108
|
pageStart = inputLine.split("startPage=\"")[1].split("\"")[0];
|
109
|
pageEnd = inputLine.split("endPage=\"")[1].split("\"")[0];
|
110
|
docList = documents.get(docID);
|
111
|
if (docList == null) {
|
112
|
docList = new ArrayList<String>();
|
113
|
}
|
114
|
docList.add(pageStart+"---"+pageEnd+"---"+taxon+"---"+link);
|
115
|
documents.put(docID,docList);
|
116
|
}
|
117
|
}
|
118
|
}
|
119
|
for (String docId:documents.keySet()){
|
120
|
in = new BufferedReader(new InputStreamReader(new URL(plaziUrlDoc+docId).openStream()));
|
121
|
while ((inputLine = in.readLine()) != null) {
|
122
|
if (inputLine.startsWith("<treatment ")){
|
123
|
taxon = inputLine.split("taxon=\"")[1].split("\"")[0];
|
124
|
docID=inputLine.split("docId=\"")[1].split("\"")[0];
|
125
|
link=inputLine.split("link=\"")[1].split("\"")[0];
|
126
|
pageStart = inputLine.split("startPage=\"")[1].split("\"")[0];
|
127
|
pageEnd = inputLine.split("endPage=\"")[1].split("\"")[0];
|
128
|
docList = documents.get(docID);
|
129
|
if (docList == null) {
|
130
|
docList = new ArrayList<String>();
|
131
|
}
|
132
|
docList.add(pageStart+"---"+pageEnd+"---"+taxon+"---"+link);
|
133
|
documents.put(docID,docList);
|
134
|
}
|
135
|
}
|
136
|
}
|
137
|
if(plaziNotServer) {
|
138
|
sourcesStr.add(plaziUrl);
|
139
|
}
|
140
|
in.close();
|
141
|
} catch (MalformedURLException e1) {
|
142
|
// TODO Auto-generated catch block
|
143
|
e1.printStackTrace();
|
144
|
} catch (IOException e) {
|
145
|
// TODO Auto-generated catch block
|
146
|
e.printStackTrace();
|
147
|
}
|
148
|
|
149
|
// System.exit(0);
|
150
|
|
151
|
// sourcesStr.add("/home/pkelbert/Documents/Proibiosphere/ChenopodiumXML/1362148061170_Chenopodium_K_hn_U_1993_tx.xml");
|
152
|
|
153
|
for (String docId : documents.keySet()){
|
154
|
/*remove documents bad quality*/
|
155
|
log.info(docId);
|
156
|
// if (!docId.equalsIgnoreCase("3891-7797-6564")){
|
157
|
log.info("document "+docId);
|
158
|
List<String> treatments = new ArrayList<String>(new HashSet<String>(documents.get(docId)));
|
159
|
|
160
|
Map<Integer, List<String>> startPages = new HashMap<Integer, List<String>>();
|
161
|
for (String treatment:treatments) {
|
162
|
List<String>tmplist = startPages.get(Integer.valueOf(treatment.split("---")[0]));
|
163
|
if (tmplist == null) {
|
164
|
tmplist = new ArrayList<String>();
|
165
|
}
|
166
|
tmplist.add(treatment.split("---")[3]);
|
167
|
startPages.put(Integer.valueOf(treatment.split("---")[0]),tmplist);
|
168
|
}
|
169
|
List<Integer> pages = new ArrayList<Integer>();
|
170
|
pages.addAll(startPages.keySet());
|
171
|
|
172
|
Collections.sort(pages);
|
173
|
// log.info(pages);
|
174
|
|
175
|
log.info("Document "+docId+" should have "+treatments.size()+" treatments");
|
176
|
int cnt=0;
|
177
|
for (String source:treatments){
|
178
|
DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
|
179
|
DocumentBuilder builder;
|
180
|
URL url;
|
181
|
|
182
|
try {
|
183
|
builder = factory.newDocumentBuilder();
|
184
|
url = new URL(source.split("---")[3]);
|
185
|
Object o = url.getContent();
|
186
|
InputStream is = (InputStream) o;
|
187
|
Document document = builder.parse(is);
|
188
|
cnt++;
|
189
|
}catch(Exception e){
|
190
|
// e.printStackTrace();
|
191
|
log.warn(e);
|
192
|
}
|
193
|
}
|
194
|
log.info("Document "+docId+" has "+cnt+" treatments available");
|
195
|
if(treatments.size() != cnt)
|
196
|
{
|
197
|
File file = new File("/home/pkelbert/Bureau/urlTaxonXToDoLater.txt");
|
198
|
FileWriter writer;
|
199
|
try {
|
200
|
writer = new FileWriter(file ,true);
|
201
|
writer.write(docId+"\n");
|
202
|
writer.flush();
|
203
|
writer.close();
|
204
|
} catch (IOException e1) {
|
205
|
// TODO Auto-generated catch block
|
206
|
e1.printStackTrace();
|
207
|
}
|
208
|
|
209
|
}
|
210
|
else{
|
211
|
for (int page:pages) {
|
212
|
for (String treatment: startPages.get(page)) {
|
213
|
sourcesStr.add(treatment);
|
214
|
}
|
215
|
}
|
216
|
}
|
217
|
// }
|
218
|
}
|
219
|
log.info("NB SOURCES : "+sourcesStr.size());
|
220
|
// sourcesStr = new ArrayList<String>();
|
221
|
// sourcesStr.add("http://plazi.cs.umb.edu/exist/rest/db/taxonx_docs/cdmSync/4E7390346C05780D32283CCF6F5E4431_tx.xml");
|
222
|
|
223
|
List<URI> sources = new ArrayList<URI>();
|
224
|
for (String src: sourcesStr){
|
225
|
URI uri;
|
226
|
try {
|
227
|
uri = new URL(src).toURI();
|
228
|
sources.add(new URI(uri.toString()));
|
229
|
} catch (MalformedURLException e1) {
|
230
|
// TODO Auto-generated catch block
|
231
|
e1.printStackTrace();
|
232
|
} catch (URISyntaxException e1) {
|
233
|
// TODO Auto-generated catch block
|
234
|
e1.printStackTrace();
|
235
|
}
|
236
|
}
|
237
|
|
238
|
log.info("Start import from TaxonX Data");
|
239
|
|
240
|
ICdmDataSource destination = cdmDestination;
|
241
|
TaxonXImportConfigurator taxonxImportConfigurator = TaxonXImportConfigurator.NewInstance(destination);
|
242
|
|
243
|
// taxonxImportConfigurator.setClassificationName(taxonxImportConfigurator.getSourceReferenceTitle());
|
244
|
taxonxImportConfigurator.setCheck(check);
|
245
|
taxonxImportConfigurator.setDbSchemaValidation(hbm2dll);
|
246
|
taxonxImportConfigurator.setDoAutomaticParsing(true);
|
247
|
|
248
|
// invoke import
|
249
|
CdmDefaultImport<TaxonXImportConfigurator> taxonImport = new CdmDefaultImport<TaxonXImportConfigurator>();
|
250
|
|
251
|
taxonxImportConfigurator.setKeepOriginalSecundum(reuseSecundum);
|
252
|
if (!reuseSecundum) {
|
253
|
taxonxImportConfigurator.setSecundum(secundum);
|
254
|
}
|
255
|
|
256
|
|
257
|
// taxonxImportConfigurator.setDoMatchTaxa(true);
|
258
|
// taxonxImportConfigurator.setReUseTaxon(true);
|
259
|
|
260
|
for (URI source:sources){
|
261
|
log.info("START : "+source.getPath());
|
262
|
taxonxImportConfigurator.setSource(source);
|
263
|
|
264
|
Reference<?> reference = ReferenceFactory.newGeneric();
|
265
|
// String tref = askQuestion("Import source? (ie Plazi document ID)");
|
266
|
String tref="PLAZI - "+source.getPath().split("/")[source.getPath().split("/").length-1];
|
267
|
reference.setTitleCache(tref,true);
|
268
|
reference.setTitle(tref);
|
269
|
reference.generateTitle();
|
270
|
|
271
|
taxonxImportConfigurator.setSourceReference(reference);
|
272
|
taxonxImportConfigurator.setSourceRef(reference);
|
273
|
|
274
|
// String tnomenclature = askQuestion("ICBN or ICZN ?");
|
275
|
|
276
|
if (tnomenclature.equalsIgnoreCase("ICBN")) {
|
277
|
taxonxImportConfigurator.setNomenclaturalCode(NomenclaturalCode.ICNAFP);
|
278
|
// taxonxImportConfigurator.setClassificationName("Chenopodiaceae");
|
279
|
}
|
280
|
if(tnomenclature.equalsIgnoreCase("ICZN")){
|
281
|
taxonxImportConfigurator.setNomenclaturalCode(NomenclaturalCode.ICZN);
|
282
|
// taxonxImportConfigurator.setClassificationName("Ants");
|
283
|
}
|
284
|
|
285
|
// taxonxImportConfigurator.setTaxonReference(null);
|
286
|
|
287
|
// log.info("INVOKE");
|
288
|
|
289
|
taxonImport.invoke(taxonxImportConfigurator);
|
290
|
log.info("End import from SpecimenData ("+ source.toString() + ")...");
|
291
|
|
292
|
// //deduplicate
|
293
|
// ICdmApplicationConfiguration app = taxonImport.getCdmAppController();
|
294
|
// int count = app.getAgentService().deduplicate(Person.class, null, null);
|
295
|
// logger.warn("Deduplicated " + count + " persons.");
|
296
|
// count = app.getReferenceService().deduplicate(Reference.class, null, null);
|
297
|
// logger.warn("Deduplicated " + count + " references.");
|
298
|
}
|
299
|
|
300
|
|
301
|
}
|
302
|
|
303
|
|
304
|
|
305
|
/**
|
306
|
* @return
|
307
|
*/
|
308
|
private static boolean askIfReuseSecundum() {
|
309
|
// logger.info("getFullReference for "+ name);
|
310
|
JTextArea textArea = new JTextArea("Reuse the secundum present in the current classification? " +
|
311
|
"\n Click Yes to reuse it, click No or Cancel to create a new one.");
|
312
|
JScrollPane scrollPane = new JScrollPane(textArea);
|
313
|
textArea.setLineWrap(true);
|
314
|
textArea.setWrapStyleWord(true);
|
315
|
scrollPane.setPreferredSize( new Dimension( 700, 100 ) );
|
316
|
|
317
|
// JFrame frame = new JFrame("I have a question");
|
318
|
// frame.setDefaultCloseOperation(JFrame.EXIT_ON_CLOSE);
|
319
|
int s = JOptionPane.showConfirmDialog(null, scrollPane);
|
320
|
if (s==0) {
|
321
|
return true;
|
322
|
} else {
|
323
|
return false;
|
324
|
}
|
325
|
}
|
326
|
|
327
|
/**
|
328
|
* @return
|
329
|
*/
|
330
|
private static Reference<?> askForSecundum() {
|
331
|
// logger.info("getFullReference for "+ name);
|
332
|
JTextArea textArea = new JTextArea("Enter the secundum name");
|
333
|
JScrollPane scrollPane = new JScrollPane(textArea);
|
334
|
textArea.setLineWrap(true);
|
335
|
textArea.setWrapStyleWord(true);
|
336
|
scrollPane.setPreferredSize( new Dimension( 700, 100 ) );
|
337
|
|
338
|
// JFrame frame = new JFrame("I have a question");
|
339
|
// frame.setDefaultCloseOperation(JFrame.EXIT_ON_CLOSE);
|
340
|
String s = (String) JOptionPane.showInputDialog(
|
341
|
null,
|
342
|
scrollPane,
|
343
|
"",
|
344
|
JOptionPane.PLAIN_MESSAGE,
|
345
|
null,
|
346
|
null,
|
347
|
null);
|
348
|
Reference<?> ref = ReferenceFactory.newGeneric();
|
349
|
ref.setTitle(s);
|
350
|
return ref;
|
351
|
}
|
352
|
|
353
|
|
354
|
}
|