Project

General

Profile

Download (19.9 KB) Statistics
| Branch: | Revision:
1
/**
2
 * Copyright (C) 2007 EDIT
3
 * European Distributed Institute of Taxonomy
4
 * http://www.e-taxonomy.eu
5
 *
6
 * The contents of this file are subject to the Mozilla Public License Version 1.1
7
 * See LICENSE.TXT at the top of this package for the full license terms.
8
 */
9

    
10
package eu.etaxonomy.cdm.app.proibiosphere;
11
import java.awt.Dimension;
12
import java.io.BufferedReader;
13
import java.io.File;
14
import java.io.FileWriter;
15
import java.io.IOException;
16
import java.io.InputStream;
17
import java.io.InputStreamReader;
18
import java.net.MalformedURLException;
19
import java.net.URI;
20
import java.net.URISyntaxException;
21
import java.net.URL;
22
import java.util.ArrayList;
23
import java.util.Collections;
24
import java.util.HashMap;
25
import java.util.HashSet;
26
import java.util.List;
27
import java.util.Map;
28
import java.util.Scanner;
29

    
30
import javax.swing.JOptionPane;
31
import javax.swing.JScrollPane;
32
import javax.swing.JTextArea;
33
import javax.xml.parsers.DocumentBuilder;
34
import javax.xml.parsers.DocumentBuilderFactory;
35

    
36
import org.apache.log4j.Logger;
37
import org.w3c.dom.Document;
38

    
39
import eu.etaxonomy.cdm.app.common.CdmDestinations;
40
import eu.etaxonomy.cdm.database.DbSchemaValidation;
41
import eu.etaxonomy.cdm.database.ICdmDataSource;
42
import eu.etaxonomy.cdm.io.common.CdmDefaultImport;
43
import eu.etaxonomy.cdm.io.common.IImportConfigurator.CHECK;
44
import eu.etaxonomy.cdm.io.taxonx2013.TaxonXImportConfigurator;
45
import eu.etaxonomy.cdm.model.name.NomenclaturalCode;
46
import eu.etaxonomy.cdm.model.reference.Reference;
47
import eu.etaxonomy.cdm.model.reference.ReferenceFactory;
48

    
49

    
50

    
51
public class TaxonXImportLauncher {
52
    private static final Logger log = Logger.getLogger(TaxonXImportLauncher.class);
53
    //    private static final Logger log = Logger.getLogger(CdmEntityDaoBase.class);
54

    
55
    //database validation status (create, update, validate ...)
56
    static DbSchemaValidation hbm2dll = DbSchemaValidation.CREATE;
57
    static final ICdmDataSource cdmDestination = CdmDestinations.mon_cdm();
58

    
59
    static final CHECK check = CHECK.IMPORT_WITHOUT_CHECK;
60

    
61

    
62
    static String plaziUrl = "http://plazi.cs.umb.edu/GgServer/search?taxonomicName.isNomenclature=true&taxonomicName.exactMatch=true&indexName=0&subIndexName=taxonomicName&subIndexName=MODS&minSubResultSize=1&searchMode=index&resultFormat=xml&xsltUrl=http%3A%2F%2Fplazi.cs.umb.edu%2FGgServer%2Fresources%2FsrsWebPortalData%2FCdmSyncTreatmentList.xslt&taxonomicName.taxonomicName=";
63
    static String plaziUrlDoc = "http://plazi.cs.umb.edu/GgServer/search?taxonomicName.isNomenclature=true&taxonomicName.exactMatch=true&indexName=0&subIndexName=taxonomicName&subIndexName=MODS&minSubResultSize=1&searchMode=index&resultFormat=xml&xsltUrl=http%3A%2F%2Fplazi.cs.umb.edu%2FGgServer%2Fresources%2FsrsWebPortalData%2FCdmSyncTreatmentList.xslt&MODS.ModsDocID=";
64

    
65

    
66
    private static String askQuestion(String question){
67
        Scanner scan = new Scanner(System.in);
68
        System.out.println(question);
69
        String index = scan.nextLine();
70
        return index;
71
    }
72

    
73
    public static void main(String[] args) {
74
        String[] taxonList = new String[] {"Polybothrus","Eupolybothrus"};
75
//       /*ants*/ String[] modsList = new String[] {"3924", "3743", "4375","6757","6752","3481","21401_fisher_smith_plos_2008","2592","4096","6877","6192","8071"};
76
//        String[] modsList = new String[] {"FloNuttDuWin1838"};
77
//        modsList = new String[] {"Zapparoli-1986-Eupolybothrus-fasciatus"};
78
        String tnomenclature = "ICZN";
79

    
80
        String defaultClassif="Eupolybothrus and Polybothrus";
81

    
82
        Map<String,List<String>> documents = new HashMap<String,List<String>>();
83
        HashMap<String,List<URI>>documentMap = new HashMap<String, List<URI>>();
84

    
85
        /*HOW TO HANDLE SECUNDUM REFERENCE*/
86
        boolean reuseSecundum = askIfReuseSecundum();
87
        Reference<?> secundum = null;
88
        if (!reuseSecundum) {
89
            secundum = askForSecundum();
90
        }
91

    
92
        checkTreatmentPresence("taxon",taxonList, documents,documentMap);
93
//        checkTreatmentPresence("modsid",modsList, documents,documentMap);
94

    
95
        TaxonXImportConfigurator taxonxImportConfigurator =null;
96
        CdmDefaultImport<TaxonXImportConfigurator> taxonImport = new CdmDefaultImport<TaxonXImportConfigurator>();
97

    
98
        ICdmDataSource destination = cdmDestination;
99
        taxonxImportConfigurator = prepareTaxonXImport(destination,reuseSecundum, secundum);
100

    
101
        taxonxImportConfigurator.setImportClassificationName(defaultClassif);
102
        log.info("Start import from  TaxonX Data");
103

    
104
        taxonxImportConfigurator.setLastImport(false);
105

    
106
        int j=0;
107
        for (String document:documentMap.keySet()){
108
            j++;
109
            if (doImportDocument(document, documentMap.get(document).size())){
110
                int i=0;
111
                for (URI source:documentMap.get(document)){
112
                    System.out.println("START "+i+" ("+(documentMap.get(document)).size()+"): "+source.getPath());
113
                    i++;
114
                    if (j==documentMap.keySet().size() && i==documentMap.get(document).size()) {
115
                        taxonxImportConfigurator.setLastImport(true);
116
                    }
117
                        prepareReferenceAndSource(taxonxImportConfigurator,source);
118
                    prepareNomenclature(taxonxImportConfigurator,tnomenclature);
119
                    //   taxonxImportConfigurator.setTaxonReference(null);
120
                    taxonImport.invoke(taxonxImportConfigurator);
121
                    log.info("End import from SpecimenData ("+ source.toString() + ")...");
122

    
123
                    //          //deduplicate
124
                    //            ICdmApplicationConfiguration app = taxonImport.getCdmAppController();
125
                    //            int count = app.getAgentService().deduplicate(Person.class, null, null);
126
                    //            logger.warn("Deduplicated " + count + " persons.");
127
                    //            count = app.getReferenceService().deduplicate(Reference.class, null, null);
128
                    //            logger.warn("Deduplicated " + count + " references.");
129
                }
130
            }
131
        }
132
    }
133

    
134

    
135

    
136
    /**
137
     * @param taxonxImportConfigurator
138
     * @param tnomenclature
139
     */
140
    private static void prepareNomenclature(TaxonXImportConfigurator taxonxImportConfigurator, String tnomenclature) {
141
        //            String tnomenclature = askQuestion("ICBN or ICZN ?");
142
        taxonxImportConfigurator.setNomenclaturalCode(NomenclaturalCode.ICNAFP);
143
        if (tnomenclature.equalsIgnoreCase("ICBN")) {
144
            taxonxImportConfigurator.setNomenclaturalCode(NomenclaturalCode.ICNAFP);
145
            //                taxonxImportConfigurator.setClassificationName("Chenopodiaceae");
146
        }
147
        if(tnomenclature.equalsIgnoreCase("ICZN")){
148
            taxonxImportConfigurator.setNomenclaturalCode(NomenclaturalCode.ICZN);
149
            //                taxonxImportConfigurator.setClassificationName("Ants");
150
        }
151
        if(tnomenclature.equalsIgnoreCase("ICNB")){
152
            taxonxImportConfigurator.setNomenclaturalCode(NomenclaturalCode.ICNB);
153
            //                taxonxImportConfigurator.setClassificationName("Bacteria");
154
        }
155

    
156
    }
157

    
158
    /**
159
     * @param taxonxImportConfigurator
160
     * @param source
161
     *
162
     */
163
    private static void prepareReferenceAndSource(TaxonXImportConfigurator taxonxImportConfigurator, URI source) {
164
        Reference<?> reference = ReferenceFactory.newGeneric();
165
        //            String tref = askQuestion("Import source? (ie Plazi document ID)");
166
        String tref="PLAZI - "+source.getPath().split("/")[source.getPath().split("/").length-1];
167
        reference.setTitleCache(tref,true);
168
        reference.setTitle(tref);
169
        reference.generateTitle();
170

    
171
        taxonxImportConfigurator.setSourceReference(reference);
172
        TaxonXImportConfigurator.setSourceRef(reference);
173

    
174
        Reference<?> referenceUrl = ReferenceFactory.newWebPage();
175
        referenceUrl.setTitleCache(source.toString(), true);
176
        referenceUrl.setTitle(source.toString());
177
        reference.setUri(source);
178
        referenceUrl.generateTitle();
179

    
180
        taxonxImportConfigurator.addOriginalSource(referenceUrl);
181
        taxonxImportConfigurator.setSource(source);
182
    }
183

    
184
    /**
185
     * @param destination
186
     * @param reuseSecundum
187
     * @param secundum
188
     * @return
189
     */
190
    private static TaxonXImportConfigurator prepareTaxonXImport(ICdmDataSource destination, boolean reuseSecundum, Reference<?> secundum) {
191
        TaxonXImportConfigurator taxonxImportConfigurator = TaxonXImportConfigurator.NewInstance(destination);
192

    
193
        //        taxonxImportConfigurator.setClassificationName(taxonxImportConfigurator.getSourceReferenceTitle());
194
        taxonxImportConfigurator.setCheck(check);
195
        taxonxImportConfigurator.setDbSchemaValidation(hbm2dll);
196
        taxonxImportConfigurator.setDoAutomaticParsing(true);
197

    
198
        taxonxImportConfigurator.setInteractWithUser(true);
199

    
200

    
201
        taxonxImportConfigurator.setKeepOriginalSecundum(reuseSecundum);
202
        if (!reuseSecundum) {
203
            taxonxImportConfigurator.setSecundum(secundum);
204
        }
205

    
206
        //        taxonxImportConfigurator.setDoMatchTaxa(true);
207
        //        taxonxImportConfigurator.setReUseTaxon(true);
208
        return taxonxImportConfigurator;
209
    }
210

    
211
    /**
212
     * @param importFilter
213
     * @param modsList
214
     * @param documents
215
     * @param documentMap
216
     * @return
217
     */
218
    private static HashMap<String, List<URI>> checkTreatmentPresence(String importFilter, String[] modsList, Map<String, List<String>> documents, HashMap<String, List<URI>> documentMap) {
219
        URL plaziURL;
220
        //        System.out.println(plaziUrl);
221

    
222
        Map<String, List<String>> docs = new HashMap<String, List<String>>();
223
        try {
224
            BufferedReader in=null;
225
            List<String> docList;
226
            String inputLine;
227
            String docID;
228
            String pageStart;
229
            String pageEnd;
230
            String taxon;
231
            String link;
232
            String urlstr="";
233

    
234
            for(String modsID : modsList){
235
                //        plaziUrl=plaziUrl+"Eupolybothrus";
236
                if (importFilter.equalsIgnoreCase("modsid")) {
237
                    urlstr=plaziUrlDoc+modsID;
238
                }
239
                if (importFilter.equalsIgnoreCase("taxon")) {
240
                    urlstr=plaziUrl+modsID;
241
                }
242
//                System.out.println(url);
243

    
244
                plaziURL = new URL(urlstr);
245
                in = new BufferedReader(new InputStreamReader(plaziURL.openStream()));
246

    
247

    
248
                //TODO lastUpdate field
249
                //            if(!plaziNotServer){
250
                while ((inputLine = in.readLine()) != null) {
251
                    System.out.println(inputLine);
252
                    if (inputLine.startsWith("<treatment ")){
253
                        taxon = inputLine.split("taxon=\"")[1].split("\"")[0];
254
                        docID=inputLine.split("docId=\"")[1].split("\"")[0];
255
                        System.out.println("docID: "+docID);
256
                        link=inputLine.split("link=\"")[1].split("\"")[0];
257
                        pageStart = inputLine.split("startPage=\"")[1].split("\"")[0];
258
                        pageEnd = inputLine.split("endPage=\"")[1].split("\"")[0];
259
                        docList = documents.get(docID);
260
                        if (docList == null) {
261
                            docList = new ArrayList<String>();
262
                        }
263
                        docList.add(pageStart+"---"+pageEnd+"---"+taxon+"---"+link);
264
                        documents.put(docID,docList);
265
                    }
266
                }
267
            }
268
            System.out.println("hop");
269

    
270

    
271

    
272
            for (String docId:documents.keySet()){
273
                in = new BufferedReader(new InputStreamReader(new URL(plaziUrlDoc+docId).openStream()));
274
                while ((inputLine = in.readLine()) != null) {
275
                    if (inputLine.startsWith("<treatment ")){
276
                        taxon = inputLine.split("taxon=\"")[1].split("\"")[0];
277
                        docID=inputLine.split("docId=\"")[1].split("\"")[0];
278
                        link=inputLine.split("link=\"")[1].split("\"")[0];
279
                        pageStart = inputLine.split("startPage=\"")[1].split("\"")[0];
280
                        pageEnd = inputLine.split("endPage=\"")[1].split("\"")[0];
281
                        docList = documents.get(docID);
282
                        if (docList == null) {
283
                            docList = new ArrayList<String>();
284
                        }
285
                        docList.add(pageStart+"---"+pageEnd+"---"+taxon+"---"+link);
286
                        docs.put(docID,docList);
287
                    }
288
                }
289
            }
290
            //            if(plaziNotServer) {
291
            //                sourcesStr.add(plaziUrl);
292
            //            }
293
            //            in.close();
294
        } catch (MalformedURLException e1) {
295
            // TODO Auto-generated catch block
296
            e1.printStackTrace();
297
        } catch (IOException e) {
298
            // TODO Auto-generated catch block
299
            e.printStackTrace();
300
        }
301

    
302
        //        System.exit(0);
303

    
304
        //        sourcesStr.add("/home/pkelbert/Documents/Proibiosphere/ChenopodiumXML/1362148061170_Chenopodium_K_hn_U_1993_tx.xml");
305

    
306
        //System.out.println(documents);
307
        for (String docId : docs.keySet()){
308
            List<String> treatments = new ArrayList<String>(new HashSet<String>(docs.get(docId)));
309

    
310
            Map<Integer, List<String>> startPages = new HashMap<Integer, List<String>>();
311
            for (String treatment:treatments) {
312
                List<String>tmplist = startPages.get(Integer.valueOf(treatment.split("---")[0]));
313
                if (tmplist == null) {
314
                    tmplist = new ArrayList<String>();
315
                }
316
                tmplist.add(treatment.split("---")[3]);
317
                startPages.put(Integer.valueOf(treatment.split("---")[0]),tmplist);
318
            }
319
            List<Integer> pages = new ArrayList<Integer>();
320
            pages.addAll(startPages.keySet());
321

    
322
            Collections.sort(pages);
323
            //            log.info(pages);
324

    
325
            log.info("Document "+docId+" should have "+treatments.size()+" treatments");
326
            int cnt=0;
327
            if(treatments.size()<150){
328

    
329
            for (String source:treatments){
330
                DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
331
                DocumentBuilder builder;
332
                URL url;
333

    
334
                try {
335
                    builder = factory.newDocumentBuilder();
336
                    url = new URL(source.split("---")[3]);
337
                    Object o = url.getContent();
338
                    InputStream is = (InputStream) o;
339
                    Document document = builder.parse(is);
340
                    cnt++;
341
                }catch(Exception e){
342
                    //  e.printStackTrace();
343
                    log.warn(e);
344
                }
345
            }
346
            log.info("Document "+docId+" has "+cnt+" treatments available");
347
            }
348
            if(treatments.size() != cnt)
349
            {
350
                File file = new File("/home/pkelbert/Bureau/urlTaxonXToDoLater.txt");
351
                FileWriter writer;
352
                try {
353
                    writer = new FileWriter(file ,true);
354
                    writer.write(docId+"\n");
355
                    writer.flush();
356
                    writer.close();
357
                } catch (IOException e1) {
358
                    // TODO Auto-generated catch block
359
                    e1.printStackTrace();
360
                }
361

    
362
            }
363
            else{
364
                List<URI> uritmp = documentMap.get(docId);
365
                if (uritmp == null) {
366
                    uritmp = new ArrayList<URI>();
367
                }
368
                for (int page:pages) {
369
                    for (String treatment: startPages.get(page)) {
370
                        try {
371
                            uritmp.add(new URL(treatment).toURI());
372
                        } catch (MalformedURLException e) {
373
                            // TODO Auto-generated catch block
374
                            e.printStackTrace();
375
                        } catch (URISyntaxException e) {
376
                            // TODO Auto-generated catch block
377
                            e.printStackTrace();
378
                        }
379
                    }
380
                }
381
                documentMap.put(docId, uritmp);
382
            }
383

    
384

    
385

    
386

    
387
        }
388
        //////        log.info("NB SOURCES : "+sourcesStr.size());
389
        //        List<URI> sourcesStr = new ArrayList<URI>();
390
        //        try {
391
        ////            documentMap = new HashMap<String, List<URI>>();
392
        //            sourcesStr.add(new URI("http://plazi.cs.umb.edu/GgServer/cdmSync/8F5B3EA099D371BC41CC5DDBFEDCFBED"));
393
        //            documentMap.put("singlesource", sourcesStr);
394
        //        } catch (URISyntaxException e) {
395
        //            // TODO Auto-generated catch block
396
        //            e.printStackTrace();
397
        //        }
398

    
399
        return documentMap;
400

    
401
    }
402

    
403
    /**
404
     * @param document
405
     * @return
406
     */
407
    private static boolean doImportDocument(String document, int nbtreatments) {
408
        return true;
409
      /*
410
        //        List<String> docDone = Arrays.asList(new String[]{"3540555099", "0910-2878-5652", "5012-9059-4108",
411
        //                "3784-0748-2261","3-201-00728-5", "FloNuttDuWin1838", "FlNordica_chenop","2580-1363-7530",
412
        //                "1842460692","5161-7797-8064","FlCaboVerde_Chen","2819-9661-8339","2626-3794-9273"});//,
413
        //               // "8776-7797-8303"});
414
        //        if (docDone.contains(document)) {
415
        //            return false;
416
        //        }
417

    
418
        JTextArea textArea = new JTextArea("Should this document be imported ("+nbtreatments+")? \n'"+document+"'");
419
        JScrollPane scrollPane = new JScrollPane(textArea);
420
        textArea.setLineWrap(true);
421
        textArea.setWrapStyleWord(true);
422
        scrollPane.setPreferredSize( new Dimension( 700, 70 ) );
423

    
424
        //        JFrame frame = new JFrame("I have a question");
425
        //        frame.setDefaultCloseOperation(JFrame.EXIT_ON_CLOSE);
426
        int s = JOptionPane.showConfirmDialog(null, scrollPane);
427
        if (s==0) {
428
            return true;
429
        } else {
430
            return false;
431
        }
432
        */
433
    }
434

    
435
    /**
436
     * @return
437
     */
438
    private static boolean askIfReuseSecundum() {
439
        //        logger.info("getFullReference for "+ name);
440
        JTextArea textArea = new JTextArea("Reuse the secundum present in the current classification? " +
441
                "\n Click Yes to reuse it, click No or Cancel to create a new one.\nA default secundum will be created if needed.");
442
        JScrollPane scrollPane = new JScrollPane(textArea);
443
        textArea.setLineWrap(true);
444
        textArea.setWrapStyleWord(true);
445
        scrollPane.setPreferredSize( new Dimension( 700, 70 ) );
446

    
447
        //        JFrame frame = new JFrame("I have a question");
448
        //        frame.setDefaultCloseOperation(JFrame.EXIT_ON_CLOSE);
449
        int s = JOptionPane.showConfirmDialog(null, scrollPane);
450
        if (s==0) {
451
            return true;
452
        } else {
453
            return false;
454
        }
455
    }
456

    
457
    /**
458
     * @return
459
     */
460
    private static Reference<?> askForSecundum() {
461
        //        logger.info("getFullReference for "+ name);
462
        JTextArea textArea = new JTextArea("Enter the secundum name");
463
        JScrollPane scrollPane = new JScrollPane(textArea);
464
        textArea.setLineWrap(true);
465
        textArea.setWrapStyleWord(true);
466
        scrollPane.setPreferredSize( new Dimension( 700, 100 ) );
467

    
468
        //        JFrame frame = new JFrame("I have a question");
469
        //        frame.setDefaultCloseOperation(JFrame.EXIT_ON_CLOSE);
470
        String s = (String) JOptionPane.showInputDialog(
471
                null,
472
                scrollPane,
473
                "",
474
                JOptionPane.PLAIN_MESSAGE,
475
                null,
476
                null,
477
                null);
478
        Reference<?> ref = ReferenceFactory.newGeneric();
479
        ref.setTitle(s);
480
        return ref;
481
    }
482

    
483

    
484
}
(3-3/3)