Project

General

Profile

Download (18.5 KB) Statistics
| Branch: | Revision:
1
/**
2
 * Copyright (C) 2007 EDIT
3
 * European Distributed Institute of Taxonomy
4
 * http://www.e-taxonomy.eu
5
 *
6
 * The contents of this file are subject to the Mozilla Public License Version 1.1
7
 * See LICENSE.TXT at the top of this package for the full license terms.
8
 */
9

    
10
package eu.etaxonomy.cdm.app.proibiosphere;
11
import java.awt.Dimension;
12
import java.io.BufferedReader;
13
import java.io.IOException;
14
import java.io.InputStreamReader;
15
import java.net.MalformedURLException;
16
import java.net.URI;
17
import java.net.URISyntaxException;
18
import java.net.URL;
19
import java.util.ArrayList;
20
import java.util.Collections;
21
import java.util.HashMap;
22
import java.util.HashSet;
23
import java.util.List;
24
import java.util.Map;
25
import java.util.Scanner;
26

    
27
import javax.swing.JOptionPane;
28
import javax.swing.JScrollPane;
29
import javax.swing.JTextArea;
30

    
31
import org.apache.log4j.Logger;
32

    
33
import eu.etaxonomy.cdm.app.common.CdmDestinations;
34
import eu.etaxonomy.cdm.database.DbSchemaValidation;
35
import eu.etaxonomy.cdm.database.ICdmDataSource;
36
import eu.etaxonomy.cdm.io.common.CdmDefaultImport;
37
import eu.etaxonomy.cdm.io.common.IImportConfigurator.CHECK;
38
import eu.etaxonomy.cdm.io.taxonx2013.TaxonXImportConfigurator;
39
import eu.etaxonomy.cdm.model.name.NomenclaturalCode;
40
import eu.etaxonomy.cdm.model.reference.Reference;
41
import eu.etaxonomy.cdm.model.reference.ReferenceFactory;
42

    
43

    
44

    
45
public class TaxonXImportLauncher {
46
    private static final Logger log = Logger.getLogger(TaxonXImportLauncher.class);
47
    //    private static final Logger log = Logger.getLogger(CdmEntityDaoBase.class);
48

    
49
    //database validation status (create, update, validate ...)
50
    static DbSchemaValidation hbm2dll = DbSchemaValidation.CREATE;
51
//    static final ICdmDataSource cdmDestination = CdmDestinations.cdm_test_local_mysql();
52
//    static final ICdmDataSource cdmDestination = CdmDestinations.localH2();
53
  static final ICdmDataSource cdmDestination = CdmDestinations.cdm_test_local_mysql_test();
54
//    static final ICdmDataSource cdmDestination = CdmDestinations.cdm_production_piB("piB_nephrolepis");
55
//    static final ICdmDataSource cdmDestination = CdmDestinations.cdm_local_piB("guianas");
56
    
57
    static final CHECK check = CHECK.IMPORT_WITHOUT_CHECK;
58
    
59
    private enum FilterType{MODS, TAXON};
60

    
61

    
62
    static String plaziUrlTaxName = "http://plazi.cs.umb.edu/GgServer/search?taxonomicName.isNomenclature=true&taxonomicName.exactMatch=true&indexName=0&subIndexName=taxonomicName&subIndexName=MODS&minSubResultSize=1&searchMode=index&resultFormat=xml&xsltUrl=http%3A%2F%2Fplazi.cs.umb.edu%2FGgServer%2Fresources%2FsrsWebPortalData%2FCdmSyncTreatmentList.xslt&taxonomicName.taxonomicName=";
63
    static String plaziUrlModsDoc = "http://plazi.cs.umb.edu/GgServer/search?taxonomicName.isNomenclature=true&taxonomicName.exactMatch=true&indexName=0&subIndexName=taxonomicName&subIndexName=MODS&minSubResultSize=1&searchMode=index&resultFormat=xml&xsltUrl=http%3A%2F%2Fplazi.cs.umb.edu%2FGgServer%2Fresources%2FsrsWebPortalData%2FCdmSyncTreatmentList.xslt&MODS.ModsDocID=";
64

    
65

    
66

    
67
    public static void main(String[] args) {
68
    	String[] spiderModsList = new String[] {"zt03768p138","zt03750p196","zt03666p193","zt03664p068","zt03646p592","zt03507p056","zt03415p057","zt03383p038","zt03305p052","zt03228p068","zt03131p034","zt02963p068","zt02883p068","zt02814p018","zt02739p050","zt02730p043","zt02637p054","zt02593p127","zt02551p068","zt02534p036","zt02526p053","zt02427p035","zt02361p012","zt02267p068","zt02223p047","zt01826p058","zt01775p024","zt01744p040","zt01529p060","zt01004p028","zt00904","zt00872","zt00619","zt00109","DippenaarSchoeman1989Penestominae","Simon1902Cribellates","Simon1903Penestominae","Lehtinen1967CribellatePenestominae"};
69
    	
70
    	String[] taxonList = new String[]  {"Campylopus"}; //{"Eupolybothrus","Polybothrus"}, Chenopodium, Lactarius, Campylopus, Nephrolepis, Comaroma (spiders)
71
//       /*ants Anochetus*/ String[] modsList = new String[] {"3924" /*, "3743", "4375", "6757", "6752", "3481", "21401_fisher_smith_plos_2008", "2592", "4096", "6877", "6192", "8071"  */};
72
//        String[] modsList = new String[] {"21367", "21365", "8171", "6877", "21820", "3641", "6757"};
73
//        /*auch ants*/        debut="3743", "3628", "4022", "3994", "3603", "8070", "4001", "4071", "3948", "3481"};
74
//        suite: , };//,"3540555099"};
75
//        modsList = new String[] {"Zapparoli-1986-Eupolybothrus-fasciatus"};
76
//    	taxonList = spiderModsList;
77
    	
78
    	FilterType filterType = FilterType.TAXON;
79
        
80
    	NomenclaturalCode tnomenclature = NomenclaturalCode.ICNAFP;
81

    
82
        String defaultClassification= null;// "Nephrolepis";
83
        boolean alwaysUseDefaultClassification = false;
84
        
85
        boolean useOldUnparsedSynonymExtraction = false;
86

    
87
        
88
        
89
        
90
        Map<String,List<URI>>documentMap = new HashMap<String, List<URI>>();
91

    
92
        /*HOW TO HANDLE SECUNDUM REFERENCE*/
93
        boolean reuseSecundum = askIfReuseSecundum();
94
        Reference<?> secundum = null;
95
        if (!reuseSecundum) {
96
            secundum = askForSecundum();
97
        }
98

    
99
        loadTreatmentIfPresent(filterType,taxonList, documentMap);
100
//        loadTreatmentIfPresent(FilterType.MODS, modsList, documents,documentMap);
101

    
102
        CdmDefaultImport<TaxonXImportConfigurator> taxonImport = new CdmDefaultImport<TaxonXImportConfigurator>();
103

    
104
        ICdmDataSource destination = cdmDestination;
105
        TaxonXImportConfigurator config = prepareTaxonXImport(destination,reuseSecundum, secundum, tnomenclature, alwaysUseDefaultClassification);
106
        config.setUseOldUnparsedSynonymExtraction(useOldUnparsedSynonymExtraction);
107
        
108
        config.setImportClassificationName(defaultClassification);
109
        log.info("Start import from  TaxonX Data");
110

    
111
        config.setLastImport(false);
112

    
113
        int j=0;
114
        for (String document : documentMap.keySet()){
115
            j++;
116
            if (doImportDocument(document, documentMap.get(document).size())){
117
                int i=0;
118
                for (URI source: documentMap.get(document)){
119
                    System.out.println("START "+document+" "+i+" ("+(documentMap.get(document)).size()+"): "+source.getPath());
120
                    i++;
121
                    if (j==documentMap.keySet().size() && i==documentMap.get(document).size()) {
122
                        config.setLastImport(true);
123
                    }
124
                    prepareReferenceAndSource(config,source);
125
                     //   taxonxImportConfigurator.setTaxonReference(null);
126
                    taxonImport.invoke(config);
127
                    log.info("End import from SpecimenData ("+ source.toString() + ")...");
128

    
129
                    //          //deduplicate
130
                    //            ICdmApplicationConfiguration app = taxonImport.getCdmAppController();
131
                    //            int count = app.getAgentService().deduplicate(Person.class, null, null);
132
                    //            logger.warn("Deduplicated " + count + " persons.");
133
                    //            count = app.getReferenceService().deduplicate(Reference.class, null, null);
134
                    //            logger.warn("Deduplicated " + count + " references.");
135
                }
136
            }
137
        }
138
    }
139

    
140

    
141
    private static String askQuestion(String question){
142
        Scanner scan = new Scanner(System.in);
143
        System.out.println(question);
144
        String index = scan.nextLine();
145
        return index;
146
    }
147

    
148
    /**
149
     * @param taxonxImportConfigurator
150
     * @param source
151
     *
152
     */
153
    private static void prepareReferenceAndSource(TaxonXImportConfigurator taxonxImportConfigurator, URI source) {
154
        Reference<?> reference = ReferenceFactory.newGeneric();
155
        //            String tref = askQuestion("Import source? (ie Plazi document ID)");
156
        String tref="PLAZI - "+source.getPath().split("/")[source.getPath().split("/").length-1];
157
        reference.setTitleCache(tref,true);
158
        reference.setTitle(tref);
159
        
160
        taxonxImportConfigurator.setSourceReference(reference);
161
        TaxonXImportConfigurator.setSourceRef(reference);
162

    
163
        Reference<?> referenceUrl = ReferenceFactory.newWebPage();
164
        referenceUrl.setTitleCache(source.toString(), true);
165
        referenceUrl.setTitle(source.toString());
166
        reference.setUri(source);
167
        referenceUrl.generateTitle();
168

    
169
        taxonxImportConfigurator.addOriginalSource(referenceUrl);
170
        taxonxImportConfigurator.setSource(source);
171
    }
172

    
173
    /**
174
     * @param destination
175
     * @param reuseSecundum
176
     * @param secundum
177
     * @param tnomenclature 
178
     * @param alwaysUseDefaultClassification 
179
     * @return
180
     */
181
    private static TaxonXImportConfigurator prepareTaxonXImport(ICdmDataSource destination, boolean reuseSecundum, Reference<?> secundum, NomenclaturalCode tnomenclature, boolean alwaysUseDefaultClassification) {
182
        TaxonXImportConfigurator taxonxImportConfigurator = TaxonXImportConfigurator.NewInstance(destination);
183

    
184
        //taxonxImportConfigurator.setClassificationName(taxonxImportConfigurator.getSourceReferenceTitle());
185
        taxonxImportConfigurator.setCheck(check);
186
        taxonxImportConfigurator.setDbSchemaValidation(hbm2dll);
187
        taxonxImportConfigurator.setDoAutomaticParsing(true);
188

    
189
        taxonxImportConfigurator.setInteractWithUser(true);
190
        taxonxImportConfigurator.setNomenclaturalCode(tnomenclature);
191

    
192
        taxonxImportConfigurator.setAlwaysUseDefaultClassification(alwaysUseDefaultClassification);
193

    
194
        taxonxImportConfigurator.setKeepOriginalSecundum(reuseSecundum);
195
        if (!reuseSecundum) {
196
            taxonxImportConfigurator.setSecundum(secundum);
197
        }
198

    
199
        //taxonxImportConfigurator.setDoMatchTaxa(true);
200
        // taxonxImportConfigurator.setReUseTaxon(true);
201
        return taxonxImportConfigurator;
202
    }
203

    
204
    /**
205
     * @param filterType
206
     * @param modsList
207
     * @param documents
208
     * @param documentMap
209
     * @return
210
     */
211
    private static Map<String, List<URI>> loadTreatmentIfPresent(FilterType filterType, String[] filterList, Map<String, List<URI>> documentMap) {
212

    
213
    	Map<String, List<String>> docs = new HashMap<String, List<String>>();
214
        try {
215
            List<String> docList;
216
            String inputLine;
217
            String urlstr="";
218

    
219
            Map<String,List<String>> documents =  fillDocumentMap(filterType, filterList, urlstr);
220

    
221
//            checkTreatmentAvailable(documents, docs);
222
            docs = documents;
223

    
224
        } catch (Exception e1) {
225
            e1.printStackTrace();
226
        }
227

    
228
        //System.out.println(documents);
229
        for (String docId : docs.keySet()){
230
            List<String> treatments = new ArrayList<String>(new HashSet<String>(docs.get(docId)));
231

    
232
            Map<Integer, List<String>> startPages = new HashMap<Integer, List<String>>();
233
            for (String treatment:treatments) {
234
                List<String>tmplist = startPages.get(Integer.valueOf(treatment.split("---")[0]));
235
                if (tmplist == null) {
236
                    tmplist = new ArrayList<String>();
237
                }
238
                tmplist.add(treatment.split("---")[3]);
239
                startPages.put(Integer.valueOf(treatment.split("---")[0]),tmplist);
240
            }
241
            List<Integer> pages = new ArrayList<Integer>();
242
            pages.addAll(startPages.keySet());
243

    
244
            Collections.sort(pages);
245
            //            log.info(pages);
246

    
247
            log.info("Document "+docId+" should have "+treatments.size()+" treatments");
248
                List<URI> uritmp = documentMap.get(docId);
249
                if (uritmp == null) {
250
                    uritmp = new ArrayList<URI>();
251
                }
252
                for (int page:pages) {
253
                    for (String treatment: startPages.get(page)) {
254
                        try {
255
                            uritmp.add(new URL(treatment).toURI());
256
                        } catch (MalformedURLException e) {
257
                            // TODO Auto-generated catch block
258
                            e.printStackTrace();
259
                        } catch (URISyntaxException e) {
260
                            // TODO Auto-generated catch block
261
                            e.printStackTrace();
262
                        }
263
                    }
264
                }
265
                documentMap.put(docId, uritmp);
266
            }
267

    
268

    
269

    
270

    
271

    
272

    
273
        return documentMap;
274

    
275
    }
276

    
277
	private static void checkTreatmentAvailable(Map<String, List<String>> documents, Map<String, List<String>> docs)
278
			throws IOException, MalformedURLException {
279
		List<String> docList;
280
		String inputLine;
281
		for (String docId:documents.keySet()){
282
			URL url = new URL(plaziUrlModsDoc+docId);
283
			BufferedReader in = new BufferedReader(new InputStreamReader(url.openStream()));
284
		    while ((inputLine = in.readLine()) != null) {
285
		        if (inputLine.startsWith("<treatment ")){
286
		            String taxon = inputLine.split("taxon=\"")[1].split("\"")[0];
287
		            String docID=inputLine.split("docId=\"")[1].split("\"")[0];
288
		            String link=inputLine.split("link=\"")[1].split("\"")[0];
289
		            String pageStart = inputLine.split("startPage=\"")[1].split("\"")[0];
290
		            String pageEnd = inputLine.split("endPage=\"")[1].split("\"")[0];
291
		            docList = documents.get(docID);
292
		            if (docList == null) {
293
		                docList = new ArrayList<String>();
294
		            }
295
		            docList.add(pageStart+"---" + pageEnd + "---" + taxon + "---"+link);
296
		            docs.put(docID,docList);
297
		        }
298
		    }
299
		}
300
	}
301

    
302
	private static Map<String, List<String>> fillDocumentMap(FilterType filterType,
303
			String[] filterList, String urlstr) 
304
					throws MalformedURLException, IOException {
305
		
306
		Map<String, List<String>> documents = new HashMap<String, List<String>>();
307
		List<String> docList;
308
		String inputLine;
309
		for(String filter : filterList){
310
		    //        plaziUrl=plaziUrl+"Eupolybothrus";
311
		    if (filterType == FilterType.MODS) {
312
		        urlstr=plaziUrlModsDoc + filter;
313
		    }else if (filterType == FilterType.TAXON) {
314
		        urlstr=plaziUrlTaxName + filter;
315
		    }
316
		    log.info("URLstr: " + urlstr);
317

    
318
		    URL plaziURL = new URL(urlstr);
319
		    BufferedReader in = new BufferedReader(new InputStreamReader(plaziURL.openStream()));
320

    
321

    
322
		    //TODO lastUpdate field
323
		    //            if(!plaziNotServer){
324
		    while ((inputLine = in.readLine()) != null) {
325
		        System.out.println(inputLine);
326
		        if (inputLine.startsWith("<treatment ")){
327
		            String taxon = inputLine.split("taxon=\"")[1].split("\"")[0];
328
		            String docID=inputLine.split("docId=\"")[1].split("\"")[0];
329
		            System.out.println("docID: "+docID);
330
		            
331
		            String link=inputLine.split("link=\"")[1].split("\"")[0];
332
		            String pageStart = inputLine.split("startPage=\"")[1].split("\"")[0];
333
		            String pageEnd = inputLine.split("endPage=\"")[1].split("\"")[0];
334
		            docList = documents.get(docID);
335
		            if (docList == null) {
336
		                docList = new ArrayList<String>();
337
		            }
338
		            docList.add(pageStart+"---" + pageEnd + "---"+taxon+"---"+link);
339
		            documents.put(docID,docList);
340
		        }
341
		    }
342
		}
343
		System.out.println("documents created");
344
		
345
		return documents;
346
	}
347

    
348
    /**
349
     * @param document
350
     * @return
351
     */
352
    private static boolean doImportDocument(String document, int nbtreatments) {
353

    
354
        if (nbtreatments>400) {
355
            return false;
356
        }
357
        if (document.equalsIgnoreCase("1314-2828-2")) { //this is a mix of several publications..
358
            return false;
359
        }
360
        if (document.equalsIgnoreCase("21367")) { //600treatments for ants..
361
            return false;
362
        }
363
        if (document.equalsIgnoreCase("1314-2828-1")) { //900treatments for eupoly..
364
            return false;
365
        }
366
        return true;
367
      /*
368
        //        List<String> docDone = Arrays.asList(new String[]{"3540555099", "0910-2878-5652", "5012-9059-4108",
369
        //                "3784-0748-2261","3-201-00728-5", "FloNuttDuWin1838", "FlNordica_chenop","2580-1363-7530",
370
        //                "1842460692","5161-7797-8064","FlCaboVerde_Chen","2819-9661-8339","2626-3794-9273"});//,
371
        //               // "8776-7797-8303"});
372
        //        if (docDone.contains(document)) {
373
        //            return false;
374
        //        }
375

    
376
        JTextArea textArea = new JTextArea("Should this document be imported ("+nbtreatments+")? \n'"+document+"'");
377
        JScrollPane scrollPane = new JScrollPane(textArea);
378
        textArea.setLineWrap(true);
379
        textArea.setWrapStyleWord(true);
380
        scrollPane.setPreferredSize( new Dimension( 700, 70 ) );
381

    
382
        //        JFrame frame = new JFrame("I have a question");
383
        //        frame.setDefaultCloseOperation(JFrame.EXIT_ON_CLOSE);
384
        int s = JOptionPane.showConfirmDialog(null, scrollPane);
385
        if (s==0) {
386
            return true;
387
        } else {
388
            return false;
389
        }
390
        */
391
    }
392

    
393
    /**
394
     * @return
395
     */
396
    private static boolean askIfReuseSecundum() {
397
        //        logger.info("getFullReference for "+ name);
398
        JTextArea textArea = new JTextArea("Reuse the secundum present in the current classification? " +
399
                "\n Click Yes to reuse it, click No or Cancel to create a new one.\nA default secundum will be created if needed.");
400
        JScrollPane scrollPane = new JScrollPane(textArea);
401
        textArea.setLineWrap(true);
402
        textArea.setWrapStyleWord(true);
403
        scrollPane.setPreferredSize( new Dimension( 700, 70 ) );
404

    
405
        //        JFrame frame = new JFrame("I have a question");
406
        //        frame.setDefaultCloseOperation(JFrame.EXIT_ON_CLOSE);
407
        int s = JOptionPane.showConfirmDialog(null, scrollPane);
408
        if (s==0) {
409
            return true;
410
        } else {
411
            return false;
412
        }
413
    }
414

    
415
    /**
416
     * @return
417
     */
418
    private static Reference<?> askForSecundum() {
419
        //        logger.info("getFullReference for "+ name);
420
        JTextArea textArea = new JTextArea("Enter the secundum name");
421
        JScrollPane scrollPane = new JScrollPane(textArea);
422
        textArea.setLineWrap(true);
423
        textArea.setWrapStyleWord(true);
424
        scrollPane.setPreferredSize( new Dimension( 700, 100 ) );
425

    
426
        //        JFrame frame = new JFrame("I have a question");
427
        //        frame.setDefaultCloseOperation(JFrame.EXIT_ON_CLOSE);
428
        String s = (String) JOptionPane.showInputDialog(
429
                null,
430
                scrollPane,
431
                "",
432
                JOptionPane.PLAIN_MESSAGE,
433
                null,
434
                null,
435
                null);
436
        Reference<?> ref = ReferenceFactory.newGeneric();
437
        ref.setTitle(s);
438
        return ref;
439
    }
440

    
441

    
442
}
(3-3/3)