Project

General

Profile

Download (18.2 KB) Statistics
| Branch: | Revision:
/**
 * Copyright (C) 2007 EDIT
 * European Distributed Institute of Taxonomy
 * http://www.e-taxonomy.eu
 *
 * The contents of this file are subject to the Mozilla Public License Version 1.1
 * See LICENSE.TXT at the top of this package for the full license terms.
 */
9

    
10
package eu.etaxonomy.cdm.app.proibiosphere;
11
import java.awt.Dimension;
12
import java.io.BufferedReader;
13
import java.io.IOException;
14
import java.io.InputStreamReader;
15
import java.net.MalformedURLException;
16
import java.net.URI;
17
import java.net.URISyntaxException;
18
import java.net.URL;
19
import java.util.ArrayList;
20
import java.util.Collections;
21
import java.util.HashMap;
22
import java.util.HashSet;
23
import java.util.List;
24
import java.util.Map;
25
import java.util.Scanner;
26

    
27
import javax.swing.JOptionPane;
28
import javax.swing.JScrollPane;
29
import javax.swing.JTextArea;
30

    
31
import org.apache.log4j.Logger;
32

    
33
import eu.etaxonomy.cdm.app.common.CdmDestinations;
34
import eu.etaxonomy.cdm.database.DbSchemaValidation;
35
import eu.etaxonomy.cdm.database.ICdmDataSource;
36
import eu.etaxonomy.cdm.io.common.CdmDefaultImport;
37
import eu.etaxonomy.cdm.io.common.IImportConfigurator.CHECK;
38
import eu.etaxonomy.cdm.io.taxonx2013.TaxonXImportConfigurator;
39
import eu.etaxonomy.cdm.model.name.NomenclaturalCode;
40
import eu.etaxonomy.cdm.model.reference.Reference;
41
import eu.etaxonomy.cdm.model.reference.ReferenceFactory;
42

    
43

    
44

    
45
public class TaxonXImportLauncher {
46
    private static final Logger log = Logger.getLogger(TaxonXImportLauncher.class);
47
    //    private static final Logger log = Logger.getLogger(CdmEntityDaoBase.class);
48

    
49
    //database validation status (create, update, validate ...)
50
    static DbSchemaValidation hbm2dll = DbSchemaValidation.CREATE;
51
//    static final ICdmDataSource cdmDestination = CdmDestinations.cdm_test_local_mysql();
52
//    static final ICdmDataSource cdmDestination = CdmDestinations.localH2();
53
  static final ICdmDataSource cdmDestination = CdmDestinations.cdm_test_local_mysql_test();
54
//    static final ICdmDataSource cdmDestination = CdmDestinations.cdm_production_piB("piB_nephrolepis");
55
//    static final ICdmDataSource cdmDestination = CdmDestinations.cdm_local_piB("guianas");
56

    
57
    static final CHECK check = CHECK.IMPORT_WITHOUT_CHECK;
58

    
59
    private enum FilterType{MODS, TAXON};
60

    
61

    
62
    static String plaziUrlTaxName = "http://plazi.cs.umb.edu/GgServer/search?taxonomicName.isNomenclature=true&taxonomicName.exactMatch=true&indexName=0&subIndexName=taxonomicName&subIndexName=MODS&minSubResultSize=1&searchMode=index&resultFormat=xml&xsltUrl=http%3A%2F%2Fplazi.cs.umb.edu%2FGgServer%2Fresources%2FsrsWebPortalData%2FCdmSyncTreatmentList.xslt&taxonomicName.taxonomicName=";
63
    static String plaziUrlModsDoc = "http://plazi.cs.umb.edu/GgServer/search?taxonomicName.isNomenclature=true&taxonomicName.exactMatch=true&indexName=0&subIndexName=taxonomicName&subIndexName=MODS&minSubResultSize=1&searchMode=index&resultFormat=xml&xsltUrl=http%3A%2F%2Fplazi.cs.umb.edu%2FGgServer%2Fresources%2FsrsWebPortalData%2FCdmSyncTreatmentList.xslt&MODS.ModsDocID=";
64

    
65

    
66

    
67
    public static void main(String[] args) {
68
    	String[] spiderModsList = new String[] {"zt03768p138","zt03750p196","zt03666p193","zt03664p068","zt03646p592","zt03507p056","zt03415p057","zt03383p038","zt03305p052","zt03228p068","zt03131p034","zt02963p068","zt02883p068","zt02814p018","zt02739p050","zt02730p043","zt02637p054","zt02593p127","zt02551p068","zt02534p036","zt02526p053","zt02427p035","zt02361p012","zt02267p068","zt02223p047","zt01826p058","zt01775p024","zt01744p040","zt01529p060","zt01004p028","zt00904","zt00872","zt00619","zt00109","DippenaarSchoeman1989Penestominae","Simon1902Cribellates","Simon1903Penestominae","Lehtinen1967CribellatePenestominae"};
69

    
70
    	String[] taxonList = new String[]  {"Campylopus"}; //{"Eupolybothrus","Polybothrus"}, Chenopodium, Lactarius, Campylopus, Nephrolepis, Comaroma (spiders)
71
//       /*ants Anochetus*/ String[] modsList = new String[] {"3924" /*, "3743", "4375", "6757", "6752", "3481", "21401_fisher_smith_plos_2008", "2592", "4096", "6877", "6192", "8071"  */};
72
//        String[] modsList = new String[] {"21367", "21365", "8171", "6877", "21820", "3641", "6757"};
73
//        /*auch ants*/        debut="3743", "3628", "4022", "3994", "3603", "8070", "4001", "4071", "3948", "3481"};
74
//        suite: , };//,"3540555099"};
75
//        modsList = new String[] {"Zapparoli-1986-Eupolybothrus-fasciatus"};
76
//    	taxonList = spiderModsList;
77

    
78
    	FilterType filterType = FilterType.TAXON;
79

    
80
    	NomenclaturalCode tnomenclature = NomenclaturalCode.ICNAFP;
81

    
82
        String defaultClassification= null;// "Nephrolepis";
83
        boolean alwaysUseDefaultClassification = false;
84

    
85
        boolean useOldUnparsedSynonymExtraction = false;
86

    
87

    
88

    
89

    
90
        Map<String,List<URI>>documentMap = new HashMap<String, List<URI>>();
91

    
92
        /*HOW TO HANDLE SECUNDUM REFERENCE*/
93
        boolean reuseSecundum = askIfReuseSecundum();
94
        Reference secundum = null;
95
        if (!reuseSecundum) {
96
            secundum = askForSecundum();
97
        }
98

    
99
        loadTreatmentIfPresent(filterType,taxonList, documentMap);
100
//        loadTreatmentIfPresent(FilterType.MODS, modsList, documents,documentMap);
101

    
102
        CdmDefaultImport<TaxonXImportConfigurator> taxonImport = new CdmDefaultImport<TaxonXImportConfigurator>();
103

    
104
        ICdmDataSource destination = cdmDestination;
105
        TaxonXImportConfigurator config = prepareTaxonXImport(destination,reuseSecundum, secundum, tnomenclature, alwaysUseDefaultClassification);
106
        config.setUseOldUnparsedSynonymExtraction(useOldUnparsedSynonymExtraction);
107

    
108
        config.setImportClassificationName(defaultClassification);
109
        log.info("Start import from  TaxonX Data");
110

    
111
        config.setLastImport(false);
112

    
113
        int j=0;
114
        for (String document : documentMap.keySet()){
115
            j++;
116
            if (doImportDocument(document, documentMap.get(document).size())){
117
                int i=0;
118
                for (URI source: documentMap.get(document)){
119
                    System.out.println("START "+document+" "+i+" ("+(documentMap.get(document)).size()+"): "+source.getPath());
120
                    i++;
121
                    if (j==documentMap.keySet().size() && i==documentMap.get(document).size()) {
122
                        config.setLastImport(true);
123
                    }
124
                    prepareReferenceAndSource(config,source);
125
                     //   taxonxImportConfigurator.setTaxonReference(null);
126
                    taxonImport.invoke(config);
127
                    log.info("End import from SpecimenData ("+ source.toString() + ")...");
128

    
129
                    //          //deduplicate
130
                    //            ICdmRepository app = taxonImport.getCdmAppController();
131
                    //            int count = app.getAgentService().deduplicate(Person.class, null, null);
132
                    //            logger.warn("Deduplicated " + count + " persons.");
133
                    //            count = app.getReferenceService().deduplicate(Reference.class, null, null);
134
                    //            logger.warn("Deduplicated " + count + " references.");
135
                }
136
            }
137
        }
138
    }
139

    
140

    
141
    private static String askQuestion(String question){
142
        Scanner scan = new Scanner(System.in);
143
        System.out.println(question);
144
        String index = scan.nextLine();
145
        return index;
146
    }
147

    
148
    /**
149
     * @param taxonxImportConfigurator
150
     * @param source
151
     *
152
     */
153
    private static void prepareReferenceAndSource(TaxonXImportConfigurator taxonxImportConfigurator, URI source) {
154
        Reference reference = ReferenceFactory.newGeneric();
155
        //            String tref = askQuestion("Import source? (ie Plazi document ID)");
156
        String tref="PLAZI - "+source.getPath().split("/")[source.getPath().split("/").length-1];
157
        reference.setTitleCache(tref,true);
158
        reference.setTitle(tref);
159

    
160
        taxonxImportConfigurator.setSourceReference(reference);
161
        TaxonXImportConfigurator.setSourceRef(reference);
162

    
163
        Reference referenceUrl = ReferenceFactory.newWebPage();
164
        referenceUrl.setTitleCache(source.toString(), true);
165
        referenceUrl.setTitle(source.toString());
166
        reference.setUri(source);
167
        referenceUrl.generateTitle();
168

    
169
        taxonxImportConfigurator.addOriginalSource(referenceUrl);
170
        taxonxImportConfigurator.setSource(source);
171
    }
172

    
173
    /**
174
     * @param destination
175
     * @param reuseSecundum
176
     * @param secundum
177
     * @param tnomenclature
178
     * @param alwaysUseDefaultClassification
179
     * @return
180
     */
181
    private static TaxonXImportConfigurator prepareTaxonXImport(ICdmDataSource destination, boolean reuseSecundum, Reference secundum, NomenclaturalCode tnomenclature, boolean alwaysUseDefaultClassification) {
182
        TaxonXImportConfigurator taxonxImportConfigurator = TaxonXImportConfigurator.NewInstance(destination);
183

    
184
        //taxonxImportConfigurator.setClassificationName(taxonxImportConfigurator.getSourceReferenceTitle());
185
        taxonxImportConfigurator.setCheck(check);
186
        taxonxImportConfigurator.setDbSchemaValidation(hbm2dll);
187
        taxonxImportConfigurator.setDoAutomaticParsing(true);
188

    
189
        taxonxImportConfigurator.setInteractWithUser(true);
190
        taxonxImportConfigurator.setNomenclaturalCode(tnomenclature);
191

    
192
        taxonxImportConfigurator.setAlwaysUseDefaultClassification(alwaysUseDefaultClassification);
193

    
194
        taxonxImportConfigurator.setKeepOriginalSecundum(reuseSecundum);
195
        if (!reuseSecundum) {
196
            taxonxImportConfigurator.setSecundum(secundum);
197
        }
198

    
199
        //taxonxImportConfigurator.setDoMatchTaxa(true);
200
        // taxonxImportConfigurator.setReUseTaxon(true);
201
        return taxonxImportConfigurator;
202
    }
203

    
204
    /**
205
     * @param filterType
206
     * @param modsList
207
     * @param documents
208
     * @param documentMap
209
     * @return
210
     */
211
    private static Map<String, List<URI>> loadTreatmentIfPresent(FilterType filterType, String[] filterList, Map<String, List<URI>> documentMap) {
212

    
213
    	Map<String, List<String>> docs = new HashMap<String, List<String>>();
214
        try {
215
            List<String> docList;
216
            String inputLine;
217
            String urlstr="";
218

    
219
            Map<String,List<String>> documents =  fillDocumentMap(filterType, filterList, urlstr);
220

    
221
//            checkTreatmentAvailable(documents, docs);
222
            docs = documents;
223

    
224
        } catch (Exception e1) {
225
            e1.printStackTrace();
226
        }
227

    
228
        //System.out.println(documents);
229
        for (String docId : docs.keySet()){
230
            List<String> treatments = new ArrayList<>(new HashSet<>(docs.get(docId)));
231

    
232
            Map<Integer, List<String>> startPages = new HashMap<>();
233
            for (String treatment:treatments) {
234
                List<String>tmplist = startPages.get(Integer.valueOf(treatment.split("---")[0]));
235
                if (tmplist == null) {
236
                    tmplist = new ArrayList<>();
237
                }
238
                tmplist.add(treatment.split("---")[3]);
239
                startPages.put(Integer.valueOf(treatment.split("---")[0]),tmplist);
240
            }
241
            List<Integer> pages = new ArrayList<>();
242
            pages.addAll(startPages.keySet());
243

    
244
            Collections.sort(pages);
245
            //            log.info(pages);
246

    
247
            log.info("Document "+docId+" should have "+treatments.size()+" treatments");
248
                List<URI> uritmp = documentMap.get(docId);
249
                if (uritmp == null) {
250
                    uritmp = new ArrayList<>();
251
                }
252
                for (int page:pages) {
253
                    for (String treatment: startPages.get(page)) {
254
                        try {
255
                            uritmp.add(new URL(treatment).toURI());
256
                        } catch (MalformedURLException e) {
257
                            // TODO Auto-generated catch block
258
                            e.printStackTrace();
259
                        } catch (URISyntaxException e) {
260
                            // TODO Auto-generated catch block
261
                            e.printStackTrace();
262
                        }
263
                    }
264
                }
265
                documentMap.put(docId, uritmp);
266
            }
267

    
268
        return documentMap;
269

    
270
    }
271

    
272
	private static void checkTreatmentAvailable(Map<String, List<String>> documents, Map<String, List<String>> docs)
273
			throws IOException, MalformedURLException {
274
		List<String> docList;
275
		String inputLine;
276
		for (String docId:documents.keySet()){
277
			URL url = new URL(plaziUrlModsDoc+docId);
278
			BufferedReader in = new BufferedReader(new InputStreamReader(url.openStream()));
279
		    while ((inputLine = in.readLine()) != null) {
280
		        if (inputLine.startsWith("<treatment ")){
281
		            String taxon = inputLine.split("taxon=\"")[1].split("\"")[0];
282
		            String docID=inputLine.split("docId=\"")[1].split("\"")[0];
283
		            String link=inputLine.split("link=\"")[1].split("\"")[0];
284
		            String pageStart = inputLine.split("startPage=\"")[1].split("\"")[0];
285
		            String pageEnd = inputLine.split("endPage=\"")[1].split("\"")[0];
286
		            docList = documents.get(docID);
287
		            if (docList == null) {
288
		                docList = new ArrayList<String>();
289
		            }
290
		            docList.add(pageStart+"---" + pageEnd + "---" + taxon + "---"+link);
291
		            docs.put(docID,docList);
292
		        }
293
		    }
294
		}
295
	}
296

    
297
	private static Map<String, List<String>> fillDocumentMap(FilterType filterType,
298
			String[] filterList, String urlstr)
299
					throws MalformedURLException, IOException {
300

    
301
		Map<String, List<String>> documents = new HashMap<>();
302
		List<String> docList;
303
		String inputLine;
304
		for(String filter : filterList){
305
		    //        plaziUrl=plaziUrl+"Eupolybothrus";
306
		    if (filterType == FilterType.MODS) {
307
		        urlstr=plaziUrlModsDoc + filter;
308
		    }else if (filterType == FilterType.TAXON) {
309
		        urlstr=plaziUrlTaxName + filter;
310
		    }
311
		    log.info("URLstr: " + urlstr);
312

    
313
		    URL plaziURL = new URL(urlstr);
314
		    BufferedReader in = new BufferedReader(new InputStreamReader(plaziURL.openStream()));
315

    
316
		    //TODO lastUpdate field
317
		    //            if(!plaziNotServer){
318
		    while ((inputLine = in.readLine()) != null) {
319
		        System.out.println(inputLine);
320
		        if (inputLine.startsWith("<treatment ")){
321
		            String taxon = inputLine.split("taxon=\"")[1].split("\"")[0];
322
		            String docID=inputLine.split("docId=\"")[1].split("\"")[0];
323
		            System.out.println("docID: "+docID);
324

    
325
		            String link=inputLine.split("link=\"")[1].split("\"")[0];
326
		            String pageStart = inputLine.split("startPage=\"")[1].split("\"")[0];
327
		            String pageEnd = inputLine.split("endPage=\"")[1].split("\"")[0];
328
		            docList = documents.get(docID);
329
		            if (docList == null) {
330
		                docList = new ArrayList<String>();
331
		            }
332
		            docList.add(pageStart+"---" + pageEnd + "---"+taxon+"---"+link);
333
		            documents.put(docID,docList);
334
		        }
335
		    }
336
		}
337
		System.out.println("documents created");
338

    
339
		return documents;
340
	}
341

    
342
    private static boolean doImportDocument(String document, int nbtreatments) {
343

    
344
        if (nbtreatments>400) {
345
            return false;
346
        }
347
        if (document.equalsIgnoreCase("1314-2828-2")) { //this is a mix of several publications..
348
            return false;
349
        }
350
        if (document.equalsIgnoreCase("21367")) { //600treatments for ants..
351
            return false;
352
        }
353
        if (document.equalsIgnoreCase("1314-2828-1")) { //900treatments for eupoly..
354
            return false;
355
        }
356
        return true;
357
      /*
358
        //        List<String> docDone = Arrays.asList(new String[]{"3540555099", "0910-2878-5652", "5012-9059-4108",
359
        //                "3784-0748-2261","3-201-00728-5", "FloNuttDuWin1838", "FlNordica_chenop","2580-1363-7530",
360
        //                "1842460692","5161-7797-8064","FlCaboVerde_Chen","2819-9661-8339","2626-3794-9273"});//,
361
        //               // "8776-7797-8303"});
362
        //        if (docDone.contains(document)) {
363
        //            return false;
364
        //        }
365

    
366
        JTextArea textArea = new JTextArea("Should this document be imported ("+nbtreatments+")? \n'"+document+"'");
367
        JScrollPane scrollPane = new JScrollPane(textArea);
368
        textArea.setLineWrap(true);
369
        textArea.setWrapStyleWord(true);
370
        scrollPane.setPreferredSize( new Dimension( 700, 70 ) );
371

    
372
        //        JFrame frame = new JFrame("I have a question");
373
        //        frame.setDefaultCloseOperation(JFrame.EXIT_ON_CLOSE);
374
        int s = JOptionPane.showConfirmDialog(null, scrollPane);
375
        if (s==0) {
376
            return true;
377
        } else {
378
            return false;
379
        }
380
        */
381
    }
382

    
383
    private static boolean askIfReuseSecundum() {
384
        //        logger.info("getFullReference for "+ name);
385
        JTextArea textArea = new JTextArea("Reuse the secundum present in the current classification? " +
386
                "\n Click Yes to reuse it, click No or Cancel to create a new one.\nA default secundum will be created if needed.");
387
        JScrollPane scrollPane = new JScrollPane(textArea);
388
        textArea.setLineWrap(true);
389
        textArea.setWrapStyleWord(true);
390
        scrollPane.setPreferredSize( new Dimension( 700, 70 ) );
391

    
392
        //        JFrame frame = new JFrame("I have a question");
393
        //        frame.setDefaultCloseOperation(JFrame.EXIT_ON_CLOSE);
394
        int s = JOptionPane.showConfirmDialog(null, scrollPane);
395
        if (s==0) {
396
            return true;
397
        } else {
398
            return false;
399
        }
400
    }
401

    
402
    private static Reference askForSecundum() {
403
        //        logger.info("getFullReference for "+ name);
404
        JTextArea textArea = new JTextArea("Enter the secundum name");
405
        JScrollPane scrollPane = new JScrollPane(textArea);
406
        textArea.setLineWrap(true);
407
        textArea.setWrapStyleWord(true);
408
        scrollPane.setPreferredSize( new Dimension( 700, 100 ) );
409

    
410
        //        JFrame frame = new JFrame("I have a question");
411
        //        frame.setDefaultCloseOperation(JFrame.EXIT_ON_CLOSE);
412
        String s = (String) JOptionPane.showInputDialog(
413
                null,
414
                scrollPane,
415
                "",
416
                JOptionPane.PLAIN_MESSAGE,
417
                null,
418
                null,
419
                null);
420
        Reference ref = ReferenceFactory.newGeneric();
421
        ref.setTitle(s);
422
        return ref;
423
    }
424
}
(3-3/3)