Project

General

Profile

Download (14.9 KB) Statistics
| Branch: | Revision:
/**
 * Copyright (C) 2007 EDIT
 * European Distributed Institute of Taxonomy
 * http://www.e-taxonomy.eu
 *
 * The contents of this file are subject to the Mozilla Public License Version 1.1
 * See LICENSE.TXT at the top of this package for the full license terms.
 */
9

    
10
package eu.etaxonomy.cdm.app.proibiosphere;
11
import java.awt.Dimension;
import java.io.BufferedReader;
import java.io.Closeable;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Scanner;

import javax.swing.JOptionPane;
import javax.swing.JScrollPane;
import javax.swing.JTextArea;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;

import org.apache.log4j.Logger;
import org.w3c.dom.Document;

import eu.etaxonomy.cdm.app.common.CdmDestinations;
import eu.etaxonomy.cdm.database.DbSchemaValidation;
import eu.etaxonomy.cdm.database.ICdmDataSource;
import eu.etaxonomy.cdm.io.common.CdmDefaultImport;
import eu.etaxonomy.cdm.io.common.IImportConfigurator.CHECK;
import eu.etaxonomy.cdm.io.taxonx2013.TaxonXImportConfigurator;
import eu.etaxonomy.cdm.model.name.NomenclaturalCode;
import eu.etaxonomy.cdm.model.reference.Reference;
import eu.etaxonomy.cdm.model.reference.ReferenceFactory;
48

    
49

    
50

    
51
public class TaxonXImportLauncher {
52
    private static final Logger log = Logger.getLogger(TaxonXImportLauncher.class);
53
    //    private static final Logger log = Logger.getLogger(CdmEntityDaoBase.class);
54

    
55
    //database validation status (create, update, validate ...)
56
    static DbSchemaValidation hbm2dll = DbSchemaValidation.VALIDATE;
57
    static final ICdmDataSource cdmDestination = CdmDestinations.proibiosphere_local();
58

    
59
    static final CHECK check = CHECK.IMPORT_WITHOUT_CHECK;
60

    
61

    
62
    private static String askQuestion(String question){
63
        Scanner scan = new Scanner(System.in);
64
        System.out.println(question);
65
        String index = scan.nextLine();
66
        return index;
67
    }
68

    
69
    public static void main(String[] args) {
70

    
71
        String plaziUrl = "http://plazi.cs.umb.edu/GgServer/search?taxonomicName.isNomenclature=true&taxonomicName.exactMatch=true&indexName=0&subIndexName=taxonomicName&subIndexName=MODS&minSubResultSize=1&searchMode=index&resultFormat=xml&xsltUrl=http%3A%2F%2Fplazi.cs.umb.edu%2FGgServer%2FsrsWebPortalData%2FCdmSyncTreatmentList.xslt&taxonomicName.taxonomicName=";
72
        String plaziUrlDoc = "http://plazi.cs.umb.edu/GgServer/search?taxonomicName.isNomenclature=true&taxonomicName.exactMatch=true&indexName=0&subIndexName=taxonomicName&subIndexName=MODS&minSubResultSize=1&searchMode=index&resultFormat=xml&xsltUrl=http%3A%2F%2Fplazi.cs.umb.edu%2FGgServer%2FsrsWebPortalData%2FCdmSyncTreatmentList.xslt&MODS.ModsDocID=";
73
        //        String plaziUrl = "http://plazi.cs.umb.edu/GgServer/xslt/E01DD5BE427421156E0C0BAC56389E0D?xsltUrl=http%3A%2F%2Fplazi.cs.umb.edu%2FGgServer%2FsrsWebPortalData%2FLinkers%2FXmlDocumentLinkerData%2Fgg2taxonx.xsl";
74
        List<String> sourcesStr =  new ArrayList<String>();
75
        boolean plaziNotServer=false;
76

    
77
        Map<String,List<String>> documents = new HashMap<String,List<String>>();
78
//        plaziUrl=plaziUrl+"Chenopodium";
79
       plaziUrl=plaziUrlDoc+"0910-2878-5652";
80

    
81
        /*HOW TO HANDLE SECUNDUM REFERENCE*/
82
        boolean reuseSecundum = askIfReuseSecundum();
83
        Reference<?> secundum = null;
84
        if (!reuseSecundum) {
85
            secundum = askForSecundum();
86
        }
87

    
88
        String tnomenclature = "ICBN";
89
        URL plaziURL;
90
        try {
91
            plaziURL = new URL(plaziUrl);
92
            BufferedReader in = new BufferedReader(new InputStreamReader(plaziURL.openStream()));
93

    
94
            List<String> docList;
95
            String inputLine;
96
            String docID;
97
            String pageStart;
98
            String pageEnd;
99
            String taxon;
100
            String link;
101
            //TODO lastUpdate field
102
            if(!plaziNotServer){
103
                while ((inputLine = in.readLine()) != null) {
104
                    if (inputLine.startsWith("<treatment ")){
105
                        taxon = inputLine.split("taxon=\"")[1].split("\"")[0];
106
                        docID=inputLine.split("docId=\"")[1].split("\"")[0];
107
                        link=inputLine.split("link=\"")[1].split("\"")[0];
108
                        pageStart = inputLine.split("startPage=\"")[1].split("\"")[0];
109
                        pageEnd = inputLine.split("endPage=\"")[1].split("\"")[0];
110
                        docList = documents.get(docID);
111
                        if (docList == null) {
112
                            docList = new ArrayList<String>();
113
                        }
114
                        docList.add(pageStart+"---"+pageEnd+"---"+taxon+"---"+link);
115
                        documents.put(docID,docList);
116
                    }
117
                }
118
            }
119
            for (String docId:documents.keySet()){
120
                in = new BufferedReader(new InputStreamReader(new URL(plaziUrlDoc+docId).openStream()));
121
                while ((inputLine = in.readLine()) != null) {
122
                    if (inputLine.startsWith("<treatment ")){
123
                        taxon = inputLine.split("taxon=\"")[1].split("\"")[0];
124
                        docID=inputLine.split("docId=\"")[1].split("\"")[0];
125
                        link=inputLine.split("link=\"")[1].split("\"")[0];
126
                        pageStart = inputLine.split("startPage=\"")[1].split("\"")[0];
127
                        pageEnd = inputLine.split("endPage=\"")[1].split("\"")[0];
128
                        docList = documents.get(docID);
129
                        if (docList == null) {
130
                            docList = new ArrayList<String>();
131
                        }
132
                        docList.add(pageStart+"---"+pageEnd+"---"+taxon+"---"+link);
133
                        documents.put(docID,docList);
134
                    }
135
                }
136
            }
137
            if(plaziNotServer) {
138
                sourcesStr.add(plaziUrl);
139
            }
140
            in.close();
141
        } catch (MalformedURLException e1) {
142
            // TODO Auto-generated catch block
143
            e1.printStackTrace();
144
        } catch (IOException e) {
145
            // TODO Auto-generated catch block
146
            e.printStackTrace();
147
        }
148

    
149
        //        System.exit(0);
150

    
151
        //        sourcesStr.add("/home/pkelbert/Documents/Proibiosphere/ChenopodiumXML/1362148061170_Chenopodium_K_hn_U_1993_tx.xml");
152

    
153
        for (String docId : documents.keySet()){
154
            /*remove documents bad quality*/
155
            log.info(docId);
156
//            if (!docId.equalsIgnoreCase("3891-7797-6564")){
157
                            log.info("document "+docId);
158
                List<String> treatments = new ArrayList<String>(new HashSet<String>(documents.get(docId)));
159

    
160
                Map<Integer, List<String>> startPages = new HashMap<Integer, List<String>>();
161
                for (String treatment:treatments) {
162
                    List<String>tmplist = startPages.get(Integer.valueOf(treatment.split("---")[0]));
163
                    if (tmplist == null) {
164
                        tmplist = new ArrayList<String>();
165
                    }
166
                    tmplist.add(treatment.split("---")[3]);
167
                    startPages.put(Integer.valueOf(treatment.split("---")[0]),tmplist);
168
                }
169
                List<Integer> pages = new ArrayList<Integer>();
170
                pages.addAll(startPages.keySet());
171

    
172
                Collections.sort(pages);
173
                //            log.info(pages);
174

    
175
                log.info("Document "+docId+" should have "+treatments.size()+" treatments");
176
                int cnt=0;
177
                for (String source:treatments){
178
                    DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
179
                    DocumentBuilder builder;
180
                    URL url;
181

    
182
                    try {
183
                        builder = factory.newDocumentBuilder();
184
                        url = new URL(source.split("---")[3]);
185
                        Object o = url.getContent();
186
                        InputStream is = (InputStream) o;
187
                        Document document = builder.parse(is);
188
                        cnt++;
189
                    }catch(Exception e){
190
                        //  e.printStackTrace();
191
                        log.warn(e);
192
                    }
193
                }
194
                log.info("Document "+docId+" has "+cnt+" treatments available");
195
                if(treatments.size() != cnt)
196
                {
197
                    File file = new File("/home/pkelbert/Bureau/urlTaxonXToDoLater.txt");
198
                    FileWriter writer;
199
                    try {
200
                        writer = new FileWriter(file ,true);
201
                        writer.write(docId+"\n");
202
                        writer.flush();
203
                        writer.close();
204
                    } catch (IOException e1) {
205
                        // TODO Auto-generated catch block
206
                        e1.printStackTrace();
207
                    }
208

    
209
                }
210
                else{
211
                    for (int page:pages) {
212
                        for (String treatment: startPages.get(page)) {
213
                            sourcesStr.add(treatment);
214
                        }
215
                    }
216
                }
217
//            }
218
        }
219
        log.info("NB SOURCES : "+sourcesStr.size());
220
//        sourcesStr = new ArrayList<String>();
221
//        sourcesStr.add("http://plazi.cs.umb.edu/exist/rest/db/taxonx_docs/cdmSync/4E7390346C05780D32283CCF6F5E4431_tx.xml");
222

    
223
        List<URI> sources = new ArrayList<URI>();
224
        for (String src: sourcesStr){
225
            URI uri;
226
            try {
227
                uri = new URL(src).toURI();
228
                sources.add(new URI(uri.toString()));
229
            } catch (MalformedURLException e1) {
230
                // TODO Auto-generated catch block
231
                e1.printStackTrace();
232
            } catch (URISyntaxException e1) {
233
                // TODO Auto-generated catch block
234
                e1.printStackTrace();
235
            }
236
        }
237

    
238
        log.info("Start import from  TaxonX Data");
239

    
240
        ICdmDataSource destination = cdmDestination;
241
        TaxonXImportConfigurator taxonxImportConfigurator = TaxonXImportConfigurator.NewInstance(destination);
242

    
243
        //        taxonxImportConfigurator.setClassificationName(taxonxImportConfigurator.getSourceReferenceTitle());
244
        taxonxImportConfigurator.setCheck(check);
245
        taxonxImportConfigurator.setDbSchemaValidation(hbm2dll);
246
        taxonxImportConfigurator.setDoAutomaticParsing(true);
247

    
248
        // invoke import
249
        CdmDefaultImport<TaxonXImportConfigurator> taxonImport = new CdmDefaultImport<TaxonXImportConfigurator>();
250

    
251
        taxonxImportConfigurator.setKeepOriginalSecundum(reuseSecundum);
252
        if (!reuseSecundum) {
253
            taxonxImportConfigurator.setSecundum(secundum);
254
        }
255

    
256

    
257
//        taxonxImportConfigurator.setDoMatchTaxa(true);
258
//        taxonxImportConfigurator.setReUseTaxon(true);
259

    
260
        for (URI source:sources){
261
            log.info("START : "+source.getPath());
262
            taxonxImportConfigurator.setSource(source);
263

    
264
            Reference<?> reference = ReferenceFactory.newGeneric();
265
            //            String tref = askQuestion("Import source? (ie Plazi document ID)");
266
            String tref="PLAZI - "+source.getPath().split("/")[source.getPath().split("/").length-1];
267
            reference.setTitleCache(tref,true);
268
            reference.setTitle(tref);
269
            reference.generateTitle();
270

    
271
            taxonxImportConfigurator.setSourceReference(reference);
272
            taxonxImportConfigurator.setSourceRef(reference);
273

    
274
            //            String tnomenclature = askQuestion("ICBN or ICZN ?");
275

    
276
            if (tnomenclature.equalsIgnoreCase("ICBN")) {
277
                taxonxImportConfigurator.setNomenclaturalCode(NomenclaturalCode.ICNAFP);
278
                //                taxonxImportConfigurator.setClassificationName("Chenopodiaceae");
279
            }
280
            if(tnomenclature.equalsIgnoreCase("ICZN")){
281
                taxonxImportConfigurator.setNomenclaturalCode(NomenclaturalCode.ICZN);
282
                //                taxonxImportConfigurator.setClassificationName("Ants");
283
            }
284

    
285
            //   taxonxImportConfigurator.setTaxonReference(null);
286

    
287
            //            log.info("INVOKE");
288

    
289
            taxonImport.invoke(taxonxImportConfigurator);
290
            log.info("End import from SpecimenData ("+ source.toString() + ")...");
291

    
292
            //          //deduplicate
293
            //            ICdmApplicationConfiguration app = taxonImport.getCdmAppController();
294
            //            int count = app.getAgentService().deduplicate(Person.class, null, null);
295
            //            logger.warn("Deduplicated " + count + " persons.");
296
            //            count = app.getReferenceService().deduplicate(Reference.class, null, null);
297
            //            logger.warn("Deduplicated " + count + " references.");
298
        }
299

    
300

    
301
    }
302

    
303

    
304

    
305
    /**
306
     * @return
307
     */
308
    private static boolean askIfReuseSecundum() {
309
            //        logger.info("getFullReference for "+ name);
310
            JTextArea textArea = new JTextArea("Reuse the secundum present in the current classification? " +
311
            		"\n Click Yes to reuse it, click No or Cancel to create a new one.");
312
            JScrollPane scrollPane = new JScrollPane(textArea);
313
            textArea.setLineWrap(true);
314
            textArea.setWrapStyleWord(true);
315
            scrollPane.setPreferredSize( new Dimension( 700, 100 ) );
316

    
317
            //        JFrame frame = new JFrame("I have a question");
318
            //        frame.setDefaultCloseOperation(JFrame.EXIT_ON_CLOSE);
319
            int s = JOptionPane.showConfirmDialog(null, scrollPane);
320
           if (s==0) {
321
            return true;
322
        } else {
323
            return false;
324
        }
325
    }
326

    
327
    /**
328
     * @return
329
     */
330
    private static Reference<?> askForSecundum() {
331
            //        logger.info("getFullReference for "+ name);
332
            JTextArea textArea = new JTextArea("Enter the secundum name");
333
            JScrollPane scrollPane = new JScrollPane(textArea);
334
            textArea.setLineWrap(true);
335
            textArea.setWrapStyleWord(true);
336
            scrollPane.setPreferredSize( new Dimension( 700, 100 ) );
337

    
338
            //        JFrame frame = new JFrame("I have a question");
339
            //        frame.setDefaultCloseOperation(JFrame.EXIT_ON_CLOSE);
340
            String s = (String) JOptionPane.showInputDialog(
341
                    null,
342
                    scrollPane,
343
                    "",
344
                    JOptionPane.PLAIN_MESSAGE,
345
                    null,
346
                    null,
347
                    null);
348
            Reference<?> ref = ReferenceFactory.newGeneric();
349
            ref.setTitle(s);
350
            return ref;
351
    }
352

    
353

    
354
}
(3-3/3)