/**
* Copyright (C) 2007 EDIT
* European Distributed Institute of Taxonomy
* http://www.e-taxonomy.eu
*
* The contents of this file are subject to the Mozilla Public License Version 1.1
* See LICENSE.TXT at the top of this package for the full license terms.
*/

/**
* Copyright (C) 2007 EDIT
* European Distributed Institute of Taxonomy
* http://www.e-taxonomy.eu
*
* The contents of this file are subject to the Mozilla Public License Version 1.1
* See LICENSE.TXT at the top of this package for the full license terms.
*/
18
|
package eu.etaxonomy.cdm.app.wp6.diptera;
|
19
|
|
20
|
import java.util.ArrayList;
|
21
|
import java.util.HashSet;
|
22
|
import java.util.List;
|
23
|
import java.util.Set;
|
24
|
import java.util.regex.Pattern;
|
25
|
|
26
|
import org.apache.log4j.Logger;
|
27
|
import org.springframework.transaction.TransactionStatus;
|
28
|
|
29
|
import eu.etaxonomy.cdm.api.application.CdmApplicationController;
|
30
|
import eu.etaxonomy.cdm.api.application.ICdmApplicationConfiguration;
|
31
|
import eu.etaxonomy.cdm.app.common.CdmDestinations;
|
32
|
import eu.etaxonomy.cdm.database.DbSchemaValidation;
|
33
|
import eu.etaxonomy.cdm.database.ICdmDataSource;
|
34
|
import eu.etaxonomy.cdm.io.common.TdwgAreaProvider;
|
35
|
import eu.etaxonomy.cdm.model.common.Language;
|
36
|
import eu.etaxonomy.cdm.model.description.DescriptionBase;
|
37
|
import eu.etaxonomy.cdm.model.description.DescriptionElementBase;
|
38
|
import eu.etaxonomy.cdm.model.description.Distribution;
|
39
|
import eu.etaxonomy.cdm.model.description.Feature;
|
40
|
import eu.etaxonomy.cdm.model.description.PresenceAbsenceTerm;
|
41
|
import eu.etaxonomy.cdm.model.description.TaxonDescription;
|
42
|
import eu.etaxonomy.cdm.model.description.TextData;
|
43
|
import eu.etaxonomy.cdm.model.location.NamedArea;
|
44
|
import eu.etaxonomy.cdm.model.taxon.Taxon;
|
45
|
import eu.etaxonomy.cdm.model.taxon.TaxonBase;
|
46
|
|
47
|
/**
 * @author a.mueller
 * @created 17.10.2008
 * @version 1.0
 */
|
52
|
public class DipteraDistributionParser {
|
53
|
private static final Logger logger = Logger.getLogger(DipteraDistributionParser.class);
|
54
|
|
55
|
private static ICdmDataSource cdmDestination = CdmDestinations.localH2();
|
56
|
|
57
|
final static String epiSplitter = "(\\s+|\\[|\\]|\\(|\\))"; //( ' '+| '(' | ')'| '[' | ']' )
|
58
|
static Pattern pattern = null;
|
59
|
|
60
|
protected void doDistribution(ICdmApplicationConfiguration app){
|
61
|
pattern = Pattern.compile(epiSplitter);
|
62
|
TransactionStatus txStatus = app.startTransaction();
|
63
|
List<TaxonBase> taxa = app.getTaxonService().list(null, null, null, null, null);
|
64
|
for (TaxonBase taxon: taxa ){
|
65
|
if (taxon instanceof Taxon){
|
66
|
// unlazyDescription(app, (Taxon)taxon);
|
67
|
Set<TaxonDescription> descriptions = ((Taxon) taxon).getDescriptions();
|
68
|
for (DescriptionBase description: descriptions){
|
69
|
Set<DescriptionElementBase> descElements = new HashSet<DescriptionElementBase>();
|
70
|
descElements.addAll(description.getElements());
|
71
|
|
72
|
for (DescriptionElementBase descEl: descElements){
|
73
|
if (descEl.getFeature().equals(Feature.OCCURRENCE())){
|
74
|
if (descEl instanceof TextData){
|
75
|
String occString = ((TextData)descEl).getText(Language.ENGLISH());
|
76
|
parseOccurenceString(occString, description);
|
77
|
//app.getTaxonService().saveTaxon(taxon);
|
78
|
}
|
79
|
}
|
80
|
}
|
81
|
}
|
82
|
}
|
83
|
}
|
84
|
System.out.println("Unknowns: ");
|
85
|
for (String unknown: unrekognizedStrings){
|
86
|
System.out.println(unknown);
|
87
|
}
|
88
|
System.out.println("Distributions not recognized: " + countNot);
|
89
|
System.out.println("Distributions created: " + countYes);
|
90
|
app.commitTransaction(txStatus);
|
91
|
}
|
92
|
|
93
|
static Set<String> unrekognizedStrings = new HashSet<String>();
|
94
|
static int countNot = 0;
|
95
|
static int countYes = 0;
|
96
|
|
97
|
private void parseOccurenceString(String occString, DescriptionBase desc){
|
98
|
System.out.println(occString);
|
99
|
if (occString != null){
|
100
|
String[] words = pattern.split(occString);
|
101
|
int i = 0;
|
102
|
int countSkip = 0;
|
103
|
for (String word: words){
|
104
|
if (word.contains("U.S.A")){
|
105
|
logger.warn("U.S.A.");
|
106
|
}
|
107
|
boolean isDoubtful = false;
|
108
|
if (countSkip > 0){
|
109
|
countSkip--;
|
110
|
}else if(word.trim().length() == 0){
|
111
|
//skip
|
112
|
}else{
|
113
|
if (word.endsWith(":") && word.length()<=4){
|
114
|
//Higher area
|
115
|
//TODO
|
116
|
}else{
|
117
|
word = word.trim();
|
118
|
if (word.contains("?")){
|
119
|
isDoubtful = true;
|
120
|
word = word.replace("?", "");
|
121
|
}
|
122
|
word = adaptWordsToTdwg(word);
|
123
|
|
124
|
if (! "".equals(word) && ! TdwgAreaProvider.isTdwgAreaLabel(word) && ! TdwgAreaProvider.isTdwgAreaAbbreviation(word) && ! isDoubleArea(word)){
|
125
|
for (countSkip = 1; countSkip <= 6; countSkip++){
|
126
|
word = word.trim();
|
127
|
if (! TdwgAreaProvider.isTdwgAreaLabel(word) && ! TdwgAreaProvider.isTdwgAreaAbbreviation(word) && ! isDoubleArea(word)){
|
128
|
if (words.length > i + countSkip){
|
129
|
word = word + " " + words[i + countSkip];
|
130
|
}
|
131
|
if (word.contains("?")){
|
132
|
isDoubtful = true;
|
133
|
word = word.replace("?", "");
|
134
|
}
|
135
|
word = adaptWordsToTdwg(word);
|
136
|
if ("".equals(word)){
|
137
|
break;
|
138
|
}
|
139
|
}else{
|
140
|
break;
|
141
|
}
|
142
|
}
|
143
|
}
|
144
|
if ("".equals(word)){
|
145
|
//countSkip = countSkip;
|
146
|
}else if (! TdwgAreaProvider.isTdwgAreaLabel(word) && ! TdwgAreaProvider.isTdwgAreaAbbreviation(word) && ! isDoubleArea(word) ){
|
147
|
if (word.contains("?")){
|
148
|
logger.warn("XXX");
|
149
|
}
|
150
|
countNot++;
|
151
|
System.out.println(" False:" + countNot + ": " + word);
|
152
|
unrekognizedStrings.add(word);
|
153
|
countSkip = 0;
|
154
|
}else{
|
155
|
if (word.equals("Netherlands")){
|
156
|
if ( countSkip < 0 && words[i + 1].startsWith("Antilles")){
|
157
|
word = "Netherlands Antilles";
|
158
|
countSkip=2;
|
159
|
}
|
160
|
}
|
161
|
PresenceAbsenceTerm term = PresenceAbsenceTerm.PRESENT();
|
162
|
if (isDoubleArea(word)){
|
163
|
NamedArea[] doubleArea = getDoubleArea(word);
|
164
|
for (NamedArea area : doubleArea){
|
165
|
Distribution distr = Distribution.NewInstance(area, term);
|
166
|
desc.addElement(distr);
|
167
|
}
|
168
|
}else{
|
169
|
NamedArea area;
|
170
|
if (TdwgAreaProvider.isTdwgAreaLabel(word)){
|
171
|
area = TdwgAreaProvider.getAreaByTdwgLabel(word);
|
172
|
}else{
|
173
|
area = TdwgAreaProvider.getAreaByTdwgAbbreviation(word);
|
174
|
}
|
175
|
if (isDoubtful){
|
176
|
term = PresenceAbsenceTerm.INTRODUCED_PRESENCE_QUESTIONABLE();
|
177
|
}
|
178
|
Distribution distr = Distribution.NewInstance(area, term);
|
179
|
desc.addElement(distr);
|
180
|
}
|
181
|
countYes++;
|
182
|
System.out.println(" True:" + countYes + ": " + word);
|
183
|
countSkip--;
|
184
|
}
|
185
|
}
|
186
|
}
|
187
|
i++;
|
188
|
}
|
189
|
}
|
190
|
}
|
191
|
|
192
|
private boolean isDoubleArea(String word){
|
193
|
if ("Canary and Madeira Is.".equalsIgnoreCase(word) ||
|
194
|
"southern Europe".equalsIgnoreCase(word) ||
|
195
|
"former USSR: North and Central European territory".equalsIgnoreCase(word)
|
196
|
){
|
197
|
return true;
|
198
|
}else{
|
199
|
return false;
|
200
|
}
|
201
|
}
|
202
|
|
203
|
private NamedArea[] getDoubleArea(String word){
|
204
|
NamedArea[] result = new NamedArea[2];
|
205
|
if ("Canary and Madeira Is.".equalsIgnoreCase(word)){
|
206
|
result[0] = TdwgAreaProvider.getAreaByTdwgAbbreviation("CNY");
|
207
|
result[1] = TdwgAreaProvider.getAreaByTdwgAbbreviation("MDR");
|
208
|
}else if ("southern Europe".equalsIgnoreCase(word)){
|
209
|
result[0] = TdwgAreaProvider.getAreaByTdwgAbbreviation("12");
|
210
|
result[1] = TdwgAreaProvider.getAreaByTdwgAbbreviation("13");
|
211
|
}else if ("former USSR: North and Central European territory".equalsIgnoreCase(word)){
|
212
|
result[0] = TdwgAreaProvider.getAreaByTdwgAbbreviation("RUN-OO");
|
213
|
result[1] = TdwgAreaProvider.getAreaByTdwgAbbreviation("RUC-OO");
|
214
|
}else{
|
215
|
logger.warn("Double area not recognized");
|
216
|
}
|
217
|
return result;
|
218
|
}
|
219
|
|
220
|
|
221
|
static List<String> stopWords = new ArrayList<String>();
|
222
|
static List<String> unknownAreas = new ArrayList<String>();
|
223
|
static List<String> higherAreas = new ArrayList<String>();
|
224
|
|
225
|
private String adaptWordsToTdwg(String word){
|
226
|
word = word.replace(",", "").replace(";", "");
|
227
|
if (! word.contains("U.S.A")){
|
228
|
word = word.replace(",", "").replace(".", "").replace(";", "");
|
229
|
}else{
|
230
|
word = word.replace(",", "").replace(";", "");
|
231
|
}
|
232
|
|
233
|
word = word.trim();
|
234
|
if (word.endsWith("Is")){
|
235
|
word = word + ".";
|
236
|
}
|
237
|
if (stopWords.size() == 0){
|
238
|
initStopWords();
|
239
|
}
|
240
|
|
241
|
word = word.replace("Russia [North European territory]", "North European Russia");
|
242
|
word = word.replace("Russia North European territory", "North European Russia");
|
243
|
word = word.replace("Russia: North European territory", "North European Russia");
|
244
|
word = word.replace("Russia: North European territory", "North European Russia");
|
245
|
|
246
|
word = word.replace("Amber", "amber");
|
247
|
|
248
|
|
249
|
word = word.replace("Prince Edward Is.", "Marion-Prince Edward Is.");
|
250
|
//or word = word.replace("Prince Edward Is.", "Prince Edward I.");
|
251
|
word = word.replace("Bahama Is.", "Bahamas");
|
252
|
word = word.replace("Comores Is.", "Comoros");
|
253
|
word = word.replace("former Yugoslavia", "Yugoslavia");
|
254
|
word = word.replace("former Czechoslovakia", "Czechoslovakia");
|
255
|
word = word.replace("Rhodesia", "Zimbabwe");
|
256
|
word = word.replace("The Gambia", "Gambia, The");
|
257
|
|
258
|
if (!word.contains("El Salvador")){
|
259
|
word = word.replace("Salvador", "El Salvador");
|
260
|
}
|
261
|
word = word.replace("Vera Cruz", "Veracruz");
|
262
|
word = word.replace("Turkmenia", "Turkmenistan");
|
263
|
word = word.replace("Qu\u00E9beck", "Qu\u00E9bec");
|
264
|
word = word.replace("Quebeck", "Qu\u00E9bec");
|
265
|
word = word.replace("Quebec", "Qu\u00E9bec");
|
266
|
|
267
|
if (!word.contains("Gambia, The")){
|
268
|
word = word.replace("Gambia", "Gambia, The");
|
269
|
}
|
270
|
word = word.replace("Mariana Is.", "Marianas");
|
271
|
word = word.replace("Kenia", "Kenya");
|
272
|
word = word.replace("Central Africa", "Central African Republic");
|
273
|
word = word.replace("Canal Zone", "");
|
274
|
//word = word.replace("Panama", "Panamá");
|
275
|
word = word.replace("Panama", "Panam\u00E1");
|
276
|
if (! word.contains("New South Wales")){
|
277
|
word = word.replace("Wales", "Great Britain");
|
278
|
}
|
279
|
word = word.replace("Java", "Jawa");
|
280
|
word = word.replace("former USSR: North European territory", "North European Russia");
|
281
|
word = word.replace("former USSR: South European territory", "South European Russia");
|
282
|
word = word.replace("former USSR: Soviet Middle Asia", "Middle Asia");
|
283
|
|
284
|
word = word.replace("St Kitts-Nevis", "St.Kitts-Nevis");
|
285
|
|
286
|
word = word.replace("oceanian islands", "Pacific");
|
287
|
word = word.replace("Ussuri region", "Primorye");
|
288
|
word = word.replace("Galapagos Is.", "Gal\u00E1pagos");
|
289
|
word = word.replace("Tarapac\u00E1", "Tarapaca");
|
290
|
word = word.replace("Reunion", "R\u00E9union");
|
291
|
if (! word.contains("Is.")){
|
292
|
word = word.replace("Galapagos", "Gal\u00E1pagos");
|
293
|
}
|
294
|
|
295
|
//word = word.replace("Galapagos Is.", "Galápagos");
|
296
|
if (! word.contains("Peninsular")){
|
297
|
word = word.replace("Malaysia", "Peninsular Malaysia");
|
298
|
}
|
299
|
word = word.replace("Polynesic Is.", "South Solomons");
|
300
|
|
301
|
word = word.replace("Usbek SSR", "Uzbekistan");
|
302
|
word = word.replace("Mexican amber", "Mexico");
|
303
|
word = word.replace("Marocco", "Morocco");
|
304
|
if (! word.contains("Tobago")){
|
305
|
word = word.replace("Trinidad", "Trinidad-Tobago");
|
306
|
}
|
307
|
if (! word.contains("Trinidad")){
|
308
|
word = word.replace("Tobago", "Trinidad-Tobago");
|
309
|
}
|
310
|
word = word.replace("Haiti", "Haiti");
|
311
|
word = word.replace("Moluccas", "Maluku");
|
312
|
word = word.replace("Belau", "Palau");
|
313
|
word = word.replace("Dominican amber", "Dominican Republic");
|
314
|
if (! word.contains("Russian")){
|
315
|
word = word.replace("Far East", "Russian Far East");
|
316
|
}
|
317
|
word = word.replace("Tahiti", "Society Is.");
|
318
|
word = word.replace("Iraque", "Iraq");
|
319
|
word = word.replace("Wake Island", "Wake I.");
|
320
|
if (! word.contains("I.")){
|
321
|
word = word.replace("Johnston I", "Johnston I.");
|
322
|
word = word.replace("Wake I", "Wake I.");
|
323
|
word = word.replace("Clipperton I", "Clipperton I.");
|
324
|
}
|
325
|
if (! word.contains("Provinces")){
|
326
|
word = word.replace("Cape Province", "Cape Provinces");
|
327
|
}
|
328
|
word = word.replace("Eastern Cape Provinces", "Eastern Cape Province");
|
329
|
word = word.replace("Western Cape Provinces", "Western Cape Province");
|
330
|
if (! word.contains("Barbuda")){
|
331
|
word = word.replace("Antigua", "Antigua-Barbuda");
|
332
|
}
|
333
|
if (! word.contains("St.")){
|
334
|
word = word.replace("St Vincent", "St.Vincent");
|
335
|
word = word.replace("St Lucia", "St.Lucia");
|
336
|
word = word.replace("St Helena", "St.Helena");
|
337
|
}
|
338
|
word = word.replace("Asia-tropical", "Asia-Tropical");
|
339
|
word = word.replace("Society Islands", "Society Is.");
|
340
|
word = word.replace("Virgin Islands", "Virgin Is.");
|
341
|
word = word.replace("Canary Islands", "Canary Is.");
|
342
|
word = word.replace("Rhode Island", "Rhode I.");
|
343
|
|
344
|
|
345
|
word = word.replace("Rodriguez", "Rodrigues");
|
346
|
word = word.replace("British Colombia", "British Columbia");
|
347
|
word = word.replace("Bermudas", "Bermuda");
|
348
|
word = word.replace("Tunesia", "Tunisia");
|
349
|
word = word.replace("Santos S\u00E3o Paulo", "S\u00E3o Paulo");
|
350
|
word = word.replace("Transvaal", "Northern Provinces");
|
351
|
word = word.replace("Tucum\u00E1n", "Tucuman");
|
352
|
// if (!word.contains("Netherlands")){
|
353
|
//
|
354
|
// }
|
355
|
|
356
|
// unknownAreas.add("Baltic amber");
|
357
|
// unknownAreas.add("Arabia");
|
358
|
|
359
|
for (String stopWord : stopWords){
|
360
|
if (stopWord.equals(word)){
|
361
|
System.out.println(" STOP: " + word);
|
362
|
return "";
|
363
|
}
|
364
|
}
|
365
|
for (String unknownArea : unknownAreas){
|
366
|
if (unknownArea.equals(word)){
|
367
|
System.out.println(" UNKNOWN: " + word);
|
368
|
return "";
|
369
|
}
|
370
|
}
|
371
|
for (String higherArea : higherAreas){
|
372
|
if (higherArea.equals(word)){
|
373
|
return "";
|
374
|
}
|
375
|
}
|
376
|
|
377
|
//higher regions
|
378
|
|
379
|
return word;
|
380
|
}
|
381
|
|
382
|
private void initStopWords(){
|
383
|
stopWords.add("and");
|
384
|
stopWords.add("Is");
|
385
|
stopWords.add("Is.");
|
386
|
stopWords.add("Islands");
|
387
|
stopWords.add("Island");
|
388
|
|
389
|
stopWords.add("of");
|
390
|
stopWords.add("areas");
|
391
|
stopWords.add("USA");
|
392
|
stopWords.add("Australia"); //except for Australia only
|
393
|
stopWords.add("Argentina");
|
394
|
|
395
|
//unknownAreas.add("Panama");
|
396
|
unknownAreas.add("South Africa");
|
397
|
unknownAreas.add("Chile");
|
398
|
|
399
|
unknownAreas.add("Baltic amber");
|
400
|
unknownAreas.add("Arabia");
|
401
|
|
402
|
|
403
|
higherAreas.add("AF");
|
404
|
higherAreas.add("OR");
|
405
|
higherAreas.add("PA");
|
406
|
higherAreas.add("AU");
|
407
|
higherAreas.add("NE");
|
408
|
|
409
|
higherAreas.add("NT");
|
410
|
}
|
411
|
|
412
|
|
413
|
/**
|
414
|
* @param args
|
415
|
*/
|
416
|
public static void main(String[] args) {
|
417
|
CdmApplicationController app = null;
|
418
|
DbSchemaValidation val = DbSchemaValidation.UPDATE;
|
419
|
app = CdmApplicationController.NewInstance(cdmDestination, val);
|
420
|
|
421
|
DipteraDistributionParser dipDist = new DipteraDistributionParser();
|
422
|
if (app != null){
|
423
|
dipDist.doDistribution(app);
|
424
|
}else{
|
425
|
logger.warn("No Application Context");
|
426
|
}
|
427
|
}
|
428
|
}
|