Project

General

Profile

Download (14 KB) Statistics
| Branch: | Revision:
1
/**
2
* Copyright (C) 2007 EDIT
3
* European Distributed Institute of Taxonomy
4
* http://www.e-taxonomy.eu
5
*
6
* The contents of this file are subject to the Mozilla Public License Version 1.1
7
* See LICENSE.TXT at the top of this package for the full license terms.
8
*/
9

    
10
/**
11
* Copyright (C) 2007 EDIT
12
* European Distributed Institute of Taxonomy
13
* http://www.e-taxonomy.eu
14
*
15
* The contents of this file are subject to the Mozilla Public License Version 1.1
16
* See LICENSE.TXT at the top of this package for the full license terms.
17
*/
18
package eu.etaxonomy.cdm.app.wp6.diptera;
19

    
20
import java.util.ArrayList;
21
import java.util.HashSet;
22
import java.util.List;
23
import java.util.Set;
24
import java.util.regex.Pattern;
25

    
26
import org.apache.log4j.Logger;
27
import org.springframework.transaction.TransactionStatus;
28

    
29
import eu.etaxonomy.cdm.api.application.CdmApplicationController;
30
import eu.etaxonomy.cdm.api.application.ICdmRepository;
31
import eu.etaxonomy.cdm.app.common.CdmDestinations;
32
import eu.etaxonomy.cdm.database.DbSchemaValidation;
33
import eu.etaxonomy.cdm.database.ICdmDataSource;
34
import eu.etaxonomy.cdm.io.common.TdwgAreaProvider;
35
import eu.etaxonomy.cdm.model.common.Language;
36
import eu.etaxonomy.cdm.model.description.DescriptionBase;
37
import eu.etaxonomy.cdm.model.description.DescriptionElementBase;
38
import eu.etaxonomy.cdm.model.description.Distribution;
39
import eu.etaxonomy.cdm.model.description.Feature;
40
import eu.etaxonomy.cdm.model.description.PresenceAbsenceTerm;
41
import eu.etaxonomy.cdm.model.description.TaxonDescription;
42
import eu.etaxonomy.cdm.model.description.TextData;
43
import eu.etaxonomy.cdm.model.location.NamedArea;
44
import eu.etaxonomy.cdm.model.taxon.Taxon;
45
import eu.etaxonomy.cdm.model.taxon.TaxonBase;
46

    
47
/**
48
 * @author a.mueller
49
 * @since 17.10.2008
50
 */
51
public class DipteraDistributionParser {
52
	private static final Logger logger = Logger.getLogger(DipteraDistributionParser.class);
53

    
54
	private static ICdmDataSource cdmDestination = CdmDestinations.localH2();
55

    
56
	final static String epiSplitter = "(\\s+|\\[|\\]|\\(|\\))"; //( ' '+| '(' | ')'| '[' | ']' )
57
	static Pattern pattern = null;
58

    
59
	protected void doDistribution(ICdmRepository app){
60
		pattern = Pattern.compile(epiSplitter);
61
	    TransactionStatus txStatus = app.startTransaction();
62
		List<TaxonBase> taxa = app.getTaxonService().list(null, null, null, null, null);
63
		for (TaxonBase taxon: taxa ){
64
			if (taxon instanceof Taxon){
65
		//		unlazyDescription(app, (Taxon)taxon);
66
				Set<TaxonDescription> descriptions = ((Taxon) taxon).getDescriptions();
67
				for (DescriptionBase description: descriptions){
68
					Set<DescriptionElementBase> descElements = new HashSet<DescriptionElementBase>();
69
					descElements.addAll(description.getElements());
70

    
71
					for (DescriptionElementBase descEl: descElements){
72
						if (descEl.getFeature().equals(Feature.OCCURRENCE())){
73
							if (descEl instanceof TextData){
74
								String occString = ((TextData)descEl).getText(Language.ENGLISH());
75
								parseOccurenceString(occString, description);
76
								//app.getTaxonService().saveTaxon(taxon);
77
							}
78
						}
79
					}
80
				}
81
			}
82
		}
83
		System.out.println("Unknowns: ");
84
		for (String unknown: unrekognizedStrings){
85
			System.out.println(unknown);
86
		}
87
		System.out.println("Distributions not recognized: " + countNot);
88
		System.out.println("Distributions created: " + countYes);
89
		app.commitTransaction(txStatus);
90
	}
91

    
92
	static Set<String> unrekognizedStrings = new HashSet<>();
93
	static int countNot = 0;
94
	static int countYes = 0;
95

    
96
	private void parseOccurenceString(String occString, DescriptionBase desc){
97
		System.out.println(occString);
98
		if (occString != null){
99
			String[] words = pattern.split(occString);
100
			int i = 0;
101
			int countSkip = 0;
102
			for (String word: words){
103
				if (word.contains("U.S.A")){
104
					logger.warn("U.S.A.");
105
				}
106
				boolean isDoubtful = false;
107
				if (countSkip > 0){
108
					countSkip--;
109
				}else if(word.trim().length() == 0){
110
					//skip
111
				}else{
112
					if (word.endsWith(":") && word.length()<=4){
113
						//Higher area
114
						//TODO
115
					}else{
116
						word = word.trim();
117
						if (word.contains("?")){
118
							isDoubtful = true;
119
							word = word.replace("?", "");
120
						}
121
						word = adaptWordsToTdwg(word);
122

    
123
						if (! "".equals(word) && ! TdwgAreaProvider.isTdwgAreaLabel(word) && ! TdwgAreaProvider.isTdwgAreaAbbreviation(word) && ! isDoubleArea(word)){
124
							for (countSkip = 1; countSkip <= 6; countSkip++){
125
								word = word.trim();
126
								if (! TdwgAreaProvider.isTdwgAreaLabel(word) && ! TdwgAreaProvider.isTdwgAreaAbbreviation(word) && ! isDoubleArea(word)){
127
									if (words.length > i + countSkip){
128
										word = word + " " + words[i + countSkip];
129
									}
130
									if (word.contains("?")){
131
										isDoubtful = true;
132
										word = word.replace("?", "");
133
									}
134
									word = adaptWordsToTdwg(word);
135
									if ("".equals(word)){
136
										break;
137
									}
138
								}else{
139
									break;
140
								}
141
							}
142
						}
143
						if ("".equals(word)){
144
							//countSkip = countSkip;
145
						}else if (! TdwgAreaProvider.isTdwgAreaLabel(word)  && ! TdwgAreaProvider.isTdwgAreaAbbreviation(word) &&  ! isDoubleArea(word)  ){
146
							if (word.contains("?")){
147
								logger.warn("XXX");
148
							}
149
							countNot++;
150
							System.out.println("   False:" + countNot + ": " + word);
151
							unrekognizedStrings.add(word);
152
							countSkip = 0;
153
						}else{
154
							if (word.equals("Netherlands")){
155
								if ( countSkip < 0 && words[i + 1].startsWith("Antilles")){
156
									word = "Netherlands Antilles";
157
									countSkip=2;
158
								}
159
							}
160
							PresenceAbsenceTerm term = PresenceAbsenceTerm.PRESENT();
161
							if (isDoubleArea(word)){
162
								NamedArea[] doubleArea = getDoubleArea(word);
163
								for (NamedArea area : doubleArea){
164
									Distribution distr = Distribution.NewInstance(area, term);
165
									desc.addElement(distr);
166
								}
167
							}else{
168
								NamedArea area;
169
								if (TdwgAreaProvider.isTdwgAreaLabel(word)){
170
									area = TdwgAreaProvider.getAreaByTdwgLabel(word);
171
								}else{
172
									area = TdwgAreaProvider.getAreaByTdwgAbbreviation(word);
173
								}
174
								if (isDoubtful){
175
									term = PresenceAbsenceTerm.INTRODUCED_PRESENCE_QUESTIONABLE();
176
								}
177
								Distribution distr = Distribution.NewInstance(area, term);
178
								desc.addElement(distr);
179
							}
180
							countYes++;
181
							System.out.println("      True:" + countYes + ": " + word);
182
							countSkip--;
183
						}
184
					}
185
				}
186
				i++;
187
			}
188
		}
189
	}
190

    
191
	private boolean isDoubleArea(String word){
192
		if ("Canary and Madeira Is.".equalsIgnoreCase(word) ||
193
				"southern Europe".equalsIgnoreCase(word) ||
194
				"former USSR: North and Central European territory".equalsIgnoreCase(word)
195
				){
196
			return true;
197
		}else{
198
			return false;
199
		}
200
	}
201

    
202
	private NamedArea[] getDoubleArea(String word){
203
		NamedArea[] result = new NamedArea[2];
204
		if ("Canary and Madeira Is.".equalsIgnoreCase(word)){
205
			 result[0] = TdwgAreaProvider.getAreaByTdwgAbbreviation("CNY");
206
			 result[1] = TdwgAreaProvider.getAreaByTdwgAbbreviation("MDR");
207
		}else if ("southern Europe".equalsIgnoreCase(word)){
208
			 result[0] = TdwgAreaProvider.getAreaByTdwgAbbreviation("12");
209
			 result[1] = TdwgAreaProvider.getAreaByTdwgAbbreviation("13");
210
		}else if ("former USSR: North and Central European territory".equalsIgnoreCase(word)){
211
			 result[0] = TdwgAreaProvider.getAreaByTdwgAbbreviation("RUN-OO");
212
			 result[1] = TdwgAreaProvider.getAreaByTdwgAbbreviation("RUC-OO");
213
		}else{
214
			logger.warn("Double area not recognized");
215
		}
216
		return result;
217
	}
218

    
219

    
220
	static List<String> stopWords = new ArrayList<>();
221
	static List<String> unknownAreas = new ArrayList<>();
222
	static List<String> higherAreas = new ArrayList<>();
223

    
224
	private String adaptWordsToTdwg(String word){
225
		word = word.replace(",", "").replace(";", "");
226
		if (! word.contains("U.S.A")){
227
			word = word.replace(",", "").replace(".", "").replace(";", "");
228
		}else{
229
			word = word.replace(",", "").replace(";", "");
230
		}
231

    
232
		word = word.trim();
233
		if (word.endsWith("Is")){
234
			word = word + ".";
235
		}
236
		if (stopWords.size() == 0){
237
			initStopWords();
238
		}
239

    
240
		word = word.replace("Russia [North European territory]", "North European Russia");
241
		word = word.replace("Russia North European territory", "North European Russia");
242
		word = word.replace("Russia: North European territory", "North European Russia");
243
		word = word.replace("Russia: North European territory", "North European Russia");
244

    
245
		word = word.replace("Amber", "amber");
246

    
247

    
248
		word = word.replace("Prince Edward Is.", "Marion-Prince Edward Is.");
249
		//or word = word.replace("Prince Edward Is.", "Prince Edward I.");
250
		word = word.replace("Bahama Is.", "Bahamas");
251
		word = word.replace("Comores Is.", "Comoros");
252
		word = word.replace("former Yugoslavia", "Yugoslavia");
253
		word = word.replace("former Czechoslovakia", "Czechoslovakia");
254
		word = word.replace("Rhodesia", "Zimbabwe");
255
		word = word.replace("The Gambia", "Gambia, The");
256

    
257
		if (!word.contains("El Salvador")){
258
			word = word.replace("Salvador", "El Salvador");
259
		}
260
		word = word.replace("Vera Cruz", "Veracruz");
261
		word = word.replace("Turkmenia", "Turkmenistan");
262
		word = word.replace("Qu\u00E9beck", "Qu\u00E9bec");
263
		word = word.replace("Quebeck", "Qu\u00E9bec");
264
		word = word.replace("Quebec", "Qu\u00E9bec");
265

    
266
		if (!word.contains("Gambia, The")){
267
			word = word.replace("Gambia", "Gambia, The");
268
		}
269
		word = word.replace("Mariana Is.", "Marianas");
270
		word = word.replace("Kenia", "Kenya");
271
		word = word.replace("Central Africa", "Central African Republic");
272
		word = word.replace("Canal Zone", "");
273
		//word = word.replace("Panama", "Panamá");
274
		word = word.replace("Panama", "Panam\u00E1");
275
		if (! word.contains("New South Wales")){
276
			word = word.replace("Wales", "Great Britain");
277
		}
278
		word = word.replace("Java", "Jawa");
279
		word = word.replace("former USSR: North European territory", "North European Russia");
280
		word = word.replace("former USSR: South European territory", "South European Russia");
281
		word = word.replace("former USSR: Soviet Middle Asia", "Middle Asia");
282

    
283
		word = word.replace("St Kitts-Nevis", "St.Kitts-Nevis");
284

    
285
		word = word.replace("oceanian islands", "Pacific");
286
		word = word.replace("Ussuri region", "Primorye");
287
		word = word.replace("Galapagos Is.", "Gal\u00E1pagos");
288
		word = word.replace("Tarapac\u00E1", "Tarapaca");
289
		word = word.replace("Reunion", "R\u00E9union");
290
		if (! word.contains("Is.")){
291
			word = word.replace("Galapagos", "Gal\u00E1pagos");
292
		}
293

    
294
		//word = word.replace("Galapagos Is.", "Galápagos");
295
		if (! word.contains("Peninsular")){
296
			word = word.replace("Malaysia", "Peninsular Malaysia");
297
		}
298
		word = word.replace("Polynesic Is.", "South Solomons");
299

    
300
		word = word.replace("Usbek SSR", "Uzbekistan");
301
		word = word.replace("Mexican amber", "Mexico");
302
		word = word.replace("Marocco", "Morocco");
303
		if (! word.contains("Tobago")){
304
			word = word.replace("Trinidad", "Trinidad-Tobago");
305
		}
306
		if (! word.contains("Trinidad")){
307
			word = word.replace("Tobago", "Trinidad-Tobago");
308
		}
309
		word = word.replace("Haiti", "Haiti");
310
		word = word.replace("Moluccas", "Maluku");
311
		word = word.replace("Belau", "Palau");
312
		word = word.replace("Dominican amber", "Dominican Republic");
313
		if (! word.contains("Russian")){
314
			word = word.replace("Far East", "Russian Far East");
315
		}
316
		word = word.replace("Tahiti", "Society Is.");
317
		word = word.replace("Iraque", "Iraq");
318
		word = word.replace("Wake Island", "Wake I.");
319
		if (! word.contains("I.")){
320
			word = word.replace("Johnston I", "Johnston I.");
321
			word = word.replace("Wake I", "Wake I.");
322
			word = word.replace("Clipperton I", "Clipperton I.");
323
		}
324
		if (! word.contains("Provinces")){
325
			word = word.replace("Cape Province", "Cape Provinces");
326
		}
327
		word = word.replace("Eastern Cape Provinces", "Eastern Cape Province");
328
		word = word.replace("Western Cape Provinces", "Western Cape Province");
329
		if (! word.contains("Barbuda")){
330
			word = word.replace("Antigua", "Antigua-Barbuda");
331
		}
332
		if (! word.contains("St.")){
333
			word = word.replace("St Vincent", "St.Vincent");
334
			word = word.replace("St Lucia", "St.Lucia");
335
			word = word.replace("St Helena", "St.Helena");
336
		}
337
		word = word.replace("Asia-tropical", "Asia-Tropical");
338
		word = word.replace("Society Islands", "Society Is.");
339
		word = word.replace("Virgin Islands", "Virgin Is.");
340
		word = word.replace("Canary Islands", "Canary Is.");
341
		word = word.replace("Rhode Island", "Rhode I.");
342

    
343

    
344
		word = word.replace("Rodriguez", "Rodrigues");
345
		word = word.replace("British Colombia", "British Columbia");
346
		word = word.replace("Bermudas", "Bermuda");
347
		word = word.replace("Tunesia", "Tunisia");
348
		word = word.replace("Santos S\u00E3o Paulo", "S\u00E3o Paulo");
349
		word = word.replace("Transvaal", "Northern Provinces");
350
		word = word.replace("Tucum\u00E1n", "Tucuman");
351
//		if (!word.contains("Netherlands")){
352
//
353
//		}
354

    
355
//		unknownAreas.add("Baltic amber");
356
//		unknownAreas.add("Arabia");
357

    
358
		for (String stopWord : stopWords){
359
			if (stopWord.equals(word)){
360
				System.out.println("         STOP: " + word);
361
				return "";
362
			}
363
		}
364
		for (String unknownArea : unknownAreas){
365
			if (unknownArea.equals(word)){
366
				System.out.println("         UNKNOWN: " + word);
367
				return "";
368
			}
369
		}
370
		for (String higherArea : higherAreas){
371
			if (higherArea.equals(word)){
372
				return "";
373
			}
374
		}
375

    
376
		//higher regions
377

    
378
		return word;
379
	}
380

    
381
	private void initStopWords(){
382
		stopWords.add("and");
383
		stopWords.add("Is");
384
		stopWords.add("Is.");
385
		stopWords.add("Islands");
386
		stopWords.add("Island");
387

    
388
		stopWords.add("of");
389
		stopWords.add("areas");
390
		stopWords.add("USA");
391
		stopWords.add("Australia"); //except for Australia only
392
		stopWords.add("Argentina");
393

    
394
		//unknownAreas.add("Panama");
395
		unknownAreas.add("South Africa");
396
		unknownAreas.add("Chile");
397

    
398
		unknownAreas.add("Baltic amber");
399
		unknownAreas.add("Arabia");
400

    
401

    
402
		higherAreas.add("AF");
403
		higherAreas.add("OR");
404
		higherAreas.add("PA");
405
		higherAreas.add("AU");
406
		higherAreas.add("NE");
407

    
408
		higherAreas.add("NT");
409
	}
410

    
411
	public static void main(String[] args) {
412
		CdmApplicationController app = null;
413
		DbSchemaValidation val = DbSchemaValidation.UPDATE;
414
		app = CdmApplicationController.NewInstance(cdmDestination, val);
415

    
416
		DipteraDistributionParser dipDist = new DipteraDistributionParser();
417
		if (app != null){
418
			dipDist.doDistribution(app);
419
		}else{
420
			logger.warn("No Application Context");
421
		}
422
	}
423
}
(3-3/4)