Project

General

Profile

Download (14.6 KB) Statistics
| Branch: | Revision:
1
/**
2
* Copyright (C) 2007 EDIT
3
* European Distributed Institute of Taxonomy 
4
* http://www.e-taxonomy.eu
5
* 
6
* The contents of this file are subject to the Mozilla Public License Version 1.1
7
* See LICENSE.TXT at the top of this package for the full license terms.
8
*/
9

    
10
/**
11
* Copyright (C) 2007 EDIT
12
* European Distributed Institute of Taxonomy 
13
* http://www.e-taxonomy.eu
14
* 
15
* The contents of this file are subject to the Mozilla Public License Version 1.1
16
* See LICENSE.TXT at the top of this package for the full license terms.
17
*/
18
package eu.etaxonomy.cdm.app.wp6.diptera;
19

    
20
import java.util.ArrayList;
21
import java.util.HashSet;
22
import java.util.List;
23
import java.util.Set;
24
import java.util.regex.Pattern;
25

    
26
import org.apache.log4j.Logger;
27
import org.springframework.transaction.TransactionStatus;
28

    
29
import eu.etaxonomy.cdm.api.application.CdmApplicationController;
30
import eu.etaxonomy.cdm.api.application.ICdmApplicationConfiguration;
31
import eu.etaxonomy.cdm.app.common.CdmDestinations;
32
import eu.etaxonomy.cdm.database.DbSchemaValidation;
33
import eu.etaxonomy.cdm.database.ICdmDataSource;
34
import eu.etaxonomy.cdm.io.common.TdwgAreaProvider;
35
import eu.etaxonomy.cdm.model.common.Language;
36
import eu.etaxonomy.cdm.model.description.DescriptionBase;
37
import eu.etaxonomy.cdm.model.description.DescriptionElementBase;
38
import eu.etaxonomy.cdm.model.description.Distribution;
39
import eu.etaxonomy.cdm.model.description.Feature;
40
import eu.etaxonomy.cdm.model.description.PresenceAbsenceTerm;
41
import eu.etaxonomy.cdm.model.description.TaxonDescription;
42
import eu.etaxonomy.cdm.model.description.TextData;
43
import eu.etaxonomy.cdm.model.location.NamedArea;
44
import eu.etaxonomy.cdm.model.taxon.Taxon;
45
import eu.etaxonomy.cdm.model.taxon.TaxonBase;
46

    
47
/**
48
 * @author a.mueller
49
 * @created 17.10.2008
50
 * @version 1.0
51
 */
52
public class DipteraDistributionParser {
53
	private static final Logger logger = Logger.getLogger(DipteraDistributionParser.class);
54
	
55
	private static ICdmDataSource cdmDestination = CdmDestinations.cdm_local_dipera();
56

    
57
	final static String epiSplitter = "(\\s+|\\[|\\]|\\(|\\))"; //( ' '+| '(' | ')'| '[' | ']' )
58
	static Pattern pattern = null;
59
	
60
	protected void doDistribution(ICdmApplicationConfiguration app){
61
		pattern = Pattern.compile(epiSplitter); 
62
	    TransactionStatus txStatus = app.startTransaction();
63
		List<TaxonBase> taxa = app.getTaxonService().list(null, null, null, null, null);
64
		for (TaxonBase taxon: taxa ){
65
			if (taxon instanceof Taxon){
66
		//		unlazyDescription(app, (Taxon)taxon);
67
				Set<TaxonDescription> descriptions = ((Taxon) taxon).getDescriptions();
68
				for (DescriptionBase description: descriptions){
69
					Set<DescriptionElementBase> descElements = new HashSet<DescriptionElementBase>();
70
					descElements.addAll(description.getElements());
71
					
72
					for (DescriptionElementBase descEl: descElements){
73
						if (descEl.getFeature().equals(Feature.OCCURRENCE())){
74
							if (descEl instanceof TextData){
75
								String occString = ((TextData)descEl).getText(Language.ENGLISH());
76
								parseOccurenceString(occString, description);
77
								//app.getTaxonService().saveTaxon(taxon);
78
							}
79
						}
80
					}
81
				}
82
			}
83
		}
84
		System.out.println("Unknowns: ");
85
		for (String unknown: unrekognizedStrings){
86
			System.out.println(unknown);
87
		}
88
		System.out.println("Distributions not recognized: " + countNot);
89
		System.out.println("Distributions created: " + countYes);
90
		app.commitTransaction(txStatus);
91
	}
92
	
93
	static Set<String> unrekognizedStrings = new HashSet<String>();
94
	static int countNot = 0;
95
	static int countYes = 0;
96
	
97
	private void parseOccurenceString(String occString, DescriptionBase desc){
98
		System.out.println(occString);
99
		if (occString != null){
100
			String[] words = pattern.split(occString);
101
			int i = 0;
102
			int countSkip = 0;
103
			for (String word: words){
104
				if (word.contains("U.S.A")){
105
					logger.warn("U.S.A.");
106
				}
107
				boolean isDoubtful = false;
108
				if (countSkip > 0){
109
					countSkip--;
110
				}else if(word.trim().length() == 0){
111
					//skip
112
				}else{
113
					if (word.endsWith(":") && word.length()<=4){
114
						//Higher area
115
						//TODO
116
					}else{
117
						word = word.trim();
118
						if (word.contains("?")){
119
							isDoubtful = true;
120
							word = word.replace("?", "");
121
						}
122
						word = adaptWordsToTdwg(word);
123
						
124
						if (! "".equals(word) && ! TdwgAreaProvider.isTdwgAreaLabel(word) && ! TdwgAreaProvider.isTdwgAreaAbbreviation(word) && ! isDoubleArea(word)){
125
							for (countSkip = 1; countSkip <= 6; countSkip++){
126
								word = word.trim();
127
								if (! TdwgAreaProvider.isTdwgAreaLabel(word) && ! TdwgAreaProvider.isTdwgAreaAbbreviation(word) && ! isDoubleArea(word)){
128
									if (words.length > i + countSkip){
129
										word = word + " " + words[i + countSkip];
130
									}
131
									if (word.contains("?")){
132
										isDoubtful = true;
133
										word = word.replace("?", "");
134
									}
135
									word = adaptWordsToTdwg(word);
136
									if ("".equals(word)){
137
										break;
138
									}
139
								}else{
140
									break;
141
								}
142
							}
143
						}
144
						if ("".equals(word)){
145
							//countSkip = countSkip;
146
						}else if (! TdwgAreaProvider.isTdwgAreaLabel(word)  && ! TdwgAreaProvider.isTdwgAreaAbbreviation(word) &&  ! isDoubleArea(word)  ){
147
							if (word.contains("?")){
148
								logger.warn("XXX");
149
							}
150
							countNot++;
151
							System.out.println("   False:" + countNot + ": " + word);
152
							unrekognizedStrings.add(word);
153
							countSkip = 0;
154
						}else{
155
							if (word.equals("Netherlands")){
156
								if ( countSkip < 0 && words[i + 1].startsWith("Antilles")){
157
									word = "Netherlands Antilles";
158
									countSkip=2;
159
								}
160
							}
161
							PresenceAbsenceTerm term = PresenceAbsenceTerm.PRESENT();
162
							if (isDoubleArea(word)){
163
								NamedArea[] doubleArea = getDoubleArea(word);
164
								for (NamedArea area : doubleArea){
165
									Distribution distr = Distribution.NewInstance(area, term);
166
									desc.addElement(distr);
167
								}
168
							}else{
169
								NamedArea area;
170
								if (TdwgAreaProvider.isTdwgAreaLabel(word)){
171
									area = TdwgAreaProvider.getAreaByTdwgLabel(word);
172
								}else{
173
									area = TdwgAreaProvider.getAreaByTdwgAbbreviation(word);
174
								}
175
								if (isDoubtful){
176
									term = PresenceAbsenceTerm.INTRODUCED_PRESENCE_QUESTIONABLE();
177
								}
178
								Distribution distr = Distribution.NewInstance(area, term);
179
								desc.addElement(distr);
180
							}
181
							countYes++;
182
							System.out.println("      True:" + countYes + ": " + word);
183
							countSkip--;
184
						}
185
					}
186
				}
187
				i++;
188
			}
189
		}
190
	}
191
	
192
	private boolean isDoubleArea(String word){
193
		if ("Canary and Madeira Is.".equalsIgnoreCase(word) || 
194
				"southern Europe".equalsIgnoreCase(word) ||
195
				"former USSR: North and Central European territory".equalsIgnoreCase(word)
196
				){
197
			return true;
198
		}else{
199
			return false;
200
		}
201
	}
202
	
203
	private NamedArea[] getDoubleArea(String word){
204
		NamedArea[] result = new NamedArea[2];
205
		if ("Canary and Madeira Is.".equalsIgnoreCase(word)){
206
			 result[0] = TdwgAreaProvider.getAreaByTdwgAbbreviation("CNY");
207
			 result[1] = TdwgAreaProvider.getAreaByTdwgAbbreviation("MDR");
208
		}else if ("southern Europe".equalsIgnoreCase(word)){
209
			 result[0] = TdwgAreaProvider.getAreaByTdwgAbbreviation("12");
210
			 result[1] = TdwgAreaProvider.getAreaByTdwgAbbreviation("13");
211
		}else if ("former USSR: North and Central European territory".equalsIgnoreCase(word)){
212
			 result[0] = TdwgAreaProvider.getAreaByTdwgAbbreviation("RUN-OO");
213
			 result[1] = TdwgAreaProvider.getAreaByTdwgAbbreviation("RUC-OO");
214
		}else{
215
			logger.warn("Double area not recognized");
216
		}
217
		return result;
218
	}
219
	
220
	
221
	static List<String> stopWords = new ArrayList<String>();
222
	static List<String> unknownAreas = new ArrayList<String>();
223
	static List<String> higherAreas = new ArrayList<String>();
224
	
225
	private String adaptWordsToTdwg(String word){
226
		word = word.replace(",", "").replace(";", "");
227
		if (! word.contains("U.S.A")){
228
			word = word.replace(",", "").replace(".", "").replace(";", "");
229
		}else{
230
			word = word.replace(",", "").replace(";", "");
231
		}
232
		
233
		word = word.trim();
234
		if (word.endsWith("Is")){
235
			word = word + ".";
236
		}
237
		if (stopWords.size() == 0){
238
			initStopWords();
239
		}
240
		
241
		word = word.replace("Russia [North European territory]", "North European Russia");
242
		word = word.replace("Russia North European territory", "North European Russia");
243
		word = word.replace("Russia: North European territory", "North European Russia");
244
		word = word.replace("Russia: North European territory", "North European Russia");
245
				
246
		word = word.replace("Amber", "amber");
247
		
248
		
249
		word = word.replace("Prince Edward Is.", "Marion-Prince Edward Is.");
250
		//or word = word.replace("Prince Edward Is.", "Prince Edward I.");
251
		word = word.replace("Bahama Is.", "Bahamas");
252
		word = word.replace("Comores Is.", "Comoros");
253
		word = word.replace("former Yugoslavia", "Yugoslavia");
254
		word = word.replace("former Czechoslovakia", "Czechoslovakia");
255
		word = word.replace("Rhodesia", "Zimbabwe");
256
		word = word.replace("The Gambia", "Gambia, The");
257

    
258
		if (!word.contains("El Salvador")){
259
			word = word.replace("Salvador", "El Salvador");	
260
		}
261
		word = word.replace("Vera Cruz", "Veracruz");
262
		word = word.replace("Turkmenia", "Turkmenistan");
263
		word = word.replace("Qu\u00E9beck", "Qu\u00E9bec");
264
		word = word.replace("Quebeck", "Qu\u00E9bec");
265
		word = word.replace("Quebec", "Qu\u00E9bec");
266
		
267
		if (!word.contains("Gambia, The")){
268
			word = word.replace("Gambia", "Gambia, The");
269
		}
270
		word = word.replace("Mariana Is.", "Marianas");
271
		word = word.replace("Kenia", "Kenya");
272
		word = word.replace("Central Africa", "Central African Republic");
273
		word = word.replace("Canal Zone", "");
274
		//word = word.replace("Panama", "Panamá");
275
		word = word.replace("Panama", "Panam\u00E1");
276
		if (! word.contains("New South Wales")){
277
			word = word.replace("Wales", "Great Britain");
278
		}
279
		word = word.replace("Java", "Jawa");
280
		word = word.replace("former USSR: North European territory", "North European Russia");
281
		word = word.replace("former USSR: South European territory", "South European Russia");
282
		word = word.replace("former USSR: Soviet Middle Asia", "Middle Asia");
283
		
284
		word = word.replace("St Kitts-Nevis", "St.Kitts-Nevis");
285
		
286
		word = word.replace("oceanian islands", "Pacific");
287
		word = word.replace("Ussuri region", "Primorye");
288
		word = word.replace("Galapagos Is.", "Gal\u00E1pagos");
289
		word = word.replace("Tarapac\u00E1", "Tarapaca");
290
		word = word.replace("Reunion", "R\u00E9union");
291
		if (! word.contains("Is.")){
292
			word = word.replace("Galapagos", "Gal\u00E1pagos");
293
		}
294
		
295
		//word = word.replace("Galapagos Is.", "Galápagos");
296
		if (! word.contains("Peninsular")){
297
			word = word.replace("Malaysia", "Peninsular Malaysia");
298
		}
299
		word = word.replace("Polynesic Is.", "South Solomons");
300
		
301
		word = word.replace("Usbek SSR", "Uzbekistan");
302
		word = word.replace("Mexican amber", "Mexico");
303
		word = word.replace("Marocco", "Morocco");
304
		if (! word.contains("Tobago")){
305
			word = word.replace("Trinidad", "Trinidad-Tobago");
306
		}
307
		if (! word.contains("Trinidad")){
308
			word = word.replace("Tobago", "Trinidad-Tobago");
309
		}
310
		word = word.replace("Haiti", "Haiti");  
311
		word = word.replace("Moluccas", "Maluku");
312
		word = word.replace("Belau", "Palau");
313
		word = word.replace("Dominican amber", "Dominican Republic");
314
		if (! word.contains("Russian")){
315
			word = word.replace("Far East", "Russian Far East");
316
		}
317
		word = word.replace("Tahiti", "Society Is.");
318
		word = word.replace("Iraque", "Iraq");
319
		word = word.replace("Wake Island", "Wake I.");
320
		if (! word.contains("I.")){
321
			word = word.replace("Johnston I", "Johnston I.");
322
			word = word.replace("Wake I", "Wake I.");
323
			word = word.replace("Clipperton I", "Clipperton I.");
324
		}
325
		if (! word.contains("Provinces")){
326
			word = word.replace("Cape Province", "Cape Provinces");
327
		}
328
		word = word.replace("Eastern Cape Provinces", "Eastern Cape Province");
329
		word = word.replace("Western Cape Provinces", "Western Cape Province");
330
		if (! word.contains("Barbuda")){
331
			word = word.replace("Antigua", "Antigua-Barbuda");
332
		}
333
		if (! word.contains("St.")){
334
			word = word.replace("St Vincent", "St.Vincent");
335
			word = word.replace("St Lucia", "St.Lucia");
336
			word = word.replace("St Helena", "St.Helena");
337
		}
338
		word = word.replace("Asia-tropical", "Asia-Tropical");
339
		word = word.replace("Society Islands", "Society Is.");
340
		word = word.replace("Virgin Islands", "Virgin Is.");
341
		word = word.replace("Canary Islands", "Canary Is.");
342
		word = word.replace("Rhode Island", "Rhode I.");
343
		
344
		
345
		word = word.replace("Rodriguez", "Rodrigues");
346
		word = word.replace("British Colombia", "British Columbia");
347
		word = word.replace("Bermudas", "Bermuda");
348
		word = word.replace("Tunesia", "Tunisia");
349
		word = word.replace("Santos S\u00E3o Paulo", "S\u00E3o Paulo");
350
		word = word.replace("Transvaal", "Northern Provinces");
351
		word = word.replace("Tucum\u00E1n", "Tucuman");
352
//		if (!word.contains("Netherlands")){
353
//			
354
//		}
355
		
356
//		unknownAreas.add("Baltic amber");  
357
//		unknownAreas.add("Arabia"); 
358
						
359
		for (String stopWord : stopWords){
360
			if (stopWord.equals(word)){
361
				System.out.println("         STOP: " + word);
362
				return "";
363
			}
364
		}
365
		for (String unknownArea : unknownAreas){
366
			if (unknownArea.equals(word)){
367
				System.out.println("         UNKNOWN: " + word);
368
				return "";
369
			}
370
		}
371
		for (String higherArea : higherAreas){
372
			if (higherArea.equals(word)){
373
				return "";
374
			}
375
		}
376
		
377
		//higher regions
378
		
379
		return word;
380
	}
381
	
382
	private void initStopWords(){
383
		stopWords.add("and");
384
		stopWords.add("Is");
385
		stopWords.add("Is.");
386
		stopWords.add("Islands");
387
		stopWords.add("Island");
388
		
389
		stopWords.add("of");
390
		stopWords.add("areas");
391
		stopWords.add("USA");
392
		stopWords.add("Australia"); //except for Australia only
393
		stopWords.add("Argentina");		
394

    
395
		//unknownAreas.add("Panama");
396
		unknownAreas.add("South Africa");
397
		unknownAreas.add("Chile");
398

    
399
		unknownAreas.add("Baltic amber");  
400
		unknownAreas.add("Arabia"); 
401

    
402
			
403
		higherAreas.add("AF");
404
		higherAreas.add("OR");
405
		higherAreas.add("PA");
406
		higherAreas.add("AU");
407
		higherAreas.add("NE");
408
		
409
		higherAreas.add("NT");
410
	}
411

    
412
	
413
	/**
414
	 * @param args
415
	 */
416
	public static void main(String[] args) {
417
		CdmApplicationController app = null;
418
		DbSchemaValidation val = DbSchemaValidation.UPDATE;
419
		app = CdmApplicationController.NewInstance(cdmDestination, val);
420
		
421
		DipteraDistributionParser dipDist = new DipteraDistributionParser();
422
		if (app != null){
423
			dipDist.doDistribution(app);
424
		}else{
425
			logger.warn("No Application Context");
426
		}
427
	}
428
}
(3-3/4)