Project

General

Profile

Download (14 KB) Statistics
| Branch: | Tag: | Revision:
/**
* Copyright (C) 2007 EDIT
* European Distributed Institute of Taxonomy
* http://www.e-taxonomy.eu
*
* The contents of this file are subject to the Mozilla Public License Version 1.1
* See LICENSE.TXT at the top of this package for the full license terms.
*/
9

    
10
package eu.etaxonomy.cdm.app.berlinModelImport;
11

    
12
import java.util.ArrayList;
13
import java.util.HashSet;
14
import java.util.List;
15
import java.util.Set;
16
import java.util.regex.Pattern;
17

    
18
import org.apache.log4j.Logger;
19
import org.springframework.transaction.TransactionStatus;
20

    
21
import eu.etaxonomy.cdm.api.application.CdmApplicationController;
22
import eu.etaxonomy.cdm.app.common.CdmDestinations;
23
import eu.etaxonomy.cdm.database.DataSourceNotFoundException;
24
import eu.etaxonomy.cdm.database.DbSchemaValidation;
25
import eu.etaxonomy.cdm.database.ICdmDataSource;
26
import eu.etaxonomy.cdm.model.common.Language;
27
import eu.etaxonomy.cdm.model.common.init.TermNotFoundException;
28
import eu.etaxonomy.cdm.model.description.DescriptionBase;
29
import eu.etaxonomy.cdm.model.description.DescriptionElementBase;
30
import eu.etaxonomy.cdm.model.description.Distribution;
31
import eu.etaxonomy.cdm.model.description.Feature;
32
import eu.etaxonomy.cdm.model.description.PresenceAbsenceTermBase;
33
import eu.etaxonomy.cdm.model.description.PresenceTerm;
34
import eu.etaxonomy.cdm.model.description.TaxonDescription;
35
import eu.etaxonomy.cdm.model.description.TextData;
36
import eu.etaxonomy.cdm.model.location.NamedArea;
37
import eu.etaxonomy.cdm.model.location.TdwgArea;
38
import eu.etaxonomy.cdm.model.taxon.Taxon;
39
import eu.etaxonomy.cdm.model.taxon.TaxonBase;
40

    
41
/**
42
 * @author a.mueller
43
 * @created 17.10.2008
44
 * @version 1.0
45
 */
46
public class DipteraDistributionParser {
47
	private static final Logger logger = Logger.getLogger(DipteraDistributionParser.class);
48
	
49
	final static String epiSplitter = "(\\s+|\\[|\\]|\\(|\\))"; //( ' '+| '(' | ')'| '[' | ']' )
50
	static Pattern pattern = null;
51
	
52
	protected void doDistribution(CdmApplicationController app){
53
		pattern = Pattern.compile(epiSplitter); 
54
	    TransactionStatus txStatus = app.startTransaction();
55
		List<TaxonBase> taxa = app.getTaxonService().getAllTaxonBases(1000000, 0);
56
		for (TaxonBase taxon: taxa ){
57
			if (taxon instanceof Taxon){
58
		//		unlazyDescription(app, (Taxon)taxon);
59
				Set<TaxonDescription> descriptions = ((Taxon) taxon).getDescriptions();
60
				for (DescriptionBase description: descriptions){
61
					Set<DescriptionElementBase> descElements = new HashSet<DescriptionElementBase>();
62
					descElements.addAll(description.getElements());
63
					
64
					for (DescriptionElementBase descEl: descElements){
65
						if (descEl.getFeature().equals(Feature.OCCURRENCE())){
66
							if (descEl instanceof TextData){
67
								String occString = ((TextData)descEl).getText(Language.ENGLISH());
68
								parseOccurenceString(occString, description);
69
								//app.getTaxonService().saveTaxon(taxon);
70
							}
71
						}
72
					}
73
				}
74
			}
75
		}
76
		System.out.println("Unknowns: ");
77
		for (String unknown: unrekognizedStrings){
78
			System.out.println(unknown);
79
		}
80
		System.out.println("Distributions not recognized: " + countNot);
81
		System.out.println("Distributions created: " + countYes);
82
		app.commitTransaction(txStatus);
83
	}
84
	
85
	static Set<String> unrekognizedStrings = new HashSet<String>();
86
	static int countNot = 0;
87
	static int countYes = 0;
88
	
89
	private void parseOccurenceString(String occString, DescriptionBase desc){
90
		System.out.println(occString);
91
		if (occString != null){
92
			String[] words = pattern.split(occString);
93
			int i = 0;
94
			int countSkip = 0;
95
			for (String word: words){
96
				if (word.contains("U.S.A")){
97
					logger.warn("U.S.A.");
98
				}
99
				boolean isDoubtful = false;
100
				if (countSkip > 0){
101
					countSkip--;
102
				}else if(word.trim().length() == 0){
103
					//skip
104
				}else{
105
					if (word.endsWith(":") && word.length()<=4){
106
						//Higher area
107
						//TODO
108
					}else{
109
						word = word.trim();
110
						if (word.contains("?")){
111
							isDoubtful = true;
112
							word = word.replace("?", "");
113
						}
114
						word = adaptWordsToTdwg(word);
115
						
116
						if (! "".equals(word) && ! TdwgArea.isTdwgAreaLabel(word) && ! TdwgArea.isTdwgAreaAbbreviation(word) && ! isDoubleArea(word)){
117
							for (countSkip = 1; countSkip <= 6; countSkip++){
118
								word = word.trim();
119
								if (! TdwgArea.isTdwgAreaLabel(word) && ! TdwgArea.isTdwgAreaAbbreviation(word) && ! isDoubleArea(word)){
120
									if (words.length > i + countSkip){
121
										word = word + " " + words[i + countSkip];
122
									}
123
									if (word.contains("?")){
124
										isDoubtful = true;
125
										word = word.replace("?", "");
126
									}
127
									word = adaptWordsToTdwg(word);
128
									if ("".equals(word)){
129
										break;
130
									}
131
								}else{
132
									break;
133
								}
134
							}
135
						}
136
						if ("".equals(word)){
137
							//countSkip = countSkip;
138
						}else if (! TdwgArea.isTdwgAreaLabel(word)  && ! TdwgArea.isTdwgAreaAbbreviation(word) &&  ! isDoubleArea(word)  ){
139
							if (word.contains("?")){
140
								logger.warn("XXX");
141
							}
142
							countNot++;
143
							System.out.println("   False:" + countNot + ": " + word);
144
							unrekognizedStrings.add(word);
145
							countSkip = 0;
146
						}else{
147
							PresenceAbsenceTermBase<?> term = PresenceTerm.PRESENT();
148
							if (isDoubleArea(word)){
149
								NamedArea[] doubleArea = getDoubleArea(word);
150
								for (NamedArea area : doubleArea){
151
									Distribution distr = Distribution.NewInstance(area, term);
152
									desc.addElement(distr);
153
								}
154
							}else{
155
								NamedArea area;
156
								if (TdwgArea.isTdwgAreaLabel(word)){
157
									area = TdwgArea.getAreaByTdwgLabel(word);
158
								}else{
159
									area = TdwgArea.getAreaByTdwgAbbreviation(word);
160
								}
161
								if (isDoubtful){
162
									term = PresenceTerm.INTRODUCED_PRESENCE_QUESTIONABLE();
163
								}
164
								Distribution distr = Distribution.NewInstance(area, term);
165
								desc.addElement(distr);
166
							}
167
							countYes++;
168
							System.out.println("      True:" + countYes + ": " + word);
169
							countSkip--;
170
						}
171
					}
172
				}
173
				i++;
174
			}
175
		}
176
	}
177
	
178
	private boolean isDoubleArea(String word){
179
		if ("Canary and Madeira Is.".equalsIgnoreCase(word) || 
180
				"southern Europe".equalsIgnoreCase(word) ||
181
				"former USSR: North and Central European territory".equalsIgnoreCase(word)
182
				){
183
			return true;
184
		}else{
185
			return false;
186
		}
187
	}
188
	
189
	private NamedArea[] getDoubleArea(String word){
190
		NamedArea[] result = new NamedArea[2];
191
		if ("Canary and Madeira Is.".equalsIgnoreCase(word)){
192
			 result[0] = TdwgArea.getAreaByTdwgAbbreviation("CNY");
193
			 result[1] = TdwgArea.getAreaByTdwgAbbreviation("MDR");
194
		}else if ("southern Europe".equalsIgnoreCase(word)){
195
			 result[0] = TdwgArea.getAreaByTdwgAbbreviation("12");
196
			 result[1] = TdwgArea.getAreaByTdwgAbbreviation("13");
197
		}else if ("former USSR: North and Central European territory".equalsIgnoreCase(word)){
198
			 result[0] = TdwgArea.getAreaByTdwgAbbreviation("RUN-OO");
199
			 result[1] = TdwgArea.getAreaByTdwgAbbreviation("RUC-OO");
200
		}else{
201
			logger.warn("Double area not recognized");
202
		}
203
		return result;
204
	}
205
	
206
	
207
	static List<String> stopWords = new ArrayList<String>();
208
	static List<String> unknownAreas = new ArrayList<String>();
209
	static List<String> higherAreas = new ArrayList<String>();
210
	
211
	private String adaptWordsToTdwg(String word){
212
		word = word.replace(",", "").replace(";", "");
213
		if (! word.contains("U.S.A")){
214
			word = word.replace(",", "").replace(".", "").replace(";", "");
215
		}else{
216
			word = word.replace(",", "").replace(";", "");
217
		}
218
		
219
		word = word.trim();
220
		if (word.endsWith("Is")){
221
			word = word + ".";
222
		}
223
		if (stopWords.size() == 0){
224
			initStopWords();
225
		}
226
		
227
		word = word.replace("Russia [North European territory]", "North European Russia");
228
		word = word.replace("Russia North European territory", "North European Russia");
229
		word = word.replace("Russia: North European territory", "North European Russia");
230
		word = word.replace("Russia: North European territory", "North European Russia");
231
				
232
		word = word.replace("Amber", "amber");
233
		
234
		
235
		word = word.replace("Prince Edward Is.", "Marion-Prince Edward Is.");
236
		//or word = word.replace("Prince Edward Is.", "Prince Edward I.");
237
		word = word.replace("Bahama Is.", "Bahamas");
238
		word = word.replace("Comores Is.", "Comoros");
239
		word = word.replace("former Yugoslavia", "Yugoslavia");
240
		word = word.replace("former Czechoslovakia", "Czechoslovakia");
241
		word = word.replace("Rhodesia", "Zimbabwe");
242
		if (!word.contains("El Salvador")){
243
			word = word.replace("Salvador", "El Salvador");	
244
		}
245
		word = word.replace("Vera Cruz", "Veracruz");
246
		word = word.replace("Turkmenia", "Turkmenistan");
247
		word = word.replace("Québeck", "Québec");
248
		word = word.replace("Quebeck", "Québec");
249
		word = word.replace("Quebec", "Québec");
250
		//word = word.replace("Quebec", "Qu+®bec");
251
		//word = word.replace("Quebec", "Qu├®bec");
252
		
253
		word = word.replace("Gambia", "Gambia, The");
254
		word = word.replace("Mariana Is.", "Marianas");
255
		word = word.replace("Kenia", "Kenya");
256
		word = word.replace("Central Africa", "Central African Republic");
257
		word = word.replace("Canal Zone", "");
258
		//word = word.replace("Panama", "Panamá");
259
		word = word.replace("Panama", "Panamá");
260
		if (! word.contains("New South Wales")){
261
			word = word.replace("Wales", "Great Britain");
262
		}
263
		word = word.replace("Java", "Jawa");
264
		word = word.replace("former USSR: North European territory", "North European Russia");
265
		word = word.replace("former USSR: South European territory", "South European Russia");
266
		word = word.replace("former USSR: Soviet Middle Asia", "Middle Asia");
267
		
268
		word = word.replace("oceanian islands", "Pacific");
269
		word = word.replace("Ussuri region", "Primorye");
270
		word = word.replace("Galapagos Is.", "Galápagos");
271
		if (! word.contains("Is.")){
272
			word = word.replace("Galapagos", "Galápagos");
273
		}
274
		
275
		//word = word.replace("Galapagos Is.", "Galápagos");
276
		if (! word.contains("Peninsular")){
277
			word = word.replace("Malaysia", "Peninsular Malaysia");
278
		}
279
		word = word.replace("Polynesic Is.", "South Solomons");
280
		
281
		word = word.replace("Usbek SSR", "Uzbekistan");
282
		word = word.replace("Mexican amber", "Mexico");
283
		word = word.replace("Marocco", "Morocco");
284
		if (! word.contains("Tobago")){
285
			word = word.replace("Trinidad", "Trinidad-Tobago");
286
		}
287
		if (! word.contains("Trinidad")){
288
			word = word.replace("Tobago", "Trinidad-Tobago");
289
		}
290
		word = word.replace("Haiti", "Haiti");  
291
		word = word.replace("Moluccas", "Maluku");
292
		word = word.replace("Belau", "Palau");
293
		word = word.replace("Dominican amber", "Dominican Republic");
294
		if (! word.contains("Russian")){
295
			word = word.replace("Far East", "Russian Far East");
296
		}
297
		word = word.replace("Tahiti", "Society Is.");
298
		word = word.replace("Iraque", "Iraq");
299
		word = word.replace("Wake Island", "Wake I.");
300
		if (! word.contains("I.")){
301
			word = word.replace("Johnston I", "Johnston I.");
302
			word = word.replace("Wake I", "Wake I.");
303
			word = word.replace("Clipperton I", "Clipperton I.");
304
		}
305
		if (! word.contains("Provinces")){
306
			word = word.replace("Cape Province", "Cape Provinces");
307
		}
308
		word = word.replace("Eastern Cape Provinces", "Eastern Cape Province");
309
		if (! word.contains("Barbuda")){
310
			word = word.replace("Antigua", "Antigua-Barbuda");
311
		}
312
		if (! word.contains("St.")){
313
			word = word.replace("St Vincent", "St.Vincent");
314
			word = word.replace("St Lucia", "St.Lucia");
315
			word = word.replace("St Helena", "St.Helena");
316
		}
317
		word = word.replace("Asia-tropical", "Asia-Tropical");
318
		word = word.replace("Society Islands", "Society Is.");
319
		word = word.replace("Virgin Islands", "Virgin Is.");
320
		word = word.replace("Canary Islands", "Canary Is.");
321
		word = word.replace("Rhode Island", "Rhode I.");
322
		
323
		
324
		word = word.replace("Rodriguez", "Rodrigues");
325
		word = word.replace("British Colombia", "British Columbia");
326
		word = word.replace("Bermudas", "Bermuda");
327
		word = word.replace("Tunesia", "Tunisia");
328
		word = word.replace("Santos São Paulo", "São Paulo");
329
		word = word.replace("Transvaal", "Northern Provinces");
330
		word = word.replace("Tucumán", "Tucuman");
331
		
332
		
333
//		unknownAreas.add("Baltic amber");  
334
//		unknownAreas.add("Arabia"); 
335
						
336
		for (String stopWord : stopWords){
337
			if (stopWord.equals(word)){
338
				System.out.println("         STOP: " + word);
339
				return "";
340
			}
341
		}
342
		for (String unknownArea : unknownAreas){
343
			if (unknownArea.equals(word)){
344
				System.out.println("         UNKNOWN: " + word);
345
				return "";
346
			}
347
		}
348
		for (String higherArea : higherAreas){
349
			if (higherArea.equals(word)){
350
				return "";
351
			}
352
		}
353
		
354
		//higher regions
355
		
356
		return word;
357
	}
358
	
359
	private void initStopWords(){
360
		stopWords.add("and");
361
		stopWords.add("Is");
362
		stopWords.add("Is.");
363
		stopWords.add("Islands");
364
		stopWords.add("Island");
365
		
366
		stopWords.add("of");
367
		stopWords.add("areas");
368
		stopWords.add("USA");
369
		stopWords.add("Australia"); //except for Australia only
370
		stopWords.add("Argentina");		
371

    
372
		//unknownAreas.add("Panama");
373
		unknownAreas.add("South Africa");
374
		unknownAreas.add("Chile");
375

    
376
		unknownAreas.add("Baltic amber");  
377
		unknownAreas.add("Arabia"); 
378

    
379
			
380
		higherAreas.add("AF");
381
		higherAreas.add("OR");
382
		higherAreas.add("PA");
383
		higherAreas.add("AU");
384
		higherAreas.add("NE");
385
		
386
		higherAreas.add("NT");
387
	}
388

    
389
	
390
	/**
391
	 * @param args
392
	 */
393
	public static void main(String[] args) {
394
		ICdmDataSource cdmDestination = CdmDestinations.localH2();
395
		CdmApplicationController app = null;
396
		try {
397
			DbSchemaValidation val = DbSchemaValidation.UPDATE;
398
			app = CdmApplicationController.NewInstance(cdmDestination, val);
399
		} catch (DataSourceNotFoundException e) {
400
			e.printStackTrace();
401
		} catch (TermNotFoundException e) {
402
			e.printStackTrace();
403
		}
404
		DipteraDistributionParser dipDist = new DipteraDistributionParser();
405
		if (app != null){
406
			dipDist.doDistribution(app);
407
		}else{
408
			logger.warn("No Application Context");
409
		}
410
	}
411
}
(5-5/9)