Project

General

Profile

« Previous | Next » 

Revision 275ada95

Added by Andreas Müller almost 9 years ago

Remove local diptera db from destinations

View differences:

app-import/src/main/java/eu/etaxonomy/cdm/app/wp6/diptera/DipteraCollectionImport.java
1
// $Id$
2
/**
3
* Copyright (C) 2007 EDIT
4
* European Distributed Institute of Taxonomy 
5
* http://www.e-taxonomy.eu
6
* 
7
* The contents of this file are subject to the Mozilla Public License Version 1.1
8
* See LICENSE.TXT at the top of this package for the full license terms.
9
*/
10
package eu.etaxonomy.cdm.app.wp6.diptera;
11

  
12
import java.io.File;
13
import java.io.FileInputStream;
14
import java.io.InputStream;
15
import java.io.InputStreamReader;
16
import java.util.ArrayList;
17
import java.util.HashMap;
18
import java.util.List;
19
import java.util.Map;
20

  
21
import org.apache.commons.lang.StringUtils;
22
import org.apache.log4j.Logger;
23
import org.springframework.transaction.TransactionStatus;
24

  
25
import au.com.bytecode.opencsv.CSVReader;
26
import eu.etaxonomy.cdm.api.application.CdmApplicationController;
27
import eu.etaxonomy.cdm.app.common.CdmDestinations;
28
import eu.etaxonomy.cdm.common.CdmUtils;
29
import eu.etaxonomy.cdm.database.DbSchemaValidation;
30
import eu.etaxonomy.cdm.database.ICdmDataSource;
31
import eu.etaxonomy.cdm.model.agent.Institution;
32
import eu.etaxonomy.cdm.model.occurrence.Collection;
33
import eu.etaxonomy.cdm.model.occurrence.DerivedUnit;
34
import eu.etaxonomy.cdm.model.occurrence.SpecimenOrObservationBase;
35

  
36
/**
37
 * @author a.mueller
38
 * @date 07.04.2010
39
 *
40
 */
41
public class DipteraCollectionImport {
42
	private static final Logger logger = Logger.getLogger(DipteraCollectionImport.class);
43

  
44
	public static final File acronymsFile = new File("src/main/resources/collections/Acronyms.tab");
45
	//datasource for use from local main()
46
	static final ICdmDataSource cdmDestination = CdmDestinations.cdm_local_diptera();
47
	
48

  
49
	public boolean invoke(ICdmDataSource dataSource) {
50
		CdmApplicationController cdmApp = CdmApplicationController.NewInstance(dataSource, DbSchemaValidation.VALIDATE);
51
			
52
		//create collections
53
		TransactionStatus tx = cdmApp.startTransaction();
54
		Map<String, Collection> colletionMap = createCollections(cdmApp);
55
		
56
		//add collections to specimen
57
		addCollectionsToSpecimen(cdmApp, colletionMap);
58
		cdmApp.commitTransaction(tx);
59
		
60
		return true;
61
		
62
	}
63

  
64

  
65
	/**
66
	 * @param cdmApp
67
	 * @param colletionMap 
68
	 */
69
	private void addCollectionsToSpecimen(CdmApplicationController cdmApp, Map<String, Collection> colletionMap) {
70
		List<DerivedUnit> specimens = cdmApp.getOccurrenceService().list(DerivedUnit.class, null, null, null, null);
71
		for (SpecimenOrObservationBase<?> specOrObservBase : specimens){
72
			if (specOrObservBase.getRecordBasis().isPreservedSpecimen()){
73
				handleSingleSpecimen((DerivedUnit)specOrObservBase, colletionMap);
74
			}else{
75
				logger.warn("There are specimenOrObservationBase objects which are not of class Specimen. This is probably an error.");
76
			}
77
		}
78
		List<SpecimenOrObservationBase> specimenList = new ArrayList<SpecimenOrObservationBase>(specimens);
79
		cdmApp.getOccurrenceService().save(specimenList);
80
	}
81

  
82

  
83
	/**
84
	 * @param specimen 
85
	 * @param colletionMap
86
	 */
87
	private void handleSingleSpecimen(DerivedUnit specimen, Map<String, Collection> collectionMap) {
88
		String titleCache = specimen.getTitleCache();
89
		String collectionCode = getCollectionCode(titleCache);
90
		if (StringUtils.isBlank(collectionCode)){
91
			logger.warn("Collection code is empty for: " + titleCache);
92
		}else{
93
			Collection collection = collectionMap.get(collectionCode);
94
			if (collection != null){
95
				specimen.setCollection(collection);
96
			}else{
97
				logger.warn("Collection not found for code: " +  collectionCode + "; titleCache: " +  titleCache);
98
			}
99
		}
100
	}
101

  
102

  
103
	/**
104
	 * @param titleCache
105
	 * @return
106
	 */
107
	private String getCollectionCode(String titleCache) {
108
		String result = titleCache.trim();
109
		result = replaceBracket(result);
110
		result = replaceLastFullStop(result);
111
		result = replaceLastQuestionMark(result);
112
		result = parseLastUpperCase(result);
113
		return result;
114
	}
115

  
116

  
117
	/**
118
	 * @param result
119
	 * @return
120
	 */
121
	private String parseLastUpperCase(String string) {
122
		String result = "";
123
		String tmpString = string;
124
		int pos = tmpString.lastIndexOf(" ");
125
		if (pos>-1){
126
			tmpString = tmpString.substring(pos+1);
127
		}
128
		while (tmpString.length() > 0){
129
			int len = tmpString.length();
130
			char lastChar = tmpString.charAt(len-1);
131
			if (Character.isUpperCase( lastChar)){
132
				result = lastChar + result;
133
			}else{
134
				if (result.length() > 0){
135
					logger.warn("Collection code is not space separated: " + string);
136
				}
137
				break;
138
			}
139
			//remove last character
140
			tmpString = tmpString.substring(0, tmpString.length()-1);
141
		}
142
		return result;
143
	}
144

  
145

  
146

  
147
	/**
148
	 * @param result
149
	 * @return
150
	 */
151
	private String replaceLastQuestionMark(String string) {
152
		if (string.endsWith("?")){
153
			string = string.substring(0,string.length()-1).trim();
154
		}
155
		return string;
156
	}
157
	
158
	/**
159
	 * @param result
160
	 * @return
161
	 */
162
	private String replaceLastFullStop(String string) {
163
		if (string.endsWith(".")){
164
			string = string.substring(0,string.length()-1).trim();
165
		}
166
		return string;
167
	}
168

  
169

  
170
	/**
171
	 * @param result
172
	 * @return
173
	 */
174
	private String replaceBracket(String string) {
175
		if (string.endsWith("]")){
176
			int pos  = string.indexOf("[");
177
			if (pos >0){
178
				string = string.substring(0, pos).trim();
179
			}else{
180
				logger.warn("Closing bracket has no opening bracket in: " + string);
181
			}
182
		}
183
		return string;
184
	}
185

  
186

  
187
	/**
188
	 * @param cdmApp
189
	 */
190
	private Map<String, Collection> createCollections(CdmApplicationController cdmApp) {
191
		Map<String, Collection> collectionMap = new HashMap<String, Collection>(); 
192
		List<String[]> lines = getLines();
193
		for (String[] line:lines){
194
			Collection collection = makeLine(line);
195
			collectionMap.put(collection.getCode(), collection);
196
		}
197
		cdmApp.getCollectionService().save(collectionMap.values());
198
//			for (Collection collection: collectionMap.values()){
199
//				System.out.println(collection.getTitleCache());
200
//			}
201
		return collectionMap;
202
	}
203
	
204

  
205
	private Collection makeLine(String[] line) {
206
		String code = line[0];
207
		String instituteName = line[1];
208
		String lowerInstitutionName = line[2];
209
		String higherInstitutionName = line[3];
210
		String location = line[4];
211
		String country = line[5];
212
		//create objects
213
		Collection collection = Collection.NewInstance();
214
		collection.setCode(code);
215
		Institution institution = Institution.NewInstance();
216
		institution.setCode(code);
217
		
218
		institution.setName(instituteName);
219
		
220
		if (StringUtils.isNotBlank(lowerInstitutionName)){
221
			Institution lowerInstitution = Institution.NewInstance();
222
			lowerInstitution.setName(lowerInstitutionName);
223
			lowerInstitution.setIsPartOf(institution);
224
		}
225
		
226
		if (StringUtils.isNotBlank(higherInstitutionName)){
227
			Institution higherInstitution = Institution.NewInstance();
228
			higherInstitution.setName(higherInstitutionName);
229
			institution.setIsPartOf(higherInstitution);
230
		}
231
		
232
		collection.setInstitute(institution);
233
		String locationAndCountry = CdmUtils.concat("/", location, country);
234
		collection.setTownOrLocation(locationAndCountry);
235
		
236
		String titleCache = CdmUtils.concat(", ", new String[]{instituteName, lowerInstitutionName, higherInstitutionName, location, country});
237
		collection.setTitleCache(titleCache, true);
238
		
239
		return collection;
240
	}
241

  
242
	
243
	
244
	
245
	private List<String[]> getLines() {
246
		List<String[]> result = new ArrayList<String[]>();
247
		
248
		try {
249
			InputStream inStream = new FileInputStream(acronymsFile);
250
			InputStreamReader inputStreamReader = new InputStreamReader(inStream, "UTF8");
251
			CSVReader reader = new CSVReader(inputStreamReader, '\t');
252
			String [] nextLine = reader.readNext();
253
			
254
			
255
			while ((nextLine = reader.readNext()) != null) {
256
				if (nextLine.length == 0){
257
					continue;
258
				}
259
				result.add(nextLine);
260
			}
261
			return result;
262
		} catch (Exception e) {
263
			logger.error(e + " " + e.getCause() + " " + e.getMessage());
264
			for(StackTraceElement ste : e.getStackTrace()) {
265
				logger.error(ste);
266
			}
267
			throw new RuntimeException(e);
268
		}
269
	}
270

  
271

  
272

  
273

  
274

  
275
	/**
276
	 * @param args
277
	 */
278
	public static void main(String[] args) {
279
		try {
280
			DipteraCollectionImport collectionImport = new DipteraCollectionImport();
281
			collectionImport.invoke(cdmDestination);
282
//			String titleCache = "Peru. Mouth of Rio Pachitea. ST 2R SMT. [fig. of male abdomen]";
283
//			String collectionCode = collectionImport.getCollectionCode(titleCache);
284
//			System.out.println(collectionCode);
285
		} catch (Exception e) {
286
			e.printStackTrace();
287
			System.exit(-1);
288
		}
289
	}
290

  
291
}
1
// $Id$
2
/**
3
* Copyright (C) 2007 EDIT
4
* European Distributed Institute of Taxonomy
5
* http://www.e-taxonomy.eu
6
*
7
* The contents of this file are subject to the Mozilla Public License Version 1.1
8
* See LICENSE.TXT at the top of this package for the full license terms.
9
*/
10
package eu.etaxonomy.cdm.app.wp6.diptera;
11

  
12
import java.io.File;
13
import java.io.FileInputStream;
14
import java.io.InputStream;
15
import java.io.InputStreamReader;
16
import java.util.ArrayList;
17
import java.util.HashMap;
18
import java.util.List;
19
import java.util.Map;
20

  
21
import org.apache.commons.lang.StringUtils;
22
import org.apache.log4j.Logger;
23
import org.springframework.transaction.TransactionStatus;
24

  
25
import au.com.bytecode.opencsv.CSVReader;
26
import eu.etaxonomy.cdm.api.application.CdmApplicationController;
27
import eu.etaxonomy.cdm.app.common.CdmDestinations;
28
import eu.etaxonomy.cdm.common.CdmUtils;
29
import eu.etaxonomy.cdm.database.DbSchemaValidation;
30
import eu.etaxonomy.cdm.database.ICdmDataSource;
31
import eu.etaxonomy.cdm.model.agent.Institution;
32
import eu.etaxonomy.cdm.model.occurrence.Collection;
33
import eu.etaxonomy.cdm.model.occurrence.DerivedUnit;
34
import eu.etaxonomy.cdm.model.occurrence.SpecimenOrObservationBase;
35

  
36
/**
37
 * @author a.mueller
38
 * @date 07.04.2010
39
 *
40
 */
41
public class DipteraCollectionImport {
42
	private static final Logger logger = Logger.getLogger(DipteraCollectionImport.class);
43

  
44
	public static final File acronymsFile = new File("src/main/resources/collections/Acronyms.tab");
45
	//datasource for use from local main()
46
	static final ICdmDataSource cdmDestination = CdmDestinations.localH2();
47

  
48

  
49
	public boolean invoke(ICdmDataSource dataSource) {
50
		CdmApplicationController cdmApp = CdmApplicationController.NewInstance(dataSource, DbSchemaValidation.VALIDATE);
51

  
52
		//create collections
53
		TransactionStatus tx = cdmApp.startTransaction();
54
		Map<String, Collection> colletionMap = createCollections(cdmApp);
55

  
56
		//add collections to specimen
57
		addCollectionsToSpecimen(cdmApp, colletionMap);
58
		cdmApp.commitTransaction(tx);
59

  
60
		return true;
61

  
62
	}
63

  
64

  
65
	/**
66
	 * @param cdmApp
67
	 * @param colletionMap
68
	 */
69
	private void addCollectionsToSpecimen(CdmApplicationController cdmApp, Map<String, Collection> colletionMap) {
70
		List<DerivedUnit> specimens = cdmApp.getOccurrenceService().list(DerivedUnit.class, null, null, null, null);
71
		for (SpecimenOrObservationBase<?> specOrObservBase : specimens){
72
			if (specOrObservBase.getRecordBasis().isPreservedSpecimen()){
73
				handleSingleSpecimen((DerivedUnit)specOrObservBase, colletionMap);
74
			}else{
75
				logger.warn("There are specimenOrObservationBase objects which are not of class Specimen. This is probably an error.");
76
			}
77
		}
78
		List<SpecimenOrObservationBase> specimenList = new ArrayList<SpecimenOrObservationBase>(specimens);
79
		cdmApp.getOccurrenceService().save(specimenList);
80
	}
81

  
82

  
83
	/**
84
	 * @param specimen
85
	 * @param colletionMap
86
	 */
87
	private void handleSingleSpecimen(DerivedUnit specimen, Map<String, Collection> collectionMap) {
88
		String titleCache = specimen.getTitleCache();
89
		String collectionCode = getCollectionCode(titleCache);
90
		if (StringUtils.isBlank(collectionCode)){
91
			logger.warn("Collection code is empty for: " + titleCache);
92
		}else{
93
			Collection collection = collectionMap.get(collectionCode);
94
			if (collection != null){
95
				specimen.setCollection(collection);
96
			}else{
97
				logger.warn("Collection not found for code: " +  collectionCode + "; titleCache: " +  titleCache);
98
			}
99
		}
100
	}
101

  
102

  
103
	/**
104
	 * @param titleCache
105
	 * @return
106
	 */
107
	private String getCollectionCode(String titleCache) {
108
		String result = titleCache.trim();
109
		result = replaceBracket(result);
110
		result = replaceLastFullStop(result);
111
		result = replaceLastQuestionMark(result);
112
		result = parseLastUpperCase(result);
113
		return result;
114
	}
115

  
116

  
117
	/**
118
	 * @param result
119
	 * @return
120
	 */
121
	private String parseLastUpperCase(String string) {
122
		String result = "";
123
		String tmpString = string;
124
		int pos = tmpString.lastIndexOf(" ");
125
		if (pos>-1){
126
			tmpString = tmpString.substring(pos+1);
127
		}
128
		while (tmpString.length() > 0){
129
			int len = tmpString.length();
130
			char lastChar = tmpString.charAt(len-1);
131
			if (Character.isUpperCase( lastChar)){
132
				result = lastChar + result;
133
			}else{
134
				if (result.length() > 0){
135
					logger.warn("Collection code is not space separated: " + string);
136
				}
137
				break;
138
			}
139
			//remove last character
140
			tmpString = tmpString.substring(0, tmpString.length()-1);
141
		}
142
		return result;
143
	}
144

  
145

  
146

  
147
	/**
148
	 * @param result
149
	 * @return
150
	 */
151
	private String replaceLastQuestionMark(String string) {
152
		if (string.endsWith("?")){
153
			string = string.substring(0,string.length()-1).trim();
154
		}
155
		return string;
156
	}
157

  
158
	/**
159
	 * @param result
160
	 * @return
161
	 */
162
	private String replaceLastFullStop(String string) {
163
		if (string.endsWith(".")){
164
			string = string.substring(0,string.length()-1).trim();
165
		}
166
		return string;
167
	}
168

  
169

  
170
	/**
171
	 * @param result
172
	 * @return
173
	 */
174
	private String replaceBracket(String string) {
175
		if (string.endsWith("]")){
176
			int pos  = string.indexOf("[");
177
			if (pos >0){
178
				string = string.substring(0, pos).trim();
179
			}else{
180
				logger.warn("Closing bracket has no opening bracket in: " + string);
181
			}
182
		}
183
		return string;
184
	}
185

  
186

  
187
	/**
188
	 * @param cdmApp
189
	 */
190
	private Map<String, Collection> createCollections(CdmApplicationController cdmApp) {
191
		Map<String, Collection> collectionMap = new HashMap<String, Collection>();
192
		List<String[]> lines = getLines();
193
		for (String[] line:lines){
194
			Collection collection = makeLine(line);
195
			collectionMap.put(collection.getCode(), collection);
196
		}
197
		cdmApp.getCollectionService().save(collectionMap.values());
198
//			for (Collection collection: collectionMap.values()){
199
//				System.out.println(collection.getTitleCache());
200
//			}
201
		return collectionMap;
202
	}
203

  
204

  
205
	private Collection makeLine(String[] line) {
206
		String code = line[0];
207
		String instituteName = line[1];
208
		String lowerInstitutionName = line[2];
209
		String higherInstitutionName = line[3];
210
		String location = line[4];
211
		String country = line[5];
212
		//create objects
213
		Collection collection = Collection.NewInstance();
214
		collection.setCode(code);
215
		Institution institution = Institution.NewInstance();
216
		institution.setCode(code);
217

  
218
		institution.setName(instituteName);
219

  
220
		if (StringUtils.isNotBlank(lowerInstitutionName)){
221
			Institution lowerInstitution = Institution.NewInstance();
222
			lowerInstitution.setName(lowerInstitutionName);
223
			lowerInstitution.setIsPartOf(institution);
224
		}
225

  
226
		if (StringUtils.isNotBlank(higherInstitutionName)){
227
			Institution higherInstitution = Institution.NewInstance();
228
			higherInstitution.setName(higherInstitutionName);
229
			institution.setIsPartOf(higherInstitution);
230
		}
231

  
232
		collection.setInstitute(institution);
233
		String locationAndCountry = CdmUtils.concat("/", location, country);
234
		collection.setTownOrLocation(locationAndCountry);
235

  
236
		String titleCache = CdmUtils.concat(", ", new String[]{instituteName, lowerInstitutionName, higherInstitutionName, location, country});
237
		collection.setTitleCache(titleCache, true);
238

  
239
		return collection;
240
	}
241

  
242

  
243

  
244

  
245
	private List<String[]> getLines() {
246
		List<String[]> result = new ArrayList<String[]>();
247

  
248
		try {
249
			InputStream inStream = new FileInputStream(acronymsFile);
250
			InputStreamReader inputStreamReader = new InputStreamReader(inStream, "UTF8");
251
			CSVReader reader = new CSVReader(inputStreamReader, '\t');
252
			String [] nextLine = reader.readNext();
253

  
254

  
255
			while ((nextLine = reader.readNext()) != null) {
256
				if (nextLine.length == 0){
257
					continue;
258
				}
259
				result.add(nextLine);
260
			}
261
			return result;
262
		} catch (Exception e) {
263
			logger.error(e + " " + e.getCause() + " " + e.getMessage());
264
			for(StackTraceElement ste : e.getStackTrace()) {
265
				logger.error(ste);
266
			}
267
			throw new RuntimeException(e);
268
		}
269
	}
270

  
271

  
272

  
273

  
274

  
275
	/**
276
	 * @param args
277
	 */
278
	public static void main(String[] args) {
279
		try {
280
			DipteraCollectionImport collectionImport = new DipteraCollectionImport();
281
			collectionImport.invoke(cdmDestination);
282
//			String titleCache = "Peru. Mouth of Rio Pachitea. ST 2R SMT. [fig. of male abdomen]";
283
//			String collectionCode = collectionImport.getCollectionCode(titleCache);
284
//			System.out.println(collectionCode);
285
		} catch (Exception e) {
286
			e.printStackTrace();
287
			System.exit(-1);
288
		}
289
	}
290

  
291
}
app-import/src/main/java/eu/etaxonomy/cdm/app/wp6/diptera/DipteraDistributionParser.java
1
/**
2
* Copyright (C) 2007 EDIT
3
* European Distributed Institute of Taxonomy 
4
* http://www.e-taxonomy.eu
5
* 
6
* The contents of this file are subject to the Mozilla Public License Version 1.1
7
* See LICENSE.TXT at the top of this package for the full license terms.
8
*/
9

  
10
/**
11
* Copyright (C) 2007 EDIT
12
* European Distributed Institute of Taxonomy 
13
* http://www.e-taxonomy.eu
14
* 
15
* The contents of this file are subject to the Mozilla Public License Version 1.1
16
* See LICENSE.TXT at the top of this package for the full license terms.
17
*/
18
package eu.etaxonomy.cdm.app.wp6.diptera;
19

  
20
import java.util.ArrayList;
21
import java.util.HashSet;
22
import java.util.List;
23
import java.util.Set;
24
import java.util.regex.Pattern;
25

  
26
import org.apache.log4j.Logger;
27
import org.springframework.transaction.TransactionStatus;
28

  
29
import eu.etaxonomy.cdm.api.application.CdmApplicationController;
30
import eu.etaxonomy.cdm.api.application.ICdmApplicationConfiguration;
31
import eu.etaxonomy.cdm.app.common.CdmDestinations;
32
import eu.etaxonomy.cdm.database.DbSchemaValidation;
33
import eu.etaxonomy.cdm.database.ICdmDataSource;
34
import eu.etaxonomy.cdm.io.common.TdwgAreaProvider;
35
import eu.etaxonomy.cdm.model.common.Language;
36
import eu.etaxonomy.cdm.model.description.DescriptionBase;
37
import eu.etaxonomy.cdm.model.description.DescriptionElementBase;
38
import eu.etaxonomy.cdm.model.description.Distribution;
39
import eu.etaxonomy.cdm.model.description.Feature;
40
import eu.etaxonomy.cdm.model.description.PresenceAbsenceTerm;
41
import eu.etaxonomy.cdm.model.description.TaxonDescription;
42
import eu.etaxonomy.cdm.model.description.TextData;
43
import eu.etaxonomy.cdm.model.location.NamedArea;
44
import eu.etaxonomy.cdm.model.taxon.Taxon;
45
import eu.etaxonomy.cdm.model.taxon.TaxonBase;
46

  
47
/**
48
 * @author a.mueller
49
 * @created 17.10.2008
50
 * @version 1.0
51
 */
52
public class DipteraDistributionParser {
53
	private static final Logger logger = Logger.getLogger(DipteraDistributionParser.class);
54
	
55
	private static ICdmDataSource cdmDestination = CdmDestinations.cdm_local_diptera();
56

  
57
	final static String epiSplitter = "(\\s+|\\[|\\]|\\(|\\))"; //( ' '+| '(' | ')'| '[' | ']' )
58
	static Pattern pattern = null;
59
	
60
	protected void doDistribution(ICdmApplicationConfiguration app){
61
		pattern = Pattern.compile(epiSplitter); 
62
	    TransactionStatus txStatus = app.startTransaction();
63
		List<TaxonBase> taxa = app.getTaxonService().list(null, null, null, null, null);
64
		for (TaxonBase taxon: taxa ){
65
			if (taxon instanceof Taxon){
66
		//		unlazyDescription(app, (Taxon)taxon);
67
				Set<TaxonDescription> descriptions = ((Taxon) taxon).getDescriptions();
68
				for (DescriptionBase description: descriptions){
69
					Set<DescriptionElementBase> descElements = new HashSet<DescriptionElementBase>();
70
					descElements.addAll(description.getElements());
71
					
72
					for (DescriptionElementBase descEl: descElements){
73
						if (descEl.getFeature().equals(Feature.OCCURRENCE())){
74
							if (descEl instanceof TextData){
75
								String occString = ((TextData)descEl).getText(Language.ENGLISH());
76
								parseOccurenceString(occString, description);
77
								//app.getTaxonService().saveTaxon(taxon);
78
							}
79
						}
80
					}
81
				}
82
			}
83
		}
84
		System.out.println("Unknowns: ");
85
		for (String unknown: unrekognizedStrings){
86
			System.out.println(unknown);
87
		}
88
		System.out.println("Distributions not recognized: " + countNot);
89
		System.out.println("Distributions created: " + countYes);
90
		app.commitTransaction(txStatus);
91
	}
92
	
93
	static Set<String> unrekognizedStrings = new HashSet<String>();
94
	static int countNot = 0;
95
	static int countYes = 0;
96
	
97
	private void parseOccurenceString(String occString, DescriptionBase desc){
98
		System.out.println(occString);
99
		if (occString != null){
100
			String[] words = pattern.split(occString);
101
			int i = 0;
102
			int countSkip = 0;
103
			for (String word: words){
104
				if (word.contains("U.S.A")){
105
					logger.warn("U.S.A.");
106
				}
107
				boolean isDoubtful = false;
108
				if (countSkip > 0){
109
					countSkip--;
110
				}else if(word.trim().length() == 0){
111
					//skip
112
				}else{
113
					if (word.endsWith(":") && word.length()<=4){
114
						//Higher area
115
						//TODO
116
					}else{
117
						word = word.trim();
118
						if (word.contains("?")){
119
							isDoubtful = true;
120
							word = word.replace("?", "");
121
						}
122
						word = adaptWordsToTdwg(word);
123
						
124
						if (! "".equals(word) && ! TdwgAreaProvider.isTdwgAreaLabel(word) && ! TdwgAreaProvider.isTdwgAreaAbbreviation(word) && ! isDoubleArea(word)){
125
							for (countSkip = 1; countSkip <= 6; countSkip++){
126
								word = word.trim();
127
								if (! TdwgAreaProvider.isTdwgAreaLabel(word) && ! TdwgAreaProvider.isTdwgAreaAbbreviation(word) && ! isDoubleArea(word)){
128
									if (words.length > i + countSkip){
129
										word = word + " " + words[i + countSkip];
130
									}
131
									if (word.contains("?")){
132
										isDoubtful = true;
133
										word = word.replace("?", "");
134
									}
135
									word = adaptWordsToTdwg(word);
136
									if ("".equals(word)){
137
										break;
138
									}
139
								}else{
140
									break;
141
								}
142
							}
143
						}
144
						if ("".equals(word)){
145
							//countSkip = countSkip;
146
						}else if (! TdwgAreaProvider.isTdwgAreaLabel(word)  && ! TdwgAreaProvider.isTdwgAreaAbbreviation(word) &&  ! isDoubleArea(word)  ){
147
							if (word.contains("?")){
148
								logger.warn("XXX");
149
							}
150
							countNot++;
151
							System.out.println("   False:" + countNot + ": " + word);
152
							unrekognizedStrings.add(word);
153
							countSkip = 0;
154
						}else{
155
							if (word.equals("Netherlands")){
156
								if ( countSkip < 0 && words[i + 1].startsWith("Antilles")){
157
									word = "Netherlands Antilles";
158
									countSkip=2;
159
								}
160
							}
161
							PresenceAbsenceTerm term = PresenceAbsenceTerm.PRESENT();
162
							if (isDoubleArea(word)){
163
								NamedArea[] doubleArea = getDoubleArea(word);
164
								for (NamedArea area : doubleArea){
165
									Distribution distr = Distribution.NewInstance(area, term);
166
									desc.addElement(distr);
167
								}
168
							}else{
169
								NamedArea area;
170
								if (TdwgAreaProvider.isTdwgAreaLabel(word)){
171
									area = TdwgAreaProvider.getAreaByTdwgLabel(word);
172
								}else{
173
									area = TdwgAreaProvider.getAreaByTdwgAbbreviation(word);
174
								}
175
								if (isDoubtful){
176
									term = PresenceAbsenceTerm.INTRODUCED_PRESENCE_QUESTIONABLE();
177
								}
178
								Distribution distr = Distribution.NewInstance(area, term);
179
								desc.addElement(distr);
180
							}
181
							countYes++;
182
							System.out.println("      True:" + countYes + ": " + word);
183
							countSkip--;
184
						}
185
					}
186
				}
187
				i++;
188
			}
189
		}
190
	}
191
	
192
	private boolean isDoubleArea(String word){
193
		if ("Canary and Madeira Is.".equalsIgnoreCase(word) || 
194
				"southern Europe".equalsIgnoreCase(word) ||
195
				"former USSR: North and Central European territory".equalsIgnoreCase(word)
196
				){
197
			return true;
198
		}else{
199
			return false;
200
		}
201
	}
202
	
203
	private NamedArea[] getDoubleArea(String word){
204
		NamedArea[] result = new NamedArea[2];
205
		if ("Canary and Madeira Is.".equalsIgnoreCase(word)){
206
			 result[0] = TdwgAreaProvider.getAreaByTdwgAbbreviation("CNY");
207
			 result[1] = TdwgAreaProvider.getAreaByTdwgAbbreviation("MDR");
208
		}else if ("southern Europe".equalsIgnoreCase(word)){
209
			 result[0] = TdwgAreaProvider.getAreaByTdwgAbbreviation("12");
210
			 result[1] = TdwgAreaProvider.getAreaByTdwgAbbreviation("13");
211
		}else if ("former USSR: North and Central European territory".equalsIgnoreCase(word)){
212
			 result[0] = TdwgAreaProvider.getAreaByTdwgAbbreviation("RUN-OO");
213
			 result[1] = TdwgAreaProvider.getAreaByTdwgAbbreviation("RUC-OO");
214
		}else{
215
			logger.warn("Double area not recognized");
216
		}
217
		return result;
218
	}
219
	
220
	
221
	static List<String> stopWords = new ArrayList<String>();
222
	static List<String> unknownAreas = new ArrayList<String>();
223
	static List<String> higherAreas = new ArrayList<String>();
224
	
225
	private String adaptWordsToTdwg(String word){
226
		word = word.replace(",", "").replace(";", "");
227
		if (! word.contains("U.S.A")){
228
			word = word.replace(",", "").replace(".", "").replace(";", "");
229
		}else{
230
			word = word.replace(",", "").replace(";", "");
231
		}
232
		
233
		word = word.trim();
234
		if (word.endsWith("Is")){
235
			word = word + ".";
236
		}
237
		if (stopWords.size() == 0){
238
			initStopWords();
239
		}
240
		
241
		word = word.replace("Russia [North European territory]", "North European Russia");
242
		word = word.replace("Russia North European territory", "North European Russia");
243
		word = word.replace("Russia: North European territory", "North European Russia");
244
		word = word.replace("Russia: North European territory", "North European Russia");
245
				
246
		word = word.replace("Amber", "amber");
247
		
248
		
249
		word = word.replace("Prince Edward Is.", "Marion-Prince Edward Is.");
250
		//or word = word.replace("Prince Edward Is.", "Prince Edward I.");
251
		word = word.replace("Bahama Is.", "Bahamas");
252
		word = word.replace("Comores Is.", "Comoros");
253
		word = word.replace("former Yugoslavia", "Yugoslavia");
254
		word = word.replace("former Czechoslovakia", "Czechoslovakia");
255
		word = word.replace("Rhodesia", "Zimbabwe");
256
		word = word.replace("The Gambia", "Gambia, The");
257

  
258
		if (!word.contains("El Salvador")){
259
			word = word.replace("Salvador", "El Salvador");	
260
		}
261
		word = word.replace("Vera Cruz", "Veracruz");
262
		word = word.replace("Turkmenia", "Turkmenistan");
263
		word = word.replace("Qu\u00E9beck", "Qu\u00E9bec");
264
		word = word.replace("Quebeck", "Qu\u00E9bec");
265
		word = word.replace("Quebec", "Qu\u00E9bec");
266
		
267
		if (!word.contains("Gambia, The")){
268
			word = word.replace("Gambia", "Gambia, The");
269
		}
270
		word = word.replace("Mariana Is.", "Marianas");
271
		word = word.replace("Kenia", "Kenya");
272
		word = word.replace("Central Africa", "Central African Republic");
273
		word = word.replace("Canal Zone", "");
274
		//word = word.replace("Panama", "Panamá");
275
		word = word.replace("Panama", "Panam\u00E1");
276
		if (! word.contains("New South Wales")){
277
			word = word.replace("Wales", "Great Britain");
278
		}
279
		word = word.replace("Java", "Jawa");
280
		word = word.replace("former USSR: North European territory", "North European Russia");
281
		word = word.replace("former USSR: South European territory", "South European Russia");
282
		word = word.replace("former USSR: Soviet Middle Asia", "Middle Asia");
283
		
284
		word = word.replace("St Kitts-Nevis", "St.Kitts-Nevis");
285
		
286
		word = word.replace("oceanian islands", "Pacific");
287
		word = word.replace("Ussuri region", "Primorye");
288
		word = word.replace("Galapagos Is.", "Gal\u00E1pagos");
289
		word = word.replace("Tarapac\u00E1", "Tarapaca");
290
		word = word.replace("Reunion", "R\u00E9union");
291
		if (! word.contains("Is.")){
292
			word = word.replace("Galapagos", "Gal\u00E1pagos");
293
		}
294
		
295
		//word = word.replace("Galapagos Is.", "Galápagos");
296
		if (! word.contains("Peninsular")){
297
			word = word.replace("Malaysia", "Peninsular Malaysia");
298
		}
299
		word = word.replace("Polynesic Is.", "South Solomons");
300
		
301
		word = word.replace("Usbek SSR", "Uzbekistan");
302
		word = word.replace("Mexican amber", "Mexico");
303
		word = word.replace("Marocco", "Morocco");
304
		if (! word.contains("Tobago")){
305
			word = word.replace("Trinidad", "Trinidad-Tobago");
306
		}
307
		if (! word.contains("Trinidad")){
308
			word = word.replace("Tobago", "Trinidad-Tobago");
309
		}
310
		word = word.replace("Haiti", "Haiti");  
311
		word = word.replace("Moluccas", "Maluku");
312
		word = word.replace("Belau", "Palau");
313
		word = word.replace("Dominican amber", "Dominican Republic");
314
		if (! word.contains("Russian")){
315
			word = word.replace("Far East", "Russian Far East");
316
		}
317
		word = word.replace("Tahiti", "Society Is.");
318
		word = word.replace("Iraque", "Iraq");
319
		word = word.replace("Wake Island", "Wake I.");
320
		if (! word.contains("I.")){
321
			word = word.replace("Johnston I", "Johnston I.");
322
			word = word.replace("Wake I", "Wake I.");
323
			word = word.replace("Clipperton I", "Clipperton I.");
324
		}
325
		if (! word.contains("Provinces")){
326
			word = word.replace("Cape Province", "Cape Provinces");
327
		}
328
		word = word.replace("Eastern Cape Provinces", "Eastern Cape Province");
329
		word = word.replace("Western Cape Provinces", "Western Cape Province");
330
		if (! word.contains("Barbuda")){
331
			word = word.replace("Antigua", "Antigua-Barbuda");
332
		}
333
		if (! word.contains("St.")){
334
			word = word.replace("St Vincent", "St.Vincent");
335
			word = word.replace("St Lucia", "St.Lucia");
336
			word = word.replace("St Helena", "St.Helena");
337
		}
338
		word = word.replace("Asia-tropical", "Asia-Tropical");
339
		word = word.replace("Society Islands", "Society Is.");
340
		word = word.replace("Virgin Islands", "Virgin Is.");
341
		word = word.replace("Canary Islands", "Canary Is.");
342
		word = word.replace("Rhode Island", "Rhode I.");
343
		
344
		
345
		word = word.replace("Rodriguez", "Rodrigues");
346
		word = word.replace("British Colombia", "British Columbia");
347
		word = word.replace("Bermudas", "Bermuda");
348
		word = word.replace("Tunesia", "Tunisia");
349
		word = word.replace("Santos S\u00E3o Paulo", "S\u00E3o Paulo");
350
		word = word.replace("Transvaal", "Northern Provinces");
351
		word = word.replace("Tucum\u00E1n", "Tucuman");
352
//		if (!word.contains("Netherlands")){
353
//			
354
//		}
355
		
356
//		unknownAreas.add("Baltic amber");  
357
//		unknownAreas.add("Arabia"); 
358
						
359
		for (String stopWord : stopWords){
360
			if (stopWord.equals(word)){
361
				System.out.println("         STOP: " + word);
362
				return "";
363
			}
364
		}
365
		for (String unknownArea : unknownAreas){
366
			if (unknownArea.equals(word)){
367
				System.out.println("         UNKNOWN: " + word);
368
				return "";
369
			}
370
		}
371
		for (String higherArea : higherAreas){
372
			if (higherArea.equals(word)){
373
				return "";
374
			}
375
		}
376
		
377
		//higher regions
378
		
379
		return word;
380
	}
381
	
382
	private void initStopWords(){
383
		stopWords.add("and");
384
		stopWords.add("Is");
385
		stopWords.add("Is.");
386
		stopWords.add("Islands");
387
		stopWords.add("Island");
388
		
389
		stopWords.add("of");
390
		stopWords.add("areas");
391
		stopWords.add("USA");
392
		stopWords.add("Australia"); //except for Australia only
393
		stopWords.add("Argentina");		
394

  
395
		//unknownAreas.add("Panama");
396
		unknownAreas.add("South Africa");
397
		unknownAreas.add("Chile");
398

  
399
		unknownAreas.add("Baltic amber");  
400
		unknownAreas.add("Arabia"); 
401

  
402
			
403
		higherAreas.add("AF");
404
		higherAreas.add("OR");
405
		higherAreas.add("PA");
406
		higherAreas.add("AU");
407
		higherAreas.add("NE");
408
		
409
		higherAreas.add("NT");
410
	}
411

  
412
	
413
	/**
414
	 * @param args
415
	 */
416
	public static void main(String[] args) {
417
		CdmApplicationController app = null;
418
		DbSchemaValidation val = DbSchemaValidation.UPDATE;
419
		app = CdmApplicationController.NewInstance(cdmDestination, val);
420
		
421
		DipteraDistributionParser dipDist = new DipteraDistributionParser();
422
		if (app != null){
423
			dipDist.doDistribution(app);
424
		}else{
425
			logger.warn("No Application Context");
426
		}
427
	}
428
}
1
/**
2
* Copyright (C) 2007 EDIT
3
* European Distributed Institute of Taxonomy
4
* http://www.e-taxonomy.eu
5
*
6
* The contents of this file are subject to the Mozilla Public License Version 1.1
7
* See LICENSE.TXT at the top of this package for the full license terms.
8
*/
9

  
10
/**
11
* Copyright (C) 2007 EDIT
12
* European Distributed Institute of Taxonomy
13
* http://www.e-taxonomy.eu
14
*
15
* The contents of this file are subject to the Mozilla Public License Version 1.1
16
* See LICENSE.TXT at the top of this package for the full license terms.
17
*/
18
package eu.etaxonomy.cdm.app.wp6.diptera;
19

  
20
import java.util.ArrayList;
21
import java.util.HashSet;
22
import java.util.List;
23
import java.util.Set;
24
import java.util.regex.Pattern;
25

  
26
import org.apache.log4j.Logger;
27
import org.springframework.transaction.TransactionStatus;
28

  
29
import eu.etaxonomy.cdm.api.application.CdmApplicationController;
30
import eu.etaxonomy.cdm.api.application.ICdmApplicationConfiguration;
31
import eu.etaxonomy.cdm.app.common.CdmDestinations;
32
import eu.etaxonomy.cdm.database.DbSchemaValidation;
33
import eu.etaxonomy.cdm.database.ICdmDataSource;
34
import eu.etaxonomy.cdm.io.common.TdwgAreaProvider;
35
import eu.etaxonomy.cdm.model.common.Language;
36
import eu.etaxonomy.cdm.model.description.DescriptionBase;
37
import eu.etaxonomy.cdm.model.description.DescriptionElementBase;
38
import eu.etaxonomy.cdm.model.description.Distribution;
39
import eu.etaxonomy.cdm.model.description.Feature;
40
import eu.etaxonomy.cdm.model.description.PresenceAbsenceTerm;
41
import eu.etaxonomy.cdm.model.description.TaxonDescription;
42
import eu.etaxonomy.cdm.model.description.TextData;
43
import eu.etaxonomy.cdm.model.location.NamedArea;
44
import eu.etaxonomy.cdm.model.taxon.Taxon;
45
import eu.etaxonomy.cdm.model.taxon.TaxonBase;
46

  
47
/**
48
 * @author a.mueller
49
 * @created 17.10.2008
50
 * @version 1.0
51
 */
52
public class DipteraDistributionParser {
53
	private static final Logger logger = Logger.getLogger(DipteraDistributionParser.class);
54

  
55
	private static ICdmDataSource cdmDestination = CdmDestinations.localH2();
56

  
57
	final static String epiSplitter = "(\\s+|\\[|\\]|\\(|\\))"; //( ' '+| '(' | ')'| '[' | ']' )
58
	static Pattern pattern = null;
59

  
60
	protected void doDistribution(ICdmApplicationConfiguration app){
61
		pattern = Pattern.compile(epiSplitter);
62
	    TransactionStatus txStatus = app.startTransaction();
63
		List<TaxonBase> taxa = app.getTaxonService().list(null, null, null, null, null);
64
		for (TaxonBase taxon: taxa ){
65
			if (taxon instanceof Taxon){
66
		//		unlazyDescription(app, (Taxon)taxon);
67
				Set<TaxonDescription> descriptions = ((Taxon) taxon).getDescriptions();
68
				for (DescriptionBase description: descriptions){
69
					Set<DescriptionElementBase> descElements = new HashSet<DescriptionElementBase>();
70
					descElements.addAll(description.getElements());
71

  
72
					for (DescriptionElementBase descEl: descElements){
73
						if (descEl.getFeature().equals(Feature.OCCURRENCE())){
74
							if (descEl instanceof TextData){
75
								String occString = ((TextData)descEl).getText(Language.ENGLISH());
76
								parseOccurenceString(occString, description);
77
								//app.getTaxonService().saveTaxon(taxon);
78
							}
79
						}
80
					}
81
				}
82
			}
83
		}
84
		System.out.println("Unknowns: ");
85
		for (String unknown: unrekognizedStrings){
86
			System.out.println(unknown);
87
		}
88
		System.out.println("Distributions not recognized: " + countNot);
89
		System.out.println("Distributions created: " + countYes);
90
		app.commitTransaction(txStatus);
91
	}
92

  
93
	static Set<String> unrekognizedStrings = new HashSet<String>();
94
	static int countNot = 0;
95
	static int countYes = 0;
96

  
97
	private void parseOccurenceString(String occString, DescriptionBase desc){
98
		System.out.println(occString);
99
		if (occString != null){
100
			String[] words = pattern.split(occString);
101
			int i = 0;
102
			int countSkip = 0;
103
			for (String word: words){
104
				if (word.contains("U.S.A")){
105
					logger.warn("U.S.A.");
106
				}
107
				boolean isDoubtful = false;
108
				if (countSkip > 0){
109
					countSkip--;
110
				}else if(word.trim().length() == 0){
111
					//skip
112
				}else{
113
					if (word.endsWith(":") && word.length()<=4){
114
						//Higher area
115
						//TODO
116
					}else{
117
						word = word.trim();
118
						if (word.contains("?")){
119
							isDoubtful = true;
120
							word = word.replace("?", "");
121
						}
122
						word = adaptWordsToTdwg(word);
123

  
124
						if (! "".equals(word) && ! TdwgAreaProvider.isTdwgAreaLabel(word) && ! TdwgAreaProvider.isTdwgAreaAbbreviation(word) && ! isDoubleArea(word)){
125
							for (countSkip = 1; countSkip <= 6; countSkip++){
126
								word = word.trim();
127
								if (! TdwgAreaProvider.isTdwgAreaLabel(word) && ! TdwgAreaProvider.isTdwgAreaAbbreviation(word) && ! isDoubleArea(word)){
128
									if (words.length > i + countSkip){
129
										word = word + " " + words[i + countSkip];
130
									}
131
									if (word.contains("?")){
132
										isDoubtful = true;
133
										word = word.replace("?", "");
134
									}
135
									word = adaptWordsToTdwg(word);
136
									if ("".equals(word)){
137
										break;
138
									}
139
								}else{
140
									break;
141
								}
142
							}
143
						}
144
						if ("".equals(word)){
145
							//countSkip = countSkip;
146
						}else if (! TdwgAreaProvider.isTdwgAreaLabel(word)  && ! TdwgAreaProvider.isTdwgAreaAbbreviation(word) &&  ! isDoubleArea(word)  ){
147
							if (word.contains("?")){
148
								logger.warn("XXX");
149
							}
150
							countNot++;
151
							System.out.println("   False:" + countNot + ": " + word);
152
							unrekognizedStrings.add(word);
153
							countSkip = 0;
154
						}else{
155
							if (word.equals("Netherlands")){
156
								if ( countSkip < 0 && words[i + 1].startsWith("Antilles")){
157
									word = "Netherlands Antilles";
158
									countSkip=2;
159
								}
160
							}
161
							PresenceAbsenceTerm term = PresenceAbsenceTerm.PRESENT();
162
							if (isDoubleArea(word)){
163
								NamedArea[] doubleArea = getDoubleArea(word);
164
								for (NamedArea area : doubleArea){
165
									Distribution distr = Distribution.NewInstance(area, term);
166
									desc.addElement(distr);
167
								}
168
							}else{
169
								NamedArea area;
170
								if (TdwgAreaProvider.isTdwgAreaLabel(word)){
171
									area = TdwgAreaProvider.getAreaByTdwgLabel(word);
172
								}else{
173
									area = TdwgAreaProvider.getAreaByTdwgAbbreviation(word);
174
								}
175
								if (isDoubtful){
176
									term = PresenceAbsenceTerm.INTRODUCED_PRESENCE_QUESTIONABLE();
177
								}
178
								Distribution distr = Distribution.NewInstance(area, term);
179
								desc.addElement(distr);
180
							}
181
							countYes++;
182
							System.out.println("      True:" + countYes + ": " + word);
183
							countSkip--;
184
						}
185
					}
186
				}
187
				i++;
188
			}
189
		}
190
	}
191

  
192
	private boolean isDoubleArea(String word){
193
		if ("Canary and Madeira Is.".equalsIgnoreCase(word) ||
194
				"southern Europe".equalsIgnoreCase(word) ||
195
				"former USSR: North and Central European territory".equalsIgnoreCase(word)
196
				){
197
			return true;
198
		}else{
199
			return false;
200
		}
201
	}
202

  
203
	private NamedArea[] getDoubleArea(String word){
204
		NamedArea[] result = new NamedArea[2];
205
		if ("Canary and Madeira Is.".equalsIgnoreCase(word)){
206
			 result[0] = TdwgAreaProvider.getAreaByTdwgAbbreviation("CNY");
207
			 result[1] = TdwgAreaProvider.getAreaByTdwgAbbreviation("MDR");
208
		}else if ("southern Europe".equalsIgnoreCase(word)){
209
			 result[0] = TdwgAreaProvider.getAreaByTdwgAbbreviation("12");
210
			 result[1] = TdwgAreaProvider.getAreaByTdwgAbbreviation("13");
211
		}else if ("former USSR: North and Central European territory".equalsIgnoreCase(word)){
212
			 result[0] = TdwgAreaProvider.getAreaByTdwgAbbreviation("RUN-OO");
213
			 result[1] = TdwgAreaProvider.getAreaByTdwgAbbreviation("RUC-OO");
214
		}else{
215
			logger.warn("Double area not recognized");
216
		}
217
		return result;
218
	}
219

  
220

  
221
	static List<String> stopWords = new ArrayList<String>();
222
	static List<String> unknownAreas = new ArrayList<String>();
223
	static List<String> higherAreas = new ArrayList<String>();
224

  
225
	private String adaptWordsToTdwg(String word){
226
		word = word.replace(",", "").replace(";", "");
227
		if (! word.contains("U.S.A")){
228
			word = word.replace(",", "").replace(".", "").replace(";", "");
229
		}else{
230
			word = word.replace(",", "").replace(";", "");
231
		}
232

  
233
		word = word.trim();
234
		if (word.endsWith("Is")){
235
			word = word + ".";
236
		}
237
		if (stopWords.size() == 0){
238
			initStopWords();
239
		}
240

  
241
		word = word.replace("Russia [North European territory]", "North European Russia");
242
		word = word.replace("Russia North European territory", "North European Russia");
243
		word = word.replace("Russia: North European territory", "North European Russia");
244
		word = word.replace("Russia: North European territory", "North European Russia");
245

  
246
		word = word.replace("Amber", "amber");
247

  
248

  
249
		word = word.replace("Prince Edward Is.", "Marion-Prince Edward Is.");
250
		//or word = word.replace("Prince Edward Is.", "Prince Edward I.");
251
		word = word.replace("Bahama Is.", "Bahamas");
252
		word = word.replace("Comores Is.", "Comoros");
253
		word = word.replace("former Yugoslavia", "Yugoslavia");
254
		word = word.replace("former Czechoslovakia", "Czechoslovakia");
255
		word = word.replace("Rhodesia", "Zimbabwe");
256
		word = word.replace("The Gambia", "Gambia, The");
257

  
258
		if (!word.contains("El Salvador")){
259
			word = word.replace("Salvador", "El Salvador");
260
		}
261
		word = word.replace("Vera Cruz", "Veracruz");
262
		word = word.replace("Turkmenia", "Turkmenistan");
263
		word = word.replace("Qu\u00E9beck", "Qu\u00E9bec");
264
		word = word.replace("Quebeck", "Qu\u00E9bec");
265
		word = word.replace("Quebec", "Qu\u00E9bec");
266

  
267
		if (!word.contains("Gambia, The")){
268
			word = word.replace("Gambia", "Gambia, The");
269
		}
270
		word = word.replace("Mariana Is.", "Marianas");
271
		word = word.replace("Kenia", "Kenya");
272
		word = word.replace("Central Africa", "Central African Republic");
273
		word = word.replace("Canal Zone", "");
274
		//word = word.replace("Panama", "Panamá");
275
		word = word.replace("Panama", "Panam\u00E1");
276
		if (! word.contains("New South Wales")){
277
			word = word.replace("Wales", "Great Britain");
278
		}
279
		word = word.replace("Java", "Jawa");
280
		word = word.replace("former USSR: North European territory", "North European Russia");
281
		word = word.replace("former USSR: South European territory", "South European Russia");
282
		word = word.replace("former USSR: Soviet Middle Asia", "Middle Asia");
283

  
284
		word = word.replace("St Kitts-Nevis", "St.Kitts-Nevis");
285

  
286
		word = word.replace("oceanian islands", "Pacific");
287
		word = word.replace("Ussuri region", "Primorye");
288
		word = word.replace("Galapagos Is.", "Gal\u00E1pagos");
289
		word = word.replace("Tarapac\u00E1", "Tarapaca");
290
		word = word.replace("Reunion", "R\u00E9union");
291
		if (! word.contains("Is.")){
292
			word = word.replace("Galapagos", "Gal\u00E1pagos");
293
		}
294

  
295
		//word = word.replace("Galapagos Is.", "Galápagos");
296
		if (! word.contains("Peninsular")){
297
			word = word.replace("Malaysia", "Peninsular Malaysia");
298
		}
299
		word = word.replace("Polynesic Is.", "South Solomons");
300

  
301
		word = word.replace("Usbek SSR", "Uzbekistan");
302
		word = word.replace("Mexican amber", "Mexico");
303
		word = word.replace("Marocco", "Morocco");
304
		if (! word.contains("Tobago")){
305
			word = word.replace("Trinidad", "Trinidad-Tobago");
306
		}
307
		if (! word.contains("Trinidad")){
308
			word = word.replace("Tobago", "Trinidad-Tobago");
309
		}
310
		word = word.replace("Haiti", "Haiti");
311
		word = word.replace("Moluccas", "Maluku");
312
		word = word.replace("Belau", "Palau");
313
		word = word.replace("Dominican amber", "Dominican Republic");
314
		if (! word.contains("Russian")){
315
			word = word.replace("Far East", "Russian Far East");
316
		}
317
		word = word.replace("Tahiti", "Society Is.");
318
		word = word.replace("Iraque", "Iraq");
319
		word = word.replace("Wake Island", "Wake I.");
320
		if (! word.contains("I.")){
321
			word = word.replace("Johnston I", "Johnston I.");
322
			word = word.replace("Wake I", "Wake I.");
323
			word = word.replace("Clipperton I", "Clipperton I.");
324
		}
325
		if (! word.contains("Provinces")){
326
			word = word.replace("Cape Province", "Cape Provinces");
327
		}
328
		word = word.replace("Eastern Cape Provinces", "Eastern Cape Province");
329
		word = word.replace("Western Cape Provinces", "Western Cape Province");
330
		if (! word.contains("Barbuda")){
331
			word = word.replace("Antigua", "Antigua-Barbuda");
332
		}
333
		if (! word.contains("St.")){
334
			word = word.replace("St Vincent", "St.Vincent");
335
			word = word.replace("St Lucia", "St.Lucia");
336
			word = word.replace("St Helena", "St.Helena");
337
		}
338
		word = word.replace("Asia-tropical", "Asia-Tropical");
339
		word = word.replace("Society Islands", "Society Is.");
340
		word = word.replace("Virgin Islands", "Virgin Is.");
341
		word = word.replace("Canary Islands", "Canary Is.");
342
		word = word.replace("Rhode Island", "Rhode I.");
343

  
344

  
345
		word = word.replace("Rodriguez", "Rodrigues");
346
		word = word.replace("British Colombia", "British Columbia");
347
		word = word.replace("Bermudas", "Bermuda");
348
		word = word.replace("Tunesia", "Tunisia");
349
		word = word.replace("Santos S\u00E3o Paulo", "S\u00E3o Paulo");
350
		word = word.replace("Transvaal", "Northern Provinces");
351
		word = word.replace("Tucum\u00E1n", "Tucuman");
352
//		if (!word.contains("Netherlands")){
353
//
354
//		}
355

  
356
//		unknownAreas.add("Baltic amber");
357
//		unknownAreas.add("Arabia");
358

  
359
		for (String stopWord : stopWords){
360
			if (stopWord.equals(word)){
361
				System.out.println("         STOP: " + word);
362
				return "";
363
			}
364
		}
365
		for (String unknownArea : unknownAreas){
366
			if (unknownArea.equals(word)){
367
				System.out.println("         UNKNOWN: " + word);
368
				return "";
369
			}
370
		}
371
		for (String higherArea : higherAreas){
372
			if (higherArea.equals(word)){
373
				return "";
374
			}
375
		}
376

  
377
		//higher regions
378

  
379
		return word;
380
	}
381

  
382
	private void initStopWords(){
383
		stopWords.add("and");
384
		stopWords.add("Is");
385
		stopWords.add("Is.");
386
		stopWords.add("Islands");
387
		stopWords.add("Island");
388

  
389
		stopWords.add("of");
390
		stopWords.add("areas");
391
		stopWords.add("USA");
392
		stopWords.add("Australia"); //except for Australia only
393
		stopWords.add("Argentina");
394

  
395
		//unknownAreas.add("Panama");
396
		unknownAreas.add("South Africa");
397
		unknownAreas.add("Chile");
398

  
399
		unknownAreas.add("Baltic amber");
400
		unknownAreas.add("Arabia");
401

  
402

  
403
		higherAreas.add("AF");
404
		higherAreas.add("OR");
405
		higherAreas.add("PA");
406
		higherAreas.add("AU");
407
		higherAreas.add("NE");
408

  
409
		higherAreas.add("NT");
410
	}
411

  
412

  
413
	/**
414
	 * @param args
415
	 */
416
	public static void main(String[] args) {
417
		CdmApplicationController app = null;
418
		DbSchemaValidation val = DbSchemaValidation.UPDATE;
419
		app = CdmApplicationController.NewInstance(cdmDestination, val);
420

  
421
		DipteraDistributionParser dipDist = new DipteraDistributionParser();
422
		if (app != null){
423
			dipDist.doDistribution(app);
424
		}else{
425
			logger.warn("No Application Context");
426
		}
427
	}
428
}

Also available in: Unified diff