Revision 275ada95
Added by Andreas Müller almost 9 years ago
app-import/src/main/java/eu/etaxonomy/cdm/app/wp6/diptera/DipteraCollectionImport.java | ||
---|---|---|
1 |
// $Id$ |
|
2 |
/** |
|
3 |
* Copyright (C) 2007 EDIT |
|
4 |
* European Distributed Institute of Taxonomy |
|
5 |
* http://www.e-taxonomy.eu |
|
6 |
* |
|
7 |
* The contents of this file are subject to the Mozilla Public License Version 1.1 |
|
8 |
* See LICENSE.TXT at the top of this package for the full license terms. |
|
9 |
*/ |
|
10 |
package eu.etaxonomy.cdm.app.wp6.diptera; |
|
11 |
|
|
12 |
import java.io.File; |
|
13 |
import java.io.FileInputStream; |
|
14 |
import java.io.InputStream; |
|
15 |
import java.io.InputStreamReader; |
|
16 |
import java.util.ArrayList; |
|
17 |
import java.util.HashMap; |
|
18 |
import java.util.List; |
|
19 |
import java.util.Map; |
|
20 |
|
|
21 |
import org.apache.commons.lang.StringUtils; |
|
22 |
import org.apache.log4j.Logger; |
|
23 |
import org.springframework.transaction.TransactionStatus; |
|
24 |
|
|
25 |
import au.com.bytecode.opencsv.CSVReader; |
|
26 |
import eu.etaxonomy.cdm.api.application.CdmApplicationController; |
|
27 |
import eu.etaxonomy.cdm.app.common.CdmDestinations; |
|
28 |
import eu.etaxonomy.cdm.common.CdmUtils; |
|
29 |
import eu.etaxonomy.cdm.database.DbSchemaValidation; |
|
30 |
import eu.etaxonomy.cdm.database.ICdmDataSource; |
|
31 |
import eu.etaxonomy.cdm.model.agent.Institution; |
|
32 |
import eu.etaxonomy.cdm.model.occurrence.Collection; |
|
33 |
import eu.etaxonomy.cdm.model.occurrence.DerivedUnit; |
|
34 |
import eu.etaxonomy.cdm.model.occurrence.SpecimenOrObservationBase; |
|
35 |
|
|
36 |
/** |
|
37 |
* @author a.mueller |
|
38 |
* @date 07.04.2010 |
|
39 |
* |
|
40 |
*/ |
|
41 |
public class DipteraCollectionImport { |
|
42 |
private static final Logger logger = Logger.getLogger(DipteraCollectionImport.class); |
|
43 |
|
|
44 |
public static final File acronymsFile = new File("src/main/resources/collections/Acronyms.tab"); |
|
45 |
//datasource for use from local main() |
|
46 |
static final ICdmDataSource cdmDestination = CdmDestinations.cdm_local_diptera(); |
|
47 |
|
|
48 |
|
|
49 |
public boolean invoke(ICdmDataSource dataSource) { |
|
50 |
CdmApplicationController cdmApp = CdmApplicationController.NewInstance(dataSource, DbSchemaValidation.VALIDATE); |
|
51 |
|
|
52 |
//create collections |
|
53 |
TransactionStatus tx = cdmApp.startTransaction(); |
|
54 |
Map<String, Collection> colletionMap = createCollections(cdmApp); |
|
55 |
|
|
56 |
//add collections to specimen |
|
57 |
addCollectionsToSpecimen(cdmApp, colletionMap); |
|
58 |
cdmApp.commitTransaction(tx); |
|
59 |
|
|
60 |
return true; |
|
61 |
|
|
62 |
} |
|
63 |
|
|
64 |
|
|
65 |
/** |
|
66 |
* @param cdmApp |
|
67 |
* @param colletionMap |
|
68 |
*/ |
|
69 |
private void addCollectionsToSpecimen(CdmApplicationController cdmApp, Map<String, Collection> colletionMap) { |
|
70 |
List<DerivedUnit> specimens = cdmApp.getOccurrenceService().list(DerivedUnit.class, null, null, null, null); |
|
71 |
for (SpecimenOrObservationBase<?> specOrObservBase : specimens){ |
|
72 |
if (specOrObservBase.getRecordBasis().isPreservedSpecimen()){ |
|
73 |
handleSingleSpecimen((DerivedUnit)specOrObservBase, colletionMap); |
|
74 |
}else{ |
|
75 |
logger.warn("There are specimenOrObservationBase objects which are not of class Specimen. This is probably an error."); |
|
76 |
} |
|
77 |
} |
|
78 |
List<SpecimenOrObservationBase> specimenList = new ArrayList<SpecimenOrObservationBase>(specimens); |
|
79 |
cdmApp.getOccurrenceService().save(specimenList); |
|
80 |
} |
|
81 |
|
|
82 |
|
|
83 |
/** |
|
84 |
* @param specimen |
|
85 |
* @param colletionMap |
|
86 |
*/ |
|
87 |
private void handleSingleSpecimen(DerivedUnit specimen, Map<String, Collection> collectionMap) { |
|
88 |
String titleCache = specimen.getTitleCache(); |
|
89 |
String collectionCode = getCollectionCode(titleCache); |
|
90 |
if (StringUtils.isBlank(collectionCode)){ |
|
91 |
logger.warn("Collection code is empty for: " + titleCache); |
|
92 |
}else{ |
|
93 |
Collection collection = collectionMap.get(collectionCode); |
|
94 |
if (collection != null){ |
|
95 |
specimen.setCollection(collection); |
|
96 |
}else{ |
|
97 |
logger.warn("Collection not found for code: " + collectionCode + "; titleCache: " + titleCache); |
|
98 |
} |
|
99 |
} |
|
100 |
} |
|
101 |
|
|
102 |
|
|
103 |
/** |
|
104 |
* @param titleCache |
|
105 |
* @return |
|
106 |
*/ |
|
107 |
private String getCollectionCode(String titleCache) { |
|
108 |
String result = titleCache.trim(); |
|
109 |
result = replaceBracket(result); |
|
110 |
result = replaceLastFullStop(result); |
|
111 |
result = replaceLastQuestionMark(result); |
|
112 |
result = parseLastUpperCase(result); |
|
113 |
return result; |
|
114 |
} |
|
115 |
|
|
116 |
|
|
117 |
/** |
|
118 |
* @param result |
|
119 |
* @return |
|
120 |
*/ |
|
121 |
private String parseLastUpperCase(String string) { |
|
122 |
String result = ""; |
|
123 |
String tmpString = string; |
|
124 |
int pos = tmpString.lastIndexOf(" "); |
|
125 |
if (pos>-1){ |
|
126 |
tmpString = tmpString.substring(pos+1); |
|
127 |
} |
|
128 |
while (tmpString.length() > 0){ |
|
129 |
int len = tmpString.length(); |
|
130 |
char lastChar = tmpString.charAt(len-1); |
|
131 |
if (Character.isUpperCase( lastChar)){ |
|
132 |
result = lastChar + result; |
|
133 |
}else{ |
|
134 |
if (result.length() > 0){ |
|
135 |
logger.warn("Collection code is not space separated: " + string); |
|
136 |
} |
|
137 |
break; |
|
138 |
} |
|
139 |
//remove last character |
|
140 |
tmpString = tmpString.substring(0, tmpString.length()-1); |
|
141 |
} |
|
142 |
return result; |
|
143 |
} |
|
144 |
|
|
145 |
|
|
146 |
|
|
147 |
/** |
|
148 |
* @param result |
|
149 |
* @return |
|
150 |
*/ |
|
151 |
private String replaceLastQuestionMark(String string) { |
|
152 |
if (string.endsWith("?")){ |
|
153 |
string = string.substring(0,string.length()-1).trim(); |
|
154 |
} |
|
155 |
return string; |
|
156 |
} |
|
157 |
|
|
158 |
/** |
|
159 |
* @param result |
|
160 |
* @return |
|
161 |
*/ |
|
162 |
private String replaceLastFullStop(String string) { |
|
163 |
if (string.endsWith(".")){ |
|
164 |
string = string.substring(0,string.length()-1).trim(); |
|
165 |
} |
|
166 |
return string; |
|
167 |
} |
|
168 |
|
|
169 |
|
|
170 |
/** |
|
171 |
* @param result |
|
172 |
* @return |
|
173 |
*/ |
|
174 |
private String replaceBracket(String string) { |
|
175 |
if (string.endsWith("]")){ |
|
176 |
int pos = string.indexOf("["); |
|
177 |
if (pos >0){ |
|
178 |
string = string.substring(0, pos).trim(); |
|
179 |
}else{ |
|
180 |
logger.warn("Closing bracket has no opening bracket in: " + string); |
|
181 |
} |
|
182 |
} |
|
183 |
return string; |
|
184 |
} |
|
185 |
|
|
186 |
|
|
187 |
/** |
|
188 |
* @param cdmApp |
|
189 |
*/ |
|
190 |
private Map<String, Collection> createCollections(CdmApplicationController cdmApp) { |
|
191 |
Map<String, Collection> collectionMap = new HashMap<String, Collection>(); |
|
192 |
List<String[]> lines = getLines(); |
|
193 |
for (String[] line:lines){ |
|
194 |
Collection collection = makeLine(line); |
|
195 |
collectionMap.put(collection.getCode(), collection); |
|
196 |
} |
|
197 |
cdmApp.getCollectionService().save(collectionMap.values()); |
|
198 |
// for (Collection collection: collectionMap.values()){ |
|
199 |
// System.out.println(collection.getTitleCache()); |
|
200 |
// } |
|
201 |
return collectionMap; |
|
202 |
} |
|
203 |
|
|
204 |
|
|
205 |
private Collection makeLine(String[] line) { |
|
206 |
String code = line[0]; |
|
207 |
String instituteName = line[1]; |
|
208 |
String lowerInstitutionName = line[2]; |
|
209 |
String higherInstitutionName = line[3]; |
|
210 |
String location = line[4]; |
|
211 |
String country = line[5]; |
|
212 |
//create objects |
|
213 |
Collection collection = Collection.NewInstance(); |
|
214 |
collection.setCode(code); |
|
215 |
Institution institution = Institution.NewInstance(); |
|
216 |
institution.setCode(code); |
|
217 |
|
|
218 |
institution.setName(instituteName); |
|
219 |
|
|
220 |
if (StringUtils.isNotBlank(lowerInstitutionName)){ |
|
221 |
Institution lowerInstitution = Institution.NewInstance(); |
|
222 |
lowerInstitution.setName(lowerInstitutionName); |
|
223 |
lowerInstitution.setIsPartOf(institution); |
|
224 |
} |
|
225 |
|
|
226 |
if (StringUtils.isNotBlank(higherInstitutionName)){ |
|
227 |
Institution higherInstitution = Institution.NewInstance(); |
|
228 |
higherInstitution.setName(higherInstitutionName); |
|
229 |
institution.setIsPartOf(higherInstitution); |
|
230 |
} |
|
231 |
|
|
232 |
collection.setInstitute(institution); |
|
233 |
String locationAndCountry = CdmUtils.concat("/", location, country); |
|
234 |
collection.setTownOrLocation(locationAndCountry); |
|
235 |
|
|
236 |
String titleCache = CdmUtils.concat(", ", new String[]{instituteName, lowerInstitutionName, higherInstitutionName, location, country}); |
|
237 |
collection.setTitleCache(titleCache, true); |
|
238 |
|
|
239 |
return collection; |
|
240 |
} |
|
241 |
|
|
242 |
|
|
243 |
|
|
244 |
|
|
245 |
private List<String[]> getLines() { |
|
246 |
List<String[]> result = new ArrayList<String[]>(); |
|
247 |
|
|
248 |
try { |
|
249 |
InputStream inStream = new FileInputStream(acronymsFile); |
|
250 |
InputStreamReader inputStreamReader = new InputStreamReader(inStream, "UTF8"); |
|
251 |
CSVReader reader = new CSVReader(inputStreamReader, '\t'); |
|
252 |
String [] nextLine = reader.readNext(); |
|
253 |
|
|
254 |
|
|
255 |
while ((nextLine = reader.readNext()) != null) { |
|
256 |
if (nextLine.length == 0){ |
|
257 |
continue; |
|
258 |
} |
|
259 |
result.add(nextLine); |
|
260 |
} |
|
261 |
return result; |
|
262 |
} catch (Exception e) { |
|
263 |
logger.error(e + " " + e.getCause() + " " + e.getMessage()); |
|
264 |
for(StackTraceElement ste : e.getStackTrace()) { |
|
265 |
logger.error(ste); |
|
266 |
} |
|
267 |
throw new RuntimeException(e); |
|
268 |
} |
|
269 |
} |
|
270 |
|
|
271 |
|
|
272 |
|
|
273 |
|
|
274 |
|
|
275 |
/** |
|
276 |
* @param args |
|
277 |
*/ |
|
278 |
public static void main(String[] args) { |
|
279 |
try { |
|
280 |
DipteraCollectionImport collectionImport = new DipteraCollectionImport(); |
|
281 |
collectionImport.invoke(cdmDestination); |
|
282 |
// String titleCache = "Peru. Mouth of Rio Pachitea. ST 2R SMT. [fig. of male abdomen]"; |
|
283 |
// String collectionCode = collectionImport.getCollectionCode(titleCache); |
|
284 |
// System.out.println(collectionCode); |
|
285 |
} catch (Exception e) { |
|
286 |
e.printStackTrace(); |
|
287 |
System.exit(-1); |
|
288 |
} |
|
289 |
} |
|
290 |
|
|
291 |
} |
|
1 |
// $Id$ |
|
2 |
/** |
|
3 |
* Copyright (C) 2007 EDIT |
|
4 |
* European Distributed Institute of Taxonomy |
|
5 |
* http://www.e-taxonomy.eu |
|
6 |
* |
|
7 |
* The contents of this file are subject to the Mozilla Public License Version 1.1 |
|
8 |
* See LICENSE.TXT at the top of this package for the full license terms. |
|
9 |
*/ |
|
10 |
package eu.etaxonomy.cdm.app.wp6.diptera; |
|
11 |
|
|
12 |
import java.io.File; |
|
13 |
import java.io.FileInputStream; |
|
14 |
import java.io.InputStream; |
|
15 |
import java.io.InputStreamReader; |
|
16 |
import java.util.ArrayList; |
|
17 |
import java.util.HashMap; |
|
18 |
import java.util.List; |
|
19 |
import java.util.Map; |
|
20 |
|
|
21 |
import org.apache.commons.lang.StringUtils; |
|
22 |
import org.apache.log4j.Logger; |
|
23 |
import org.springframework.transaction.TransactionStatus; |
|
24 |
|
|
25 |
import au.com.bytecode.opencsv.CSVReader; |
|
26 |
import eu.etaxonomy.cdm.api.application.CdmApplicationController; |
|
27 |
import eu.etaxonomy.cdm.app.common.CdmDestinations; |
|
28 |
import eu.etaxonomy.cdm.common.CdmUtils; |
|
29 |
import eu.etaxonomy.cdm.database.DbSchemaValidation; |
|
30 |
import eu.etaxonomy.cdm.database.ICdmDataSource; |
|
31 |
import eu.etaxonomy.cdm.model.agent.Institution; |
|
32 |
import eu.etaxonomy.cdm.model.occurrence.Collection; |
|
33 |
import eu.etaxonomy.cdm.model.occurrence.DerivedUnit; |
|
34 |
import eu.etaxonomy.cdm.model.occurrence.SpecimenOrObservationBase; |
|
35 |
|
|
36 |
/** |
|
37 |
* @author a.mueller |
|
38 |
* @date 07.04.2010 |
|
39 |
* |
|
40 |
*/ |
|
41 |
public class DipteraCollectionImport { |
|
42 |
private static final Logger logger = Logger.getLogger(DipteraCollectionImport.class); |
|
43 |
|
|
44 |
public static final File acronymsFile = new File("src/main/resources/collections/Acronyms.tab"); |
|
45 |
//datasource for use from local main() |
|
46 |
static final ICdmDataSource cdmDestination = CdmDestinations.localH2(); |
|
47 |
|
|
48 |
|
|
49 |
public boolean invoke(ICdmDataSource dataSource) { |
|
50 |
CdmApplicationController cdmApp = CdmApplicationController.NewInstance(dataSource, DbSchemaValidation.VALIDATE); |
|
51 |
|
|
52 |
//create collections |
|
53 |
TransactionStatus tx = cdmApp.startTransaction(); |
|
54 |
Map<String, Collection> colletionMap = createCollections(cdmApp); |
|
55 |
|
|
56 |
//add collections to specimen |
|
57 |
addCollectionsToSpecimen(cdmApp, colletionMap); |
|
58 |
cdmApp.commitTransaction(tx); |
|
59 |
|
|
60 |
return true; |
|
61 |
|
|
62 |
} |
|
63 |
|
|
64 |
|
|
65 |
/** |
|
66 |
* @param cdmApp |
|
67 |
* @param colletionMap |
|
68 |
*/ |
|
69 |
private void addCollectionsToSpecimen(CdmApplicationController cdmApp, Map<String, Collection> colletionMap) { |
|
70 |
List<DerivedUnit> specimens = cdmApp.getOccurrenceService().list(DerivedUnit.class, null, null, null, null); |
|
71 |
for (SpecimenOrObservationBase<?> specOrObservBase : specimens){ |
|
72 |
if (specOrObservBase.getRecordBasis().isPreservedSpecimen()){ |
|
73 |
handleSingleSpecimen((DerivedUnit)specOrObservBase, colletionMap); |
|
74 |
}else{ |
|
75 |
logger.warn("There are specimenOrObservationBase objects which are not of class Specimen. This is probably an error."); |
|
76 |
} |
|
77 |
} |
|
78 |
List<SpecimenOrObservationBase> specimenList = new ArrayList<SpecimenOrObservationBase>(specimens); |
|
79 |
cdmApp.getOccurrenceService().save(specimenList); |
|
80 |
} |
|
81 |
|
|
82 |
|
|
83 |
/** |
|
84 |
* @param specimen |
|
85 |
* @param colletionMap |
|
86 |
*/ |
|
87 |
private void handleSingleSpecimen(DerivedUnit specimen, Map<String, Collection> collectionMap) { |
|
88 |
String titleCache = specimen.getTitleCache(); |
|
89 |
String collectionCode = getCollectionCode(titleCache); |
|
90 |
if (StringUtils.isBlank(collectionCode)){ |
|
91 |
logger.warn("Collection code is empty for: " + titleCache); |
|
92 |
}else{ |
|
93 |
Collection collection = collectionMap.get(collectionCode); |
|
94 |
if (collection != null){ |
|
95 |
specimen.setCollection(collection); |
|
96 |
}else{ |
|
97 |
logger.warn("Collection not found for code: " + collectionCode + "; titleCache: " + titleCache); |
|
98 |
} |
|
99 |
} |
|
100 |
} |
|
101 |
|
|
102 |
|
|
103 |
/** |
|
104 |
* @param titleCache |
|
105 |
* @return |
|
106 |
*/ |
|
107 |
private String getCollectionCode(String titleCache) { |
|
108 |
String result = titleCache.trim(); |
|
109 |
result = replaceBracket(result); |
|
110 |
result = replaceLastFullStop(result); |
|
111 |
result = replaceLastQuestionMark(result); |
|
112 |
result = parseLastUpperCase(result); |
|
113 |
return result; |
|
114 |
} |
|
115 |
|
|
116 |
|
|
117 |
/** |
|
118 |
* @param result |
|
119 |
* @return |
|
120 |
*/ |
|
121 |
private String parseLastUpperCase(String string) { |
|
122 |
String result = ""; |
|
123 |
String tmpString = string; |
|
124 |
int pos = tmpString.lastIndexOf(" "); |
|
125 |
if (pos>-1){ |
|
126 |
tmpString = tmpString.substring(pos+1); |
|
127 |
} |
|
128 |
while (tmpString.length() > 0){ |
|
129 |
int len = tmpString.length(); |
|
130 |
char lastChar = tmpString.charAt(len-1); |
|
131 |
if (Character.isUpperCase( lastChar)){ |
|
132 |
result = lastChar + result; |
|
133 |
}else{ |
|
134 |
if (result.length() > 0){ |
|
135 |
logger.warn("Collection code is not space separated: " + string); |
|
136 |
} |
|
137 |
break; |
|
138 |
} |
|
139 |
//remove last character |
|
140 |
tmpString = tmpString.substring(0, tmpString.length()-1); |
|
141 |
} |
|
142 |
return result; |
|
143 |
} |
|
144 |
|
|
145 |
|
|
146 |
|
|
147 |
/** |
|
148 |
* @param result |
|
149 |
* @return |
|
150 |
*/ |
|
151 |
private String replaceLastQuestionMark(String string) { |
|
152 |
if (string.endsWith("?")){ |
|
153 |
string = string.substring(0,string.length()-1).trim(); |
|
154 |
} |
|
155 |
return string; |
|
156 |
} |
|
157 |
|
|
158 |
/** |
|
159 |
* @param result |
|
160 |
* @return |
|
161 |
*/ |
|
162 |
private String replaceLastFullStop(String string) { |
|
163 |
if (string.endsWith(".")){ |
|
164 |
string = string.substring(0,string.length()-1).trim(); |
|
165 |
} |
|
166 |
return string; |
|
167 |
} |
|
168 |
|
|
169 |
|
|
170 |
/** |
|
171 |
* @param result |
|
172 |
* @return |
|
173 |
*/ |
|
174 |
private String replaceBracket(String string) { |
|
175 |
if (string.endsWith("]")){ |
|
176 |
int pos = string.indexOf("["); |
|
177 |
if (pos >0){ |
|
178 |
string = string.substring(0, pos).trim(); |
|
179 |
}else{ |
|
180 |
logger.warn("Closing bracket has no opening bracket in: " + string); |
|
181 |
} |
|
182 |
} |
|
183 |
return string; |
|
184 |
} |
|
185 |
|
|
186 |
|
|
187 |
/** |
|
188 |
* @param cdmApp |
|
189 |
*/ |
|
190 |
private Map<String, Collection> createCollections(CdmApplicationController cdmApp) { |
|
191 |
Map<String, Collection> collectionMap = new HashMap<String, Collection>(); |
|
192 |
List<String[]> lines = getLines(); |
|
193 |
for (String[] line:lines){ |
|
194 |
Collection collection = makeLine(line); |
|
195 |
collectionMap.put(collection.getCode(), collection); |
|
196 |
} |
|
197 |
cdmApp.getCollectionService().save(collectionMap.values()); |
|
198 |
// for (Collection collection: collectionMap.values()){ |
|
199 |
// System.out.println(collection.getTitleCache()); |
|
200 |
// } |
|
201 |
return collectionMap; |
|
202 |
} |
|
203 |
|
|
204 |
|
|
205 |
private Collection makeLine(String[] line) { |
|
206 |
String code = line[0]; |
|
207 |
String instituteName = line[1]; |
|
208 |
String lowerInstitutionName = line[2]; |
|
209 |
String higherInstitutionName = line[3]; |
|
210 |
String location = line[4]; |
|
211 |
String country = line[5]; |
|
212 |
//create objects |
|
213 |
Collection collection = Collection.NewInstance(); |
|
214 |
collection.setCode(code); |
|
215 |
Institution institution = Institution.NewInstance(); |
|
216 |
institution.setCode(code); |
|
217 |
|
|
218 |
institution.setName(instituteName); |
|
219 |
|
|
220 |
if (StringUtils.isNotBlank(lowerInstitutionName)){ |
|
221 |
Institution lowerInstitution = Institution.NewInstance(); |
|
222 |
lowerInstitution.setName(lowerInstitutionName); |
|
223 |
lowerInstitution.setIsPartOf(institution); |
|
224 |
} |
|
225 |
|
|
226 |
if (StringUtils.isNotBlank(higherInstitutionName)){ |
|
227 |
Institution higherInstitution = Institution.NewInstance(); |
|
228 |
higherInstitution.setName(higherInstitutionName); |
|
229 |
institution.setIsPartOf(higherInstitution); |
|
230 |
} |
|
231 |
|
|
232 |
collection.setInstitute(institution); |
|
233 |
String locationAndCountry = CdmUtils.concat("/", location, country); |
|
234 |
collection.setTownOrLocation(locationAndCountry); |
|
235 |
|
|
236 |
String titleCache = CdmUtils.concat(", ", new String[]{instituteName, lowerInstitutionName, higherInstitutionName, location, country}); |
|
237 |
collection.setTitleCache(titleCache, true); |
|
238 |
|
|
239 |
return collection; |
|
240 |
} |
|
241 |
|
|
242 |
|
|
243 |
|
|
244 |
|
|
245 |
private List<String[]> getLines() { |
|
246 |
List<String[]> result = new ArrayList<String[]>(); |
|
247 |
|
|
248 |
try { |
|
249 |
InputStream inStream = new FileInputStream(acronymsFile); |
|
250 |
InputStreamReader inputStreamReader = new InputStreamReader(inStream, "UTF8"); |
|
251 |
CSVReader reader = new CSVReader(inputStreamReader, '\t'); |
|
252 |
String [] nextLine = reader.readNext(); |
|
253 |
|
|
254 |
|
|
255 |
while ((nextLine = reader.readNext()) != null) { |
|
256 |
if (nextLine.length == 0){ |
|
257 |
continue; |
|
258 |
} |
|
259 |
result.add(nextLine); |
|
260 |
} |
|
261 |
return result; |
|
262 |
} catch (Exception e) { |
|
263 |
logger.error(e + " " + e.getCause() + " " + e.getMessage()); |
|
264 |
for(StackTraceElement ste : e.getStackTrace()) { |
|
265 |
logger.error(ste); |
|
266 |
} |
|
267 |
throw new RuntimeException(e); |
|
268 |
} |
|
269 |
} |
|
270 |
|
|
271 |
|
|
272 |
|
|
273 |
|
|
274 |
|
|
275 |
/** |
|
276 |
* @param args |
|
277 |
*/ |
|
278 |
public static void main(String[] args) { |
|
279 |
try { |
|
280 |
DipteraCollectionImport collectionImport = new DipteraCollectionImport(); |
|
281 |
collectionImport.invoke(cdmDestination); |
|
282 |
// String titleCache = "Peru. Mouth of Rio Pachitea. ST 2R SMT. [fig. of male abdomen]"; |
|
283 |
// String collectionCode = collectionImport.getCollectionCode(titleCache); |
|
284 |
// System.out.println(collectionCode); |
|
285 |
} catch (Exception e) { |
|
286 |
e.printStackTrace(); |
|
287 |
System.exit(-1); |
|
288 |
} |
|
289 |
} |
|
290 |
|
|
291 |
} |
app-import/src/main/java/eu/etaxonomy/cdm/app/wp6/diptera/DipteraDistributionParser.java | ||
---|---|---|
1 |
/** |
|
2 |
* Copyright (C) 2007 EDIT |
|
3 |
* European Distributed Institute of Taxonomy |
|
4 |
* http://www.e-taxonomy.eu |
|
5 |
* |
|
6 |
* The contents of this file are subject to the Mozilla Public License Version 1.1 |
|
7 |
* See LICENSE.TXT at the top of this package for the full license terms. |
|
8 |
*/ |
|
9 |
|
|
10 |
/** |
|
11 |
* Copyright (C) 2007 EDIT |
|
12 |
* European Distributed Institute of Taxonomy |
|
13 |
* http://www.e-taxonomy.eu |
|
14 |
* |
|
15 |
* The contents of this file are subject to the Mozilla Public License Version 1.1 |
|
16 |
* See LICENSE.TXT at the top of this package for the full license terms. |
|
17 |
*/ |
|
18 |
package eu.etaxonomy.cdm.app.wp6.diptera; |
|
19 |
|
|
20 |
import java.util.ArrayList; |
|
21 |
import java.util.HashSet; |
|
22 |
import java.util.List; |
|
23 |
import java.util.Set; |
|
24 |
import java.util.regex.Pattern; |
|
25 |
|
|
26 |
import org.apache.log4j.Logger; |
|
27 |
import org.springframework.transaction.TransactionStatus; |
|
28 |
|
|
29 |
import eu.etaxonomy.cdm.api.application.CdmApplicationController; |
|
30 |
import eu.etaxonomy.cdm.api.application.ICdmApplicationConfiguration; |
|
31 |
import eu.etaxonomy.cdm.app.common.CdmDestinations; |
|
32 |
import eu.etaxonomy.cdm.database.DbSchemaValidation; |
|
33 |
import eu.etaxonomy.cdm.database.ICdmDataSource; |
|
34 |
import eu.etaxonomy.cdm.io.common.TdwgAreaProvider; |
|
35 |
import eu.etaxonomy.cdm.model.common.Language; |
|
36 |
import eu.etaxonomy.cdm.model.description.DescriptionBase; |
|
37 |
import eu.etaxonomy.cdm.model.description.DescriptionElementBase; |
|
38 |
import eu.etaxonomy.cdm.model.description.Distribution; |
|
39 |
import eu.etaxonomy.cdm.model.description.Feature; |
|
40 |
import eu.etaxonomy.cdm.model.description.PresenceAbsenceTerm; |
|
41 |
import eu.etaxonomy.cdm.model.description.TaxonDescription; |
|
42 |
import eu.etaxonomy.cdm.model.description.TextData; |
|
43 |
import eu.etaxonomy.cdm.model.location.NamedArea; |
|
44 |
import eu.etaxonomy.cdm.model.taxon.Taxon; |
|
45 |
import eu.etaxonomy.cdm.model.taxon.TaxonBase; |
|
46 |
|
|
47 |
/** |
|
48 |
* @author a.mueller |
|
49 |
* @created 17.10.2008 |
|
50 |
* @version 1.0 |
|
51 |
*/ |
|
52 |
public class DipteraDistributionParser { |
|
53 |
private static final Logger logger = Logger.getLogger(DipteraDistributionParser.class); |
|
54 |
|
|
55 |
private static ICdmDataSource cdmDestination = CdmDestinations.cdm_local_diptera(); |
|
56 |
|
|
57 |
final static String epiSplitter = "(\\s+|\\[|\\]|\\(|\\))"; //( ' '+| '(' | ')'| '[' | ']' ) |
|
58 |
static Pattern pattern = null; |
|
59 |
|
|
60 |
protected void doDistribution(ICdmApplicationConfiguration app){ |
|
61 |
pattern = Pattern.compile(epiSplitter); |
|
62 |
TransactionStatus txStatus = app.startTransaction(); |
|
63 |
List<TaxonBase> taxa = app.getTaxonService().list(null, null, null, null, null); |
|
64 |
for (TaxonBase taxon: taxa ){ |
|
65 |
if (taxon instanceof Taxon){ |
|
66 |
// unlazyDescription(app, (Taxon)taxon); |
|
67 |
Set<TaxonDescription> descriptions = ((Taxon) taxon).getDescriptions(); |
|
68 |
for (DescriptionBase description: descriptions){ |
|
69 |
Set<DescriptionElementBase> descElements = new HashSet<DescriptionElementBase>(); |
|
70 |
descElements.addAll(description.getElements()); |
|
71 |
|
|
72 |
for (DescriptionElementBase descEl: descElements){ |
|
73 |
if (descEl.getFeature().equals(Feature.OCCURRENCE())){ |
|
74 |
if (descEl instanceof TextData){ |
|
75 |
String occString = ((TextData)descEl).getText(Language.ENGLISH()); |
|
76 |
parseOccurenceString(occString, description); |
|
77 |
//app.getTaxonService().saveTaxon(taxon); |
|
78 |
} |
|
79 |
} |
|
80 |
} |
|
81 |
} |
|
82 |
} |
|
83 |
} |
|
84 |
System.out.println("Unknowns: "); |
|
85 |
for (String unknown: unrekognizedStrings){ |
|
86 |
System.out.println(unknown); |
|
87 |
} |
|
88 |
System.out.println("Distributions not recognized: " + countNot); |
|
89 |
System.out.println("Distributions created: " + countYes); |
|
90 |
app.commitTransaction(txStatus); |
|
91 |
} |
|
92 |
|
|
93 |
static Set<String> unrekognizedStrings = new HashSet<String>(); |
|
94 |
static int countNot = 0; |
|
95 |
static int countYes = 0; |
|
96 |
|
|
97 |
private void parseOccurenceString(String occString, DescriptionBase desc){ |
|
98 |
System.out.println(occString); |
|
99 |
if (occString != null){ |
|
100 |
String[] words = pattern.split(occString); |
|
101 |
int i = 0; |
|
102 |
int countSkip = 0; |
|
103 |
for (String word: words){ |
|
104 |
if (word.contains("U.S.A")){ |
|
105 |
logger.warn("U.S.A."); |
|
106 |
} |
|
107 |
boolean isDoubtful = false; |
|
108 |
if (countSkip > 0){ |
|
109 |
countSkip--; |
|
110 |
}else if(word.trim().length() == 0){ |
|
111 |
//skip |
|
112 |
}else{ |
|
113 |
if (word.endsWith(":") && word.length()<=4){ |
|
114 |
//Higher area |
|
115 |
//TODO |
|
116 |
}else{ |
|
117 |
word = word.trim(); |
|
118 |
if (word.contains("?")){ |
|
119 |
isDoubtful = true; |
|
120 |
word = word.replace("?", ""); |
|
121 |
} |
|
122 |
word = adaptWordsToTdwg(word); |
|
123 |
|
|
124 |
if (! "".equals(word) && ! TdwgAreaProvider.isTdwgAreaLabel(word) && ! TdwgAreaProvider.isTdwgAreaAbbreviation(word) && ! isDoubleArea(word)){ |
|
125 |
for (countSkip = 1; countSkip <= 6; countSkip++){ |
|
126 |
word = word.trim(); |
|
127 |
if (! TdwgAreaProvider.isTdwgAreaLabel(word) && ! TdwgAreaProvider.isTdwgAreaAbbreviation(word) && ! isDoubleArea(word)){ |
|
128 |
if (words.length > i + countSkip){ |
|
129 |
word = word + " " + words[i + countSkip]; |
|
130 |
} |
|
131 |
if (word.contains("?")){ |
|
132 |
isDoubtful = true; |
|
133 |
word = word.replace("?", ""); |
|
134 |
} |
|
135 |
word = adaptWordsToTdwg(word); |
|
136 |
if ("".equals(word)){ |
|
137 |
break; |
|
138 |
} |
|
139 |
}else{ |
|
140 |
break; |
|
141 |
} |
|
142 |
} |
|
143 |
} |
|
144 |
if ("".equals(word)){ |
|
145 |
//countSkip = countSkip; |
|
146 |
}else if (! TdwgAreaProvider.isTdwgAreaLabel(word) && ! TdwgAreaProvider.isTdwgAreaAbbreviation(word) && ! isDoubleArea(word) ){ |
|
147 |
if (word.contains("?")){ |
|
148 |
logger.warn("XXX"); |
|
149 |
} |
|
150 |
countNot++; |
|
151 |
System.out.println(" False:" + countNot + ": " + word); |
|
152 |
unrekognizedStrings.add(word); |
|
153 |
countSkip = 0; |
|
154 |
}else{ |
|
155 |
if (word.equals("Netherlands")){ |
|
156 |
if ( countSkip < 0 && words[i + 1].startsWith("Antilles")){ |
|
157 |
word = "Netherlands Antilles"; |
|
158 |
countSkip=2; |
|
159 |
} |
|
160 |
} |
|
161 |
PresenceAbsenceTerm term = PresenceAbsenceTerm.PRESENT(); |
|
162 |
if (isDoubleArea(word)){ |
|
163 |
NamedArea[] doubleArea = getDoubleArea(word); |
|
164 |
for (NamedArea area : doubleArea){ |
|
165 |
Distribution distr = Distribution.NewInstance(area, term); |
|
166 |
desc.addElement(distr); |
|
167 |
} |
|
168 |
}else{ |
|
169 |
NamedArea area; |
|
170 |
if (TdwgAreaProvider.isTdwgAreaLabel(word)){ |
|
171 |
area = TdwgAreaProvider.getAreaByTdwgLabel(word); |
|
172 |
}else{ |
|
173 |
area = TdwgAreaProvider.getAreaByTdwgAbbreviation(word); |
|
174 |
} |
|
175 |
if (isDoubtful){ |
|
176 |
term = PresenceAbsenceTerm.INTRODUCED_PRESENCE_QUESTIONABLE(); |
|
177 |
} |
|
178 |
Distribution distr = Distribution.NewInstance(area, term); |
|
179 |
desc.addElement(distr); |
|
180 |
} |
|
181 |
countYes++; |
|
182 |
System.out.println(" True:" + countYes + ": " + word); |
|
183 |
countSkip--; |
|
184 |
} |
|
185 |
} |
|
186 |
} |
|
187 |
i++; |
|
188 |
} |
|
189 |
} |
|
190 |
} |
|
191 |
|
|
192 |
private boolean isDoubleArea(String word){ |
|
193 |
if ("Canary and Madeira Is.".equalsIgnoreCase(word) || |
|
194 |
"southern Europe".equalsIgnoreCase(word) || |
|
195 |
"former USSR: North and Central European territory".equalsIgnoreCase(word) |
|
196 |
){ |
|
197 |
return true; |
|
198 |
}else{ |
|
199 |
return false; |
|
200 |
} |
|
201 |
} |
|
202 |
|
|
203 |
private NamedArea[] getDoubleArea(String word){ |
|
204 |
NamedArea[] result = new NamedArea[2]; |
|
205 |
if ("Canary and Madeira Is.".equalsIgnoreCase(word)){ |
|
206 |
result[0] = TdwgAreaProvider.getAreaByTdwgAbbreviation("CNY"); |
|
207 |
result[1] = TdwgAreaProvider.getAreaByTdwgAbbreviation("MDR"); |
|
208 |
}else if ("southern Europe".equalsIgnoreCase(word)){ |
|
209 |
result[0] = TdwgAreaProvider.getAreaByTdwgAbbreviation("12"); |
|
210 |
result[1] = TdwgAreaProvider.getAreaByTdwgAbbreviation("13"); |
|
211 |
}else if ("former USSR: North and Central European territory".equalsIgnoreCase(word)){ |
|
212 |
result[0] = TdwgAreaProvider.getAreaByTdwgAbbreviation("RUN-OO"); |
|
213 |
result[1] = TdwgAreaProvider.getAreaByTdwgAbbreviation("RUC-OO"); |
|
214 |
}else{ |
|
215 |
logger.warn("Double area not recognized"); |
|
216 |
} |
|
217 |
return result; |
|
218 |
} |
|
219 |
|
|
220 |
|
|
221 |
static List<String> stopWords = new ArrayList<String>(); |
|
222 |
static List<String> unknownAreas = new ArrayList<String>(); |
|
223 |
static List<String> higherAreas = new ArrayList<String>(); |
|
224 |
|
|
225 |
private String adaptWordsToTdwg(String word){ |
|
226 |
word = word.replace(",", "").replace(";", ""); |
|
227 |
if (! word.contains("U.S.A")){ |
|
228 |
word = word.replace(",", "").replace(".", "").replace(";", ""); |
|
229 |
}else{ |
|
230 |
word = word.replace(",", "").replace(";", ""); |
|
231 |
} |
|
232 |
|
|
233 |
word = word.trim(); |
|
234 |
if (word.endsWith("Is")){ |
|
235 |
word = word + "."; |
|
236 |
} |
|
237 |
if (stopWords.size() == 0){ |
|
238 |
initStopWords(); |
|
239 |
} |
|
240 |
|
|
241 |
word = word.replace("Russia [North European territory]", "North European Russia"); |
|
242 |
word = word.replace("Russia North European territory", "North European Russia"); |
|
243 |
word = word.replace("Russia: North European territory", "North European Russia"); |
|
244 |
word = word.replace("Russia: North European territory", "North European Russia"); |
|
245 |
|
|
246 |
word = word.replace("Amber", "amber"); |
|
247 |
|
|
248 |
|
|
249 |
word = word.replace("Prince Edward Is.", "Marion-Prince Edward Is."); |
|
250 |
//or word = word.replace("Prince Edward Is.", "Prince Edward I."); |
|
251 |
word = word.replace("Bahama Is.", "Bahamas"); |
|
252 |
word = word.replace("Comores Is.", "Comoros"); |
|
253 |
word = word.replace("former Yugoslavia", "Yugoslavia"); |
|
254 |
word = word.replace("former Czechoslovakia", "Czechoslovakia"); |
|
255 |
word = word.replace("Rhodesia", "Zimbabwe"); |
|
256 |
word = word.replace("The Gambia", "Gambia, The"); |
|
257 |
|
|
258 |
if (!word.contains("El Salvador")){ |
|
259 |
word = word.replace("Salvador", "El Salvador"); |
|
260 |
} |
|
261 |
word = word.replace("Vera Cruz", "Veracruz"); |
|
262 |
word = word.replace("Turkmenia", "Turkmenistan"); |
|
263 |
word = word.replace("Qu\u00E9beck", "Qu\u00E9bec"); |
|
264 |
word = word.replace("Quebeck", "Qu\u00E9bec"); |
|
265 |
word = word.replace("Quebec", "Qu\u00E9bec"); |
|
266 |
|
|
267 |
if (!word.contains("Gambia, The")){ |
|
268 |
word = word.replace("Gambia", "Gambia, The"); |
|
269 |
} |
|
270 |
word = word.replace("Mariana Is.", "Marianas"); |
|
271 |
word = word.replace("Kenia", "Kenya"); |
|
272 |
word = word.replace("Central Africa", "Central African Republic"); |
|
273 |
word = word.replace("Canal Zone", ""); |
|
274 |
//word = word.replace("Panama", "Panamá"); |
|
275 |
word = word.replace("Panama", "Panam\u00E1"); |
|
276 |
if (! word.contains("New South Wales")){ |
|
277 |
word = word.replace("Wales", "Great Britain"); |
|
278 |
} |
|
279 |
word = word.replace("Java", "Jawa"); |
|
280 |
word = word.replace("former USSR: North European territory", "North European Russia"); |
|
281 |
word = word.replace("former USSR: South European territory", "South European Russia"); |
|
282 |
word = word.replace("former USSR: Soviet Middle Asia", "Middle Asia"); |
|
283 |
|
|
284 |
word = word.replace("St Kitts-Nevis", "St.Kitts-Nevis"); |
|
285 |
|
|
286 |
word = word.replace("oceanian islands", "Pacific"); |
|
287 |
word = word.replace("Ussuri region", "Primorye"); |
|
288 |
word = word.replace("Galapagos Is.", "Gal\u00E1pagos"); |
|
289 |
word = word.replace("Tarapac\u00E1", "Tarapaca"); |
|
290 |
word = word.replace("Reunion", "R\u00E9union"); |
|
291 |
if (! word.contains("Is.")){ |
|
292 |
word = word.replace("Galapagos", "Gal\u00E1pagos"); |
|
293 |
} |
|
294 |
|
|
295 |
//word = word.replace("Galapagos Is.", "Galápagos"); |
|
296 |
if (! word.contains("Peninsular")){ |
|
297 |
word = word.replace("Malaysia", "Peninsular Malaysia"); |
|
298 |
} |
|
299 |
word = word.replace("Polynesic Is.", "South Solomons"); |
|
300 |
|
|
301 |
word = word.replace("Usbek SSR", "Uzbekistan"); |
|
302 |
word = word.replace("Mexican amber", "Mexico"); |
|
303 |
word = word.replace("Marocco", "Morocco"); |
|
304 |
if (! word.contains("Tobago")){ |
|
305 |
word = word.replace("Trinidad", "Trinidad-Tobago"); |
|
306 |
} |
|
307 |
if (! word.contains("Trinidad")){ |
|
308 |
word = word.replace("Tobago", "Trinidad-Tobago"); |
|
309 |
} |
|
310 |
word = word.replace("Haiti", "Haiti"); |
|
311 |
word = word.replace("Moluccas", "Maluku"); |
|
312 |
word = word.replace("Belau", "Palau"); |
|
313 |
word = word.replace("Dominican amber", "Dominican Republic"); |
|
314 |
if (! word.contains("Russian")){ |
|
315 |
word = word.replace("Far East", "Russian Far East"); |
|
316 |
} |
|
317 |
word = word.replace("Tahiti", "Society Is."); |
|
318 |
word = word.replace("Iraque", "Iraq"); |
|
319 |
word = word.replace("Wake Island", "Wake I."); |
|
320 |
if (! word.contains("I.")){ |
|
321 |
word = word.replace("Johnston I", "Johnston I."); |
|
322 |
word = word.replace("Wake I", "Wake I."); |
|
323 |
word = word.replace("Clipperton I", "Clipperton I."); |
|
324 |
} |
|
325 |
if (! word.contains("Provinces")){ |
|
326 |
word = word.replace("Cape Province", "Cape Provinces"); |
|
327 |
} |
|
328 |
word = word.replace("Eastern Cape Provinces", "Eastern Cape Province"); |
|
329 |
word = word.replace("Western Cape Provinces", "Western Cape Province"); |
|
330 |
if (! word.contains("Barbuda")){ |
|
331 |
word = word.replace("Antigua", "Antigua-Barbuda"); |
|
332 |
} |
|
333 |
if (! word.contains("St.")){ |
|
334 |
word = word.replace("St Vincent", "St.Vincent"); |
|
335 |
word = word.replace("St Lucia", "St.Lucia"); |
|
336 |
word = word.replace("St Helena", "St.Helena"); |
|
337 |
} |
|
338 |
word = word.replace("Asia-tropical", "Asia-Tropical"); |
|
339 |
word = word.replace("Society Islands", "Society Is."); |
|
340 |
word = word.replace("Virgin Islands", "Virgin Is."); |
|
341 |
word = word.replace("Canary Islands", "Canary Is."); |
|
342 |
word = word.replace("Rhode Island", "Rhode I."); |
|
343 |
|
|
344 |
|
|
345 |
word = word.replace("Rodriguez", "Rodrigues"); |
|
346 |
word = word.replace("British Colombia", "British Columbia"); |
|
347 |
word = word.replace("Bermudas", "Bermuda"); |
|
348 |
word = word.replace("Tunesia", "Tunisia"); |
|
349 |
word = word.replace("Santos S\u00E3o Paulo", "S\u00E3o Paulo"); |
|
350 |
word = word.replace("Transvaal", "Northern Provinces"); |
|
351 |
word = word.replace("Tucum\u00E1n", "Tucuman"); |
|
352 |
// if (!word.contains("Netherlands")){ |
|
353 |
// |
|
354 |
// } |
|
355 |
|
|
356 |
// unknownAreas.add("Baltic amber"); |
|
357 |
// unknownAreas.add("Arabia"); |
|
358 |
|
|
359 |
for (String stopWord : stopWords){ |
|
360 |
if (stopWord.equals(word)){ |
|
361 |
System.out.println(" STOP: " + word); |
|
362 |
return ""; |
|
363 |
} |
|
364 |
} |
|
365 |
for (String unknownArea : unknownAreas){ |
|
366 |
if (unknownArea.equals(word)){ |
|
367 |
System.out.println(" UNKNOWN: " + word); |
|
368 |
return ""; |
|
369 |
} |
|
370 |
} |
|
371 |
for (String higherArea : higherAreas){ |
|
372 |
if (higherArea.equals(word)){ |
|
373 |
return ""; |
|
374 |
} |
|
375 |
} |
|
376 |
|
|
377 |
//higher regions |
|
378 |
|
|
379 |
return word; |
|
380 |
} |
|
381 |
|
|
382 |
private void initStopWords(){ |
|
383 |
stopWords.add("and"); |
|
384 |
stopWords.add("Is"); |
|
385 |
stopWords.add("Is."); |
|
386 |
stopWords.add("Islands"); |
|
387 |
stopWords.add("Island"); |
|
388 |
|
|
389 |
stopWords.add("of"); |
|
390 |
stopWords.add("areas"); |
|
391 |
stopWords.add("USA"); |
|
392 |
stopWords.add("Australia"); //except for Australia only |
|
393 |
stopWords.add("Argentina"); |
|
394 |
|
|
395 |
//unknownAreas.add("Panama"); |
|
396 |
unknownAreas.add("South Africa"); |
|
397 |
unknownAreas.add("Chile"); |
|
398 |
|
|
399 |
unknownAreas.add("Baltic amber"); |
|
400 |
unknownAreas.add("Arabia"); |
|
401 |
|
|
402 |
|
|
403 |
higherAreas.add("AF"); |
|
404 |
higherAreas.add("OR"); |
|
405 |
higherAreas.add("PA"); |
|
406 |
higherAreas.add("AU"); |
|
407 |
higherAreas.add("NE"); |
|
408 |
|
|
409 |
higherAreas.add("NT"); |
|
410 |
} |
|
411 |
|
|
412 |
|
|
413 |
/** |
|
414 |
* @param args |
|
415 |
*/ |
|
416 |
public static void main(String[] args) { |
|
417 |
CdmApplicationController app = null; |
|
418 |
DbSchemaValidation val = DbSchemaValidation.UPDATE; |
|
419 |
app = CdmApplicationController.NewInstance(cdmDestination, val); |
|
420 |
|
|
421 |
DipteraDistributionParser dipDist = new DipteraDistributionParser(); |
|
422 |
if (app != null){ |
|
423 |
dipDist.doDistribution(app); |
|
424 |
}else{ |
|
425 |
logger.warn("No Application Context"); |
|
426 |
} |
|
427 |
} |
|
428 |
} |
|
1 |
/** |
|
2 |
* Copyright (C) 2007 EDIT |
|
3 |
* European Distributed Institute of Taxonomy |
|
4 |
* http://www.e-taxonomy.eu |
|
5 |
* |
|
6 |
* The contents of this file are subject to the Mozilla Public License Version 1.1 |
|
7 |
* See LICENSE.TXT at the top of this package for the full license terms. |
|
8 |
*/ |
|
9 |
|
|
10 |
/** |
|
11 |
* Copyright (C) 2007 EDIT |
|
12 |
* European Distributed Institute of Taxonomy |
|
13 |
* http://www.e-taxonomy.eu |
|
14 |
* |
|
15 |
* The contents of this file are subject to the Mozilla Public License Version 1.1 |
|
16 |
* See LICENSE.TXT at the top of this package for the full license terms. |
|
17 |
*/ |
|
18 |
package eu.etaxonomy.cdm.app.wp6.diptera; |
|
19 |
|
|
20 |
import java.util.ArrayList; |
|
21 |
import java.util.HashSet; |
|
22 |
import java.util.List; |
|
23 |
import java.util.Set; |
|
24 |
import java.util.regex.Pattern; |
|
25 |
|
|
26 |
import org.apache.log4j.Logger; |
|
27 |
import org.springframework.transaction.TransactionStatus; |
|
28 |
|
|
29 |
import eu.etaxonomy.cdm.api.application.CdmApplicationController; |
|
30 |
import eu.etaxonomy.cdm.api.application.ICdmApplicationConfiguration; |
|
31 |
import eu.etaxonomy.cdm.app.common.CdmDestinations; |
|
32 |
import eu.etaxonomy.cdm.database.DbSchemaValidation; |
|
33 |
import eu.etaxonomy.cdm.database.ICdmDataSource; |
|
34 |
import eu.etaxonomy.cdm.io.common.TdwgAreaProvider; |
|
35 |
import eu.etaxonomy.cdm.model.common.Language; |
|
36 |
import eu.etaxonomy.cdm.model.description.DescriptionBase; |
|
37 |
import eu.etaxonomy.cdm.model.description.DescriptionElementBase; |
|
38 |
import eu.etaxonomy.cdm.model.description.Distribution; |
|
39 |
import eu.etaxonomy.cdm.model.description.Feature; |
|
40 |
import eu.etaxonomy.cdm.model.description.PresenceAbsenceTerm; |
|
41 |
import eu.etaxonomy.cdm.model.description.TaxonDescription; |
|
42 |
import eu.etaxonomy.cdm.model.description.TextData; |
|
43 |
import eu.etaxonomy.cdm.model.location.NamedArea; |
|
44 |
import eu.etaxonomy.cdm.model.taxon.Taxon; |
|
45 |
import eu.etaxonomy.cdm.model.taxon.TaxonBase; |
|
46 |
|
|
47 |
/** |
|
48 |
* @author a.mueller |
|
49 |
* @created 17.10.2008 |
|
50 |
* @version 1.0 |
|
51 |
*/ |
|
52 |
public class DipteraDistributionParser { |
|
53 |
private static final Logger logger = Logger.getLogger(DipteraDistributionParser.class); |
|
54 |
|
|
55 |
private static ICdmDataSource cdmDestination = CdmDestinations.localH2(); |
|
56 |
|
|
57 |
final static String epiSplitter = "(\\s+|\\[|\\]|\\(|\\))"; //( ' '+| '(' | ')'| '[' | ']' ) |
|
58 |
static Pattern pattern = null; |
|
59 |
|
|
60 |
protected void doDistribution(ICdmApplicationConfiguration app){ |
|
61 |
pattern = Pattern.compile(epiSplitter); |
|
62 |
TransactionStatus txStatus = app.startTransaction(); |
|
63 |
List<TaxonBase> taxa = app.getTaxonService().list(null, null, null, null, null); |
|
64 |
for (TaxonBase taxon: taxa ){ |
|
65 |
if (taxon instanceof Taxon){ |
|
66 |
// unlazyDescription(app, (Taxon)taxon); |
|
67 |
Set<TaxonDescription> descriptions = ((Taxon) taxon).getDescriptions(); |
|
68 |
for (DescriptionBase description: descriptions){ |
|
69 |
Set<DescriptionElementBase> descElements = new HashSet<DescriptionElementBase>(); |
|
70 |
descElements.addAll(description.getElements()); |
|
71 |
|
|
72 |
for (DescriptionElementBase descEl: descElements){ |
|
73 |
if (descEl.getFeature().equals(Feature.OCCURRENCE())){ |
|
74 |
if (descEl instanceof TextData){ |
|
75 |
String occString = ((TextData)descEl).getText(Language.ENGLISH()); |
|
76 |
parseOccurenceString(occString, description); |
|
77 |
//app.getTaxonService().saveTaxon(taxon); |
|
78 |
} |
|
79 |
} |
|
80 |
} |
|
81 |
} |
|
82 |
} |
|
83 |
} |
|
84 |
System.out.println("Unknowns: "); |
|
85 |
for (String unknown: unrekognizedStrings){ |
|
86 |
System.out.println(unknown); |
|
87 |
} |
|
88 |
System.out.println("Distributions not recognized: " + countNot); |
|
89 |
System.out.println("Distributions created: " + countYes); |
|
90 |
app.commitTransaction(txStatus); |
|
91 |
} |
|
92 |
|
|
93 |
static Set<String> unrekognizedStrings = new HashSet<String>(); |
|
94 |
static int countNot = 0; |
|
95 |
static int countYes = 0; |
|
96 |
|
|
97 |
private void parseOccurenceString(String occString, DescriptionBase desc){ |
|
98 |
System.out.println(occString); |
|
99 |
if (occString != null){ |
|
100 |
String[] words = pattern.split(occString); |
|
101 |
int i = 0; |
|
102 |
int countSkip = 0; |
|
103 |
for (String word: words){ |
|
104 |
if (word.contains("U.S.A")){ |
|
105 |
logger.warn("U.S.A."); |
|
106 |
} |
|
107 |
boolean isDoubtful = false; |
|
108 |
if (countSkip > 0){ |
|
109 |
countSkip--; |
|
110 |
}else if(word.trim().length() == 0){ |
|
111 |
//skip |
|
112 |
}else{ |
|
113 |
if (word.endsWith(":") && word.length()<=4){ |
|
114 |
//Higher area |
|
115 |
//TODO |
|
116 |
}else{ |
|
117 |
word = word.trim(); |
|
118 |
if (word.contains("?")){ |
|
119 |
isDoubtful = true; |
|
120 |
word = word.replace("?", ""); |
|
121 |
} |
|
122 |
word = adaptWordsToTdwg(word); |
|
123 |
|
|
124 |
if (! "".equals(word) && ! TdwgAreaProvider.isTdwgAreaLabel(word) && ! TdwgAreaProvider.isTdwgAreaAbbreviation(word) && ! isDoubleArea(word)){ |
|
125 |
for (countSkip = 1; countSkip <= 6; countSkip++){ |
|
126 |
word = word.trim(); |
|
127 |
if (! TdwgAreaProvider.isTdwgAreaLabel(word) && ! TdwgAreaProvider.isTdwgAreaAbbreviation(word) && ! isDoubleArea(word)){ |
|
128 |
if (words.length > i + countSkip){ |
|
129 |
word = word + " " + words[i + countSkip]; |
|
130 |
} |
|
131 |
if (word.contains("?")){ |
|
132 |
isDoubtful = true; |
|
133 |
word = word.replace("?", ""); |
|
134 |
} |
|
135 |
word = adaptWordsToTdwg(word); |
|
136 |
if ("".equals(word)){ |
|
137 |
break; |
|
138 |
} |
|
139 |
}else{ |
|
140 |
break; |
|
141 |
} |
|
142 |
} |
|
143 |
} |
|
144 |
if ("".equals(word)){ |
|
145 |
//countSkip = countSkip; |
|
146 |
}else if (! TdwgAreaProvider.isTdwgAreaLabel(word) && ! TdwgAreaProvider.isTdwgAreaAbbreviation(word) && ! isDoubleArea(word) ){ |
|
147 |
if (word.contains("?")){ |
|
148 |
logger.warn("XXX"); |
|
149 |
} |
|
150 |
countNot++; |
|
151 |
System.out.println(" False:" + countNot + ": " + word); |
|
152 |
unrekognizedStrings.add(word); |
|
153 |
countSkip = 0; |
|
154 |
}else{ |
|
155 |
if (word.equals("Netherlands")){ |
|
156 |
if ( countSkip < 0 && words[i + 1].startsWith("Antilles")){ |
|
157 |
word = "Netherlands Antilles"; |
|
158 |
countSkip=2; |
|
159 |
} |
|
160 |
} |
|
161 |
PresenceAbsenceTerm term = PresenceAbsenceTerm.PRESENT(); |
|
162 |
if (isDoubleArea(word)){ |
|
163 |
NamedArea[] doubleArea = getDoubleArea(word); |
|
164 |
for (NamedArea area : doubleArea){ |
|
165 |
Distribution distr = Distribution.NewInstance(area, term); |
|
166 |
desc.addElement(distr); |
|
167 |
} |
|
168 |
}else{ |
|
169 |
NamedArea area; |
|
170 |
if (TdwgAreaProvider.isTdwgAreaLabel(word)){ |
|
171 |
area = TdwgAreaProvider.getAreaByTdwgLabel(word); |
|
172 |
}else{ |
|
173 |
area = TdwgAreaProvider.getAreaByTdwgAbbreviation(word); |
|
174 |
} |
|
175 |
if (isDoubtful){ |
|
176 |
term = PresenceAbsenceTerm.INTRODUCED_PRESENCE_QUESTIONABLE(); |
|
177 |
} |
|
178 |
Distribution distr = Distribution.NewInstance(area, term); |
|
179 |
desc.addElement(distr); |
|
180 |
} |
|
181 |
countYes++; |
|
182 |
System.out.println(" True:" + countYes + ": " + word); |
|
183 |
countSkip--; |
|
184 |
} |
|
185 |
} |
|
186 |
} |
|
187 |
i++; |
|
188 |
} |
|
189 |
} |
|
190 |
} |
|
191 |
|
|
192 |
private boolean isDoubleArea(String word){ |
|
193 |
if ("Canary and Madeira Is.".equalsIgnoreCase(word) || |
|
194 |
"southern Europe".equalsIgnoreCase(word) || |
|
195 |
"former USSR: North and Central European territory".equalsIgnoreCase(word) |
|
196 |
){ |
|
197 |
return true; |
|
198 |
}else{ |
|
199 |
return false; |
|
200 |
} |
|
201 |
} |
|
202 |
|
|
203 |
private NamedArea[] getDoubleArea(String word){ |
|
204 |
NamedArea[] result = new NamedArea[2]; |
|
205 |
if ("Canary and Madeira Is.".equalsIgnoreCase(word)){ |
|
206 |
result[0] = TdwgAreaProvider.getAreaByTdwgAbbreviation("CNY"); |
|
207 |
result[1] = TdwgAreaProvider.getAreaByTdwgAbbreviation("MDR"); |
|
208 |
}else if ("southern Europe".equalsIgnoreCase(word)){ |
|
209 |
result[0] = TdwgAreaProvider.getAreaByTdwgAbbreviation("12"); |
|
210 |
result[1] = TdwgAreaProvider.getAreaByTdwgAbbreviation("13"); |
|
211 |
}else if ("former USSR: North and Central European territory".equalsIgnoreCase(word)){ |
|
212 |
result[0] = TdwgAreaProvider.getAreaByTdwgAbbreviation("RUN-OO"); |
|
213 |
result[1] = TdwgAreaProvider.getAreaByTdwgAbbreviation("RUC-OO"); |
|
214 |
}else{ |
|
215 |
logger.warn("Double area not recognized"); |
|
216 |
} |
|
217 |
return result; |
|
218 |
} |
|
219 |
|
|
220 |
|
|
221 |
static List<String> stopWords = new ArrayList<String>(); |
|
222 |
static List<String> unknownAreas = new ArrayList<String>(); |
|
223 |
static List<String> higherAreas = new ArrayList<String>(); |
|
224 |
|
|
225 |
private String adaptWordsToTdwg(String word){ |
|
226 |
word = word.replace(",", "").replace(";", ""); |
|
227 |
if (! word.contains("U.S.A")){ |
|
228 |
word = word.replace(",", "").replace(".", "").replace(";", ""); |
|
229 |
}else{ |
|
230 |
word = word.replace(",", "").replace(";", ""); |
|
231 |
} |
|
232 |
|
|
233 |
word = word.trim(); |
|
234 |
if (word.endsWith("Is")){ |
|
235 |
word = word + "."; |
|
236 |
} |
|
237 |
if (stopWords.size() == 0){ |
|
238 |
initStopWords(); |
|
239 |
} |
|
240 |
|
|
241 |
word = word.replace("Russia [North European territory]", "North European Russia"); |
|
242 |
word = word.replace("Russia North European territory", "North European Russia"); |
|
243 |
word = word.replace("Russia: North European territory", "North European Russia"); |
|
244 |
word = word.replace("Russia: North European territory", "North European Russia"); |
|
245 |
|
|
246 |
word = word.replace("Amber", "amber"); |
|
247 |
|
|
248 |
|
|
249 |
word = word.replace("Prince Edward Is.", "Marion-Prince Edward Is."); |
|
250 |
//or word = word.replace("Prince Edward Is.", "Prince Edward I."); |
|
251 |
word = word.replace("Bahama Is.", "Bahamas"); |
|
252 |
word = word.replace("Comores Is.", "Comoros"); |
|
253 |
word = word.replace("former Yugoslavia", "Yugoslavia"); |
|
254 |
word = word.replace("former Czechoslovakia", "Czechoslovakia"); |
|
255 |
word = word.replace("Rhodesia", "Zimbabwe"); |
|
256 |
word = word.replace("The Gambia", "Gambia, The"); |
|
257 |
|
|
258 |
if (!word.contains("El Salvador")){ |
|
259 |
word = word.replace("Salvador", "El Salvador"); |
|
260 |
} |
|
261 |
word = word.replace("Vera Cruz", "Veracruz"); |
|
262 |
word = word.replace("Turkmenia", "Turkmenistan"); |
|
263 |
word = word.replace("Qu\u00E9beck", "Qu\u00E9bec"); |
|
264 |
word = word.replace("Quebeck", "Qu\u00E9bec"); |
|
265 |
word = word.replace("Quebec", "Qu\u00E9bec"); |
|
266 |
|
|
267 |
if (!word.contains("Gambia, The")){ |
|
268 |
word = word.replace("Gambia", "Gambia, The"); |
|
269 |
} |
|
270 |
word = word.replace("Mariana Is.", "Marianas"); |
|
271 |
word = word.replace("Kenia", "Kenya"); |
|
272 |
word = word.replace("Central Africa", "Central African Republic"); |
|
273 |
word = word.replace("Canal Zone", ""); |
|
274 |
//word = word.replace("Panama", "Panamá"); |
|
275 |
word = word.replace("Panama", "Panam\u00E1"); |
|
276 |
if (! word.contains("New South Wales")){ |
|
277 |
word = word.replace("Wales", "Great Britain"); |
|
278 |
} |
|
279 |
word = word.replace("Java", "Jawa"); |
|
280 |
word = word.replace("former USSR: North European territory", "North European Russia"); |
|
281 |
word = word.replace("former USSR: South European territory", "South European Russia"); |
|
282 |
word = word.replace("former USSR: Soviet Middle Asia", "Middle Asia"); |
|
283 |
|
|
284 |
word = word.replace("St Kitts-Nevis", "St.Kitts-Nevis"); |
|
285 |
|
|
286 |
word = word.replace("oceanian islands", "Pacific"); |
|
287 |
word = word.replace("Ussuri region", "Primorye"); |
|
288 |
word = word.replace("Galapagos Is.", "Gal\u00E1pagos"); |
|
289 |
word = word.replace("Tarapac\u00E1", "Tarapaca"); |
|
290 |
word = word.replace("Reunion", "R\u00E9union"); |
|
291 |
if (! word.contains("Is.")){ |
|
292 |
word = word.replace("Galapagos", "Gal\u00E1pagos"); |
|
293 |
} |
|
294 |
|
|
295 |
//word = word.replace("Galapagos Is.", "Galápagos"); |
|
296 |
if (! word.contains("Peninsular")){ |
|
297 |
word = word.replace("Malaysia", "Peninsular Malaysia"); |
|
298 |
} |
|
299 |
word = word.replace("Polynesic Is.", "South Solomons"); |
|
300 |
|
|
301 |
word = word.replace("Usbek SSR", "Uzbekistan"); |
|
302 |
word = word.replace("Mexican amber", "Mexico"); |
|
303 |
word = word.replace("Marocco", "Morocco"); |
|
304 |
if (! word.contains("Tobago")){ |
|
305 |
word = word.replace("Trinidad", "Trinidad-Tobago"); |
|
306 |
} |
|
307 |
if (! word.contains("Trinidad")){ |
|
308 |
word = word.replace("Tobago", "Trinidad-Tobago"); |
|
309 |
} |
|
310 |
word = word.replace("Haiti", "Haiti"); |
|
311 |
word = word.replace("Moluccas", "Maluku"); |
|
312 |
word = word.replace("Belau", "Palau"); |
|
313 |
word = word.replace("Dominican amber", "Dominican Republic"); |
|
314 |
if (! word.contains("Russian")){ |
|
315 |
word = word.replace("Far East", "Russian Far East"); |
|
316 |
} |
|
317 |
word = word.replace("Tahiti", "Society Is."); |
|
318 |
word = word.replace("Iraque", "Iraq"); |
|
319 |
word = word.replace("Wake Island", "Wake I."); |
|
320 |
if (! word.contains("I.")){ |
|
321 |
word = word.replace("Johnston I", "Johnston I."); |
|
322 |
word = word.replace("Wake I", "Wake I."); |
|
323 |
word = word.replace("Clipperton I", "Clipperton I."); |
|
324 |
} |
|
325 |
if (! word.contains("Provinces")){ |
|
326 |
word = word.replace("Cape Province", "Cape Provinces"); |
|
327 |
} |
|
328 |
word = word.replace("Eastern Cape Provinces", "Eastern Cape Province"); |
|
329 |
word = word.replace("Western Cape Provinces", "Western Cape Province"); |
|
330 |
if (! word.contains("Barbuda")){ |
|
331 |
word = word.replace("Antigua", "Antigua-Barbuda"); |
|
332 |
} |
|
333 |
if (! word.contains("St.")){ |
|
334 |
word = word.replace("St Vincent", "St.Vincent"); |
|
335 |
word = word.replace("St Lucia", "St.Lucia"); |
|
336 |
word = word.replace("St Helena", "St.Helena"); |
|
337 |
} |
|
338 |
word = word.replace("Asia-tropical", "Asia-Tropical"); |
|
339 |
word = word.replace("Society Islands", "Society Is."); |
|
340 |
word = word.replace("Virgin Islands", "Virgin Is."); |
|
341 |
word = word.replace("Canary Islands", "Canary Is."); |
|
342 |
word = word.replace("Rhode Island", "Rhode I."); |
|
343 |
|
|
344 |
|
|
345 |
word = word.replace("Rodriguez", "Rodrigues"); |
|
346 |
word = word.replace("British Colombia", "British Columbia"); |
|
347 |
word = word.replace("Bermudas", "Bermuda"); |
|
348 |
word = word.replace("Tunesia", "Tunisia"); |
|
349 |
word = word.replace("Santos S\u00E3o Paulo", "S\u00E3o Paulo"); |
|
350 |
word = word.replace("Transvaal", "Northern Provinces"); |
|
351 |
word = word.replace("Tucum\u00E1n", "Tucuman"); |
|
352 |
// if (!word.contains("Netherlands")){ |
|
353 |
// |
|
354 |
// } |
|
355 |
|
|
356 |
// unknownAreas.add("Baltic amber"); |
|
357 |
// unknownAreas.add("Arabia"); |
|
358 |
|
|
359 |
for (String stopWord : stopWords){ |
|
360 |
if (stopWord.equals(word)){ |
|
361 |
System.out.println(" STOP: " + word); |
|
362 |
return ""; |
|
363 |
} |
|
364 |
} |
|
365 |
for (String unknownArea : unknownAreas){ |
|
366 |
if (unknownArea.equals(word)){ |
|
367 |
System.out.println(" UNKNOWN: " + word); |
|
368 |
return ""; |
|
369 |
} |
|
370 |
} |
|
371 |
for (String higherArea : higherAreas){ |
|
372 |
if (higherArea.equals(word)){ |
|
373 |
return ""; |
|
374 |
} |
|
375 |
} |
|
376 |
|
|
377 |
//higher regions |
|
378 |
|
|
379 |
return word; |
|
380 |
} |
|
381 |
|
|
382 |
private void initStopWords(){ |
|
383 |
stopWords.add("and"); |
|
384 |
stopWords.add("Is"); |
|
385 |
stopWords.add("Is."); |
|
386 |
stopWords.add("Islands"); |
|
387 |
stopWords.add("Island"); |
|
388 |
|
|
389 |
stopWords.add("of"); |
|
390 |
stopWords.add("areas"); |
|
391 |
stopWords.add("USA"); |
|
392 |
stopWords.add("Australia"); //except for Australia only |
|
393 |
stopWords.add("Argentina"); |
|
394 |
|
|
395 |
//unknownAreas.add("Panama"); |
|
396 |
unknownAreas.add("South Africa"); |
|
397 |
unknownAreas.add("Chile"); |
|
398 |
|
|
399 |
unknownAreas.add("Baltic amber"); |
|
400 |
unknownAreas.add("Arabia"); |
|
401 |
|
|
402 |
|
|
403 |
higherAreas.add("AF"); |
|
404 |
higherAreas.add("OR"); |
|
405 |
higherAreas.add("PA"); |
|
406 |
higherAreas.add("AU"); |
|
407 |
higherAreas.add("NE"); |
|
408 |
|
|
409 |
higherAreas.add("NT"); |
|
410 |
} |
|
411 |
|
|
412 |
|
|
413 |
/** |
|
414 |
* @param args |
|
415 |
*/ |
|
416 |
public static void main(String[] args) { |
|
417 |
CdmApplicationController app = null; |
|
418 |
DbSchemaValidation val = DbSchemaValidation.UPDATE; |
|
419 |
app = CdmApplicationController.NewInstance(cdmDestination, val); |
|
420 |
|
|
421 |
DipteraDistributionParser dipDist = new DipteraDistributionParser(); |
|
422 |
if (app != null){ |
|
423 |
dipDist.doDistribution(app); |
|
424 |
}else{ |
|
425 |
logger.warn("No Application Context"); |
|
426 |
} |
|
427 |
} |
|
428 |
} |
Also available in: Unified diff
Remove local diptera db from destinations