Diptera distribution parser
[cdmlib.git] / app-import / src / main / java / eu / etaxonomy / cdm / app / berlinModelImport / DipteraDistributionParser.java
/**
* Copyright (C) 2007 EDIT
* European Distributed Institute of Taxonomy
* http://www.e-taxonomy.eu
*
* The contents of this file are subject to the Mozilla Public License Version 1.1
* See LICENSE.TXT at the top of this package for the full license terms.
*/
9
10 package eu.etaxonomy.cdm.app.berlinModelImport;
11
12 import java.util.ArrayList;
13 import java.util.HashSet;
14 import java.util.List;
15 import java.util.Set;
16 import java.util.regex.Pattern;
17
18 import org.apache.log4j.Logger;
19 import org.springframework.transaction.TransactionStatus;
20
21 import eu.etaxonomy.cdm.api.application.CdmApplicationController;
22 import eu.etaxonomy.cdm.app.common.CdmDestinations;
23 import eu.etaxonomy.cdm.database.DataSourceNotFoundException;
24 import eu.etaxonomy.cdm.database.DbSchemaValidation;
25 import eu.etaxonomy.cdm.database.ICdmDataSource;
26 import eu.etaxonomy.cdm.model.common.Language;
27 import eu.etaxonomy.cdm.model.common.init.TermNotFoundException;
28 import eu.etaxonomy.cdm.model.description.DescriptionBase;
29 import eu.etaxonomy.cdm.model.description.DescriptionElementBase;
30 import eu.etaxonomy.cdm.model.description.Distribution;
31 import eu.etaxonomy.cdm.model.description.Feature;
32 import eu.etaxonomy.cdm.model.description.PresenceAbsenceTermBase;
33 import eu.etaxonomy.cdm.model.description.PresenceTerm;
34 import eu.etaxonomy.cdm.model.description.TaxonDescription;
35 import eu.etaxonomy.cdm.model.description.TextData;
36 import eu.etaxonomy.cdm.model.location.NamedArea;
37 import eu.etaxonomy.cdm.model.location.TdwgArea;
38 import eu.etaxonomy.cdm.model.taxon.Taxon;
39 import eu.etaxonomy.cdm.model.taxon.TaxonBase;
40
41 /**
42 * @author a.mueller
43 * @created 17.10.2008
44 * @version 1.0
45 */
46 public class DipteraDistributionParser {
47 private static final Logger logger = Logger.getLogger(DipteraDistributionParser.class);
48
49 final static String epiSplitter = "(\\s+|\\[|\\]|\\(|\\))"; //( ' '+| '(' | ')'| '[' | ']' )
50 static Pattern pattern = null;
51
52 protected void doDistribution(CdmApplicationController app){
53 pattern = Pattern.compile(epiSplitter);
54 TransactionStatus txStatus = app.startTransaction();
55 List<TaxonBase> taxa = app.getTaxonService().getAllTaxonBases(1000000, 0);
56 for (TaxonBase taxon: taxa ){
57 if (taxon instanceof Taxon){
58 // unlazyDescription(app, (Taxon)taxon);
59 Set<TaxonDescription> descriptions = ((Taxon) taxon).getDescriptions();
60 for (DescriptionBase description: descriptions){
61 Set<DescriptionElementBase> descElements = new HashSet<DescriptionElementBase>();
62 descElements.addAll(description.getElements());
63
64 for (DescriptionElementBase descEl: descElements){
65 if (descEl.getFeature().equals(Feature.OCCURRENCE())){
66 if (descEl instanceof TextData){
67 String occString = ((TextData)descEl).getText(Language.ENGLISH());
68 parseOccurenceString(occString, description);
69 //app.getTaxonService().saveTaxon(taxon);
70 }
71 }
72 }
73 }
74 }
75 }
76 System.out.println("Unknowns: ");
77 for (String unknown: unrekognizedStrings){
78 System.out.println(unknown);
79 }
80 System.out.println("Distributions not recognized: " + countNot);
81 System.out.println("Distributions created: " + countYes);
82 app.commitTransaction(txStatus);
83 }
84
85 static Set<String> unrekognizedStrings = new HashSet<String>();
86 static int countNot = 0;
87 static int countYes = 0;
88
89 private void parseOccurenceString(String occString, DescriptionBase desc){
90 System.out.println(occString);
91 if (occString != null){
92 String[] words = pattern.split(occString);
93 int i = 0;
94 int countSkip = 0;
95 for (String word: words){
96 if (word.contains("U.S.A")){
97 logger.warn("U.S.A.");
98 }
99 boolean isDoubtful = false;
100 if (countSkip > 0){
101 countSkip--;
102 }else if(word.trim().length() == 0){
103 //skip
104 }else{
105 if (word.endsWith(":") && word.length()<=4){
106 //Higher area
107 //TODO
108 }else{
109 word = word.trim();
110 if (word.contains("?")){
111 isDoubtful = true;
112 word = word.replace("?", "");
113 }
114 word = adaptWordsToTdwg(word);
115
116 if (! "".equals(word) && ! TdwgArea.isTdwgAreaLabel(word) && ! TdwgArea.isTdwgAreaAbbreviation(word) && ! isDoubleArea(word)){
117 for (countSkip = 1; countSkip <= 6; countSkip++){
118 word = word.trim();
119 if (! TdwgArea.isTdwgAreaLabel(word) && ! TdwgArea.isTdwgAreaAbbreviation(word) && ! isDoubleArea(word)){
120 if (words.length > i + countSkip){
121 word = word + " " + words[i + countSkip];
122 }
123 if (word.contains("?")){
124 isDoubtful = true;
125 word = word.replace("?", "");
126 }
127 word = adaptWordsToTdwg(word);
128 if ("".equals(word)){
129 break;
130 }
131 }else{
132 break;
133 }
134 }
135 }
136 if ("".equals(word)){
137 //countSkip = countSkip;
138 }else if (! TdwgArea.isTdwgAreaLabel(word) && ! TdwgArea.isTdwgAreaAbbreviation(word) && ! isDoubleArea(word) ){
139 if (word.contains("?")){
140 logger.warn("XXX");
141 }
142 countNot++;
143 System.out.println(" False:" + countNot + ": " + word);
144 unrekognizedStrings.add(word);
145 countSkip = 0;
146 }else{
147 PresenceAbsenceTermBase<?> term = PresenceTerm.PRESENT();
148 if (isDoubleArea(word)){
149 NamedArea[] doubleArea = getDoubleArea(word);
150 for (NamedArea area : doubleArea){
151 Distribution distr = Distribution.NewInstance(area, term);
152 desc.addElement(distr);
153 }
154 }else{
155 NamedArea area;
156 if (TdwgArea.isTdwgAreaLabel(word)){
157 area = TdwgArea.getAreaByTdwgLabel(word);
158 }else{
159 area = TdwgArea.getAreaByTdwgAbbreviation(word);
160 }
161 if (isDoubtful){
162 term = PresenceTerm.INTRODUCED_PRESENCE_QUESTIONABLE();
163 }
164 Distribution distr = Distribution.NewInstance(area, term);
165 desc.addElement(distr);
166 }
167 countYes++;
168 System.out.println(" True:" + countYes + ": " + word);
169 countSkip--;
170 }
171 }
172 }
173 i++;
174 }
175 }
176 }
177
178 private boolean isDoubleArea(String word){
179 if ("Canary and Madeira Is.".equalsIgnoreCase(word) ||
180 "southern Europe".equalsIgnoreCase(word) ||
181 "former USSR: North and Central European territory".equalsIgnoreCase(word)
182 ){
183 return true;
184 }else{
185 return false;
186 }
187 }
188
189 private NamedArea[] getDoubleArea(String word){
190 NamedArea[] result = new NamedArea[2];
191 if ("Canary and Madeira Is.".equalsIgnoreCase(word)){
192 result[0] = TdwgArea.getAreaByTdwgAbbreviation("CNY");
193 result[1] = TdwgArea.getAreaByTdwgAbbreviation("MDR");
194 }else if ("southern Europe".equalsIgnoreCase(word)){
195 result[0] = TdwgArea.getAreaByTdwgAbbreviation("12");
196 result[1] = TdwgArea.getAreaByTdwgAbbreviation("13");
197 }else if ("former USSR: North and Central European territory".equalsIgnoreCase(word)){
198 result[0] = TdwgArea.getAreaByTdwgAbbreviation("RUN-OO");
199 result[1] = TdwgArea.getAreaByTdwgAbbreviation("RUC-OO");
200 }else{
201 logger.warn("Double area not recognized");
202 }
203 return result;
204 }
205
206
207 static List<String> stopWords = new ArrayList<String>();
208 static List<String> unknownAreas = new ArrayList<String>();
209 static List<String> higherAreas = new ArrayList<String>();
210
211 private String adaptWordsToTdwg(String word){
212 word = word.replace(",", "").replace(";", "");
213 if (! word.contains("U.S.A")){
214 word = word.replace(",", "").replace(".", "").replace(";", "");
215 }else{
216 word = word.replace(",", "").replace(";", "");
217 }
218
219 word = word.trim();
220 if (word.endsWith("Is")){
221 word = word + ".";
222 }
223 if (stopWords.size() == 0){
224 initStopWords();
225 }
226
227 word = word.replace("Russia [North European territory]", "North European Russia");
228 word = word.replace("Russia North European territory", "North European Russia");
229 word = word.replace("Russia: North European territory", "North European Russia");
230 word = word.replace("Russia: North European territory", "North European Russia");
231
232 word = word.replace("Amber", "amber");
233
234
235 word = word.replace("Prince Edward Is.", "Marion-Prince Edward Is.");
236 //or word = word.replace("Prince Edward Is.", "Prince Edward I.");
237 word = word.replace("Bahama Is.", "Bahamas");
238 word = word.replace("Comores Is.", "Comoros");
239 word = word.replace("former Yugoslavia", "Yugoslavia");
240 word = word.replace("former Czechoslovakia", "Czechoslovakia");
241 word = word.replace("Rhodesia", "Zimbabwe");
242 if (!word.contains("El Salvador")){
243 word = word.replace("Salvador", "El Salvador");
244 }
245 word = word.replace("Vera Cruz", "Veracruz");
246 word = word.replace("Turkmenia", "Turkmenistan");
247 word = word.replace("Québeck", "Québec");
248 word = word.replace("Quebeck", "Québec");
249 word = word.replace("Quebec", "Québec");
250 //word = word.replace("Quebec", "Qu+®bec");
251 //word = word.replace("Quebec", "Qu├®bec");
252
253 word = word.replace("Gambia", "Gambia, The");
254 word = word.replace("Mariana Is.", "Marianas");
255 word = word.replace("Kenia", "Kenya");
256 word = word.replace("Central Africa", "Central African Republic");
257 word = word.replace("Canal Zone", "");
258 //word = word.replace("Panama", "Panamá");
259 word = word.replace("Panama", "Panamá");
260 if (! word.contains("New South Wales")){
261 word = word.replace("Wales", "Great Britain");
262 }
263 word = word.replace("Java", "Jawa");
264 word = word.replace("former USSR: North European territory", "North European Russia");
265 word = word.replace("former USSR: South European territory", "South European Russia");
266 word = word.replace("former USSR: Soviet Middle Asia", "Middle Asia");
267
268 word = word.replace("oceanian islands", "Pacific");
269 word = word.replace("Ussuri region", "Primorye");
270 word = word.replace("Galapagos Is.", "Galápagos");
271 if (! word.contains("Is.")){
272 word = word.replace("Galapagos", "Galápagos");
273 }
274
275 //word = word.replace("Galapagos Is.", "Galápagos");
276 if (! word.contains("Peninsular")){
277 word = word.replace("Malaysia", "Peninsular Malaysia");
278 }
279 word = word.replace("Polynesic Is.", "South Solomons");
280
281 word = word.replace("Usbek SSR", "Uzbekistan");
282 word = word.replace("Mexican amber", "Mexico");
283 word = word.replace("Marocco", "Morocco");
284 if (! word.contains("Tobago")){
285 word = word.replace("Trinidad", "Trinidad-Tobago");
286 }
287 if (! word.contains("Trinidad")){
288 word = word.replace("Tobago", "Trinidad-Tobago");
289 }
290 word = word.replace("Haiti", "Haiti");
291 word = word.replace("Moluccas", "Maluku");
292 word = word.replace("Belau", "Palau");
293 word = word.replace("Dominican amber", "Dominican Republic");
294 if (! word.contains("Russian")){
295 word = word.replace("Far East", "Russian Far East");
296 }
297 word = word.replace("Tahiti", "Society Is.");
298 word = word.replace("Iraque", "Iraq");
299 word = word.replace("Wake Island", "Wake I.");
300 if (! word.contains("I.")){
301 word = word.replace("Johnston I", "Johnston I.");
302 word = word.replace("Wake I", "Wake I.");
303 word = word.replace("Clipperton I", "Clipperton I.");
304 }
305 if (! word.contains("Provinces")){
306 word = word.replace("Cape Province", "Cape Provinces");
307 }
308 word = word.replace("Eastern Cape Provinces", "Eastern Cape Province");
309 if (! word.contains("Barbuda")){
310 word = word.replace("Antigua", "Antigua-Barbuda");
311 }
312 if (! word.contains("St.")){
313 word = word.replace("St Vincent", "St.Vincent");
314 word = word.replace("St Lucia", "St.Lucia");
315 word = word.replace("St Helena", "St.Helena");
316 }
317 word = word.replace("Asia-tropical", "Asia-Tropical");
318 word = word.replace("Society Islands", "Society Is.");
319 word = word.replace("Virgin Islands", "Virgin Is.");
320 word = word.replace("Canary Islands", "Canary Is.");
321 word = word.replace("Rhode Island", "Rhode I.");
322
323
324 word = word.replace("Rodriguez", "Rodrigues");
325 word = word.replace("British Colombia", "British Columbia");
326 word = word.replace("Bermudas", "Bermuda");
327 word = word.replace("Tunesia", "Tunisia");
328 word = word.replace("Santos São Paulo", "São Paulo");
329 word = word.replace("Transvaal", "Northern Provinces");
330 word = word.replace("Tucumán", "Tucuman");
331
332
333 // unknownAreas.add("Baltic amber");
334 // unknownAreas.add("Arabia");
335
336 for (String stopWord : stopWords){
337 if (stopWord.equals(word)){
338 System.out.println(" STOP: " + word);
339 return "";
340 }
341 }
342 for (String unknownArea : unknownAreas){
343 if (unknownArea.equals(word)){
344 System.out.println(" UNKNOWN: " + word);
345 return "";
346 }
347 }
348 for (String higherArea : higherAreas){
349 if (higherArea.equals(word)){
350 return "";
351 }
352 }
353
354 //higher regions
355
356 return word;
357 }
358
359 private void initStopWords(){
360 stopWords.add("and");
361 stopWords.add("Is");
362 stopWords.add("Is.");
363 stopWords.add("Islands");
364 stopWords.add("Island");
365
366 stopWords.add("of");
367 stopWords.add("areas");
368 stopWords.add("USA");
369 stopWords.add("Australia"); //except for Australia only
370 stopWords.add("Argentina");
371
372 //unknownAreas.add("Panama");
373 unknownAreas.add("South Africa");
374 unknownAreas.add("Chile");
375
376 unknownAreas.add("Baltic amber");
377 unknownAreas.add("Arabia");
378
379
380 higherAreas.add("AF");
381 higherAreas.add("OR");
382 higherAreas.add("PA");
383 higherAreas.add("AU");
384 higherAreas.add("NE");
385
386 higherAreas.add("NT");
387 }
388
389
390 /**
391 * @param args
392 */
393 public static void main(String[] args) {
394 ICdmDataSource cdmDestination = CdmDestinations.cdm_test_andreasM2();
395 CdmApplicationController app = null;
396 try {
397 DbSchemaValidation val = DbSchemaValidation.UPDATE;
398 app = CdmApplicationController.NewInstance(cdmDestination, val);
399 } catch (DataSourceNotFoundException e) {
400 e.printStackTrace();
401 } catch (TermNotFoundException e) {
402 e.printStackTrace();
403 }
404 DipteraDistributionParser dipDist = new DipteraDistributionParser();
405 if (app != null){
406 dipDist.doDistribution(app);
407 }else{
408 logger.warn("No Application Context");
409 }
410 }
411 }