latest DwcA improvements
[cdmlib-apps.git] / app-import / src / main / java / eu / etaxonomy / cdm / app / wp6 / diptera / DipteraDistributionParser.java
1 /**
2 * Copyright (C) 2007 EDIT
3 * European Distributed Institute of Taxonomy
4 * http://www.e-taxonomy.eu
5 *
6 * The contents of this file are subject to the Mozilla Public License Version 1.1
7 * See LICENSE.TXT at the top of this package for the full license terms.
8 */
9
10 /**
11 * Copyright (C) 2007 EDIT
12 * European Distributed Institute of Taxonomy
13 * http://www.e-taxonomy.eu
14 *
15 * The contents of this file are subject to the Mozilla Public License Version 1.1
16 * See LICENSE.TXT at the top of this package for the full license terms.
17 */
18 package eu.etaxonomy.cdm.app.wp6.diptera;
19
20 import java.util.ArrayList;
21 import java.util.HashSet;
22 import java.util.List;
23 import java.util.Set;
24 import java.util.regex.Pattern;
25
26 import org.apache.log4j.Logger;
27 import org.springframework.transaction.TransactionStatus;
28
29 import eu.etaxonomy.cdm.api.application.CdmApplicationController;
30 import eu.etaxonomy.cdm.app.common.CdmDestinations;
31 import eu.etaxonomy.cdm.database.DbSchemaValidation;
32 import eu.etaxonomy.cdm.database.ICdmDataSource;
33 import eu.etaxonomy.cdm.model.common.Language;
34 import eu.etaxonomy.cdm.model.description.DescriptionBase;
35 import eu.etaxonomy.cdm.model.description.DescriptionElementBase;
36 import eu.etaxonomy.cdm.model.description.Distribution;
37 import eu.etaxonomy.cdm.model.description.Feature;
38 import eu.etaxonomy.cdm.model.description.PresenceAbsenceTermBase;
39 import eu.etaxonomy.cdm.model.description.PresenceTerm;
40 import eu.etaxonomy.cdm.model.description.TaxonDescription;
41 import eu.etaxonomy.cdm.model.description.TextData;
42 import eu.etaxonomy.cdm.model.location.NamedArea;
43 import eu.etaxonomy.cdm.model.location.TdwgArea;
44 import eu.etaxonomy.cdm.model.taxon.Taxon;
45 import eu.etaxonomy.cdm.model.taxon.TaxonBase;
46
47 /**
48 * @author a.mueller
49 * @created 17.10.2008
50 * @version 1.0
51 */
52 public class DipteraDistributionParser {
53 private static final Logger logger = Logger.getLogger(DipteraDistributionParser.class);
54
55 private static ICdmDataSource cdmDestination = CdmDestinations.cdm_edit_diptera_a();
56
57 final static String epiSplitter = "(\\s+|\\[|\\]|\\(|\\))"; //( ' '+| '(' | ')'| '[' | ']' )
58 static Pattern pattern = null;
59
60 protected void doDistribution(CdmApplicationController app){
61 pattern = Pattern.compile(epiSplitter);
62 TransactionStatus txStatus = app.startTransaction();
63 List<TaxonBase> taxa = app.getTaxonService().list(null, null, null, null, null);
64 for (TaxonBase taxon: taxa ){
65 if (taxon instanceof Taxon){
66 // unlazyDescription(app, (Taxon)taxon);
67 Set<TaxonDescription> descriptions = ((Taxon) taxon).getDescriptions();
68 for (DescriptionBase description: descriptions){
69 Set<DescriptionElementBase> descElements = new HashSet<DescriptionElementBase>();
70 descElements.addAll(description.getElements());
71
72 for (DescriptionElementBase descEl: descElements){
73 if (descEl.getFeature().equals(Feature.OCCURRENCE())){
74 if (descEl instanceof TextData){
75 String occString = ((TextData)descEl).getText(Language.ENGLISH());
76 parseOccurenceString(occString, description);
77 //app.getTaxonService().saveTaxon(taxon);
78 }
79 }
80 }
81 }
82 }
83 }
84 System.out.println("Unknowns: ");
85 for (String unknown: unrekognizedStrings){
86 System.out.println(unknown);
87 }
88 System.out.println("Distributions not recognized: " + countNot);
89 System.out.println("Distributions created: " + countYes);
90 app.commitTransaction(txStatus);
91 }
92
93 static Set<String> unrekognizedStrings = new HashSet<String>();
94 static int countNot = 0;
95 static int countYes = 0;
96
97 private void parseOccurenceString(String occString, DescriptionBase desc){
98 System.out.println(occString);
99 if (occString != null){
100 String[] words = pattern.split(occString);
101 int i = 0;
102 int countSkip = 0;
103 for (String word: words){
104 if (word.contains("U.S.A")){
105 logger.warn("U.S.A.");
106 }
107 boolean isDoubtful = false;
108 if (countSkip > 0){
109 countSkip--;
110 }else if(word.trim().length() == 0){
111 //skip
112 }else{
113 if (word.endsWith(":") && word.length()<=4){
114 //Higher area
115 //TODO
116 }else{
117 word = word.trim();
118 if (word.contains("?")){
119 isDoubtful = true;
120 word = word.replace("?", "");
121 }
122 word = adaptWordsToTdwg(word);
123
124 if (! "".equals(word) && ! TdwgArea.isTdwgAreaLabel(word) && ! TdwgArea.isTdwgAreaAbbreviation(word) && ! isDoubleArea(word)){
125 for (countSkip = 1; countSkip <= 6; countSkip++){
126 word = word.trim();
127 if (! TdwgArea.isTdwgAreaLabel(word) && ! TdwgArea.isTdwgAreaAbbreviation(word) && ! isDoubleArea(word)){
128 if (words.length > i + countSkip){
129 word = word + " " + words[i + countSkip];
130 }
131 if (word.contains("?")){
132 isDoubtful = true;
133 word = word.replace("?", "");
134 }
135 word = adaptWordsToTdwg(word);
136 if ("".equals(word)){
137 break;
138 }
139 }else{
140 break;
141 }
142 }
143 }
144 if ("".equals(word)){
145 //countSkip = countSkip;
146 }else if (! TdwgArea.isTdwgAreaLabel(word) && ! TdwgArea.isTdwgAreaAbbreviation(word) && ! isDoubleArea(word) ){
147 if (word.contains("?")){
148 logger.warn("XXX");
149 }
150 countNot++;
151 System.out.println(" False:" + countNot + ": " + word);
152 unrekognizedStrings.add(word);
153 countSkip = 0;
154 }else{
155 if (word.equals("Netherlands")){
156 if ( countSkip < 0 && words[i + 1].startsWith("Antilles")){
157 word = "Netherlands Antilles";
158 countSkip=2;
159 }
160 }
161 PresenceAbsenceTermBase<?> term = PresenceTerm.PRESENT();
162 if (isDoubleArea(word)){
163 NamedArea[] doubleArea = getDoubleArea(word);
164 for (NamedArea area : doubleArea){
165 Distribution distr = Distribution.NewInstance(area, term);
166 desc.addElement(distr);
167 }
168 }else{
169 NamedArea area;
170 if (TdwgArea.isTdwgAreaLabel(word)){
171 area = TdwgArea.getAreaByTdwgLabel(word);
172 }else{
173 area = TdwgArea.getAreaByTdwgAbbreviation(word);
174 }
175 if (isDoubtful){
176 term = PresenceTerm.INTRODUCED_PRESENCE_QUESTIONABLE();
177 }
178 Distribution distr = Distribution.NewInstance(area, term);
179 desc.addElement(distr);
180 }
181 countYes++;
182 System.out.println(" True:" + countYes + ": " + word);
183 countSkip--;
184 }
185 }
186 }
187 i++;
188 }
189 }
190 }
191
192 private boolean isDoubleArea(String word){
193 if ("Canary and Madeira Is.".equalsIgnoreCase(word) ||
194 "southern Europe".equalsIgnoreCase(word) ||
195 "former USSR: North and Central European territory".equalsIgnoreCase(word)
196 ){
197 return true;
198 }else{
199 return false;
200 }
201 }
202
203 private NamedArea[] getDoubleArea(String word){
204 NamedArea[] result = new NamedArea[2];
205 if ("Canary and Madeira Is.".equalsIgnoreCase(word)){
206 result[0] = TdwgArea.getAreaByTdwgAbbreviation("CNY");
207 result[1] = TdwgArea.getAreaByTdwgAbbreviation("MDR");
208 }else if ("southern Europe".equalsIgnoreCase(word)){
209 result[0] = TdwgArea.getAreaByTdwgAbbreviation("12");
210 result[1] = TdwgArea.getAreaByTdwgAbbreviation("13");
211 }else if ("former USSR: North and Central European territory".equalsIgnoreCase(word)){
212 result[0] = TdwgArea.getAreaByTdwgAbbreviation("RUN-OO");
213 result[1] = TdwgArea.getAreaByTdwgAbbreviation("RUC-OO");
214 }else{
215 logger.warn("Double area not recognized");
216 }
217 return result;
218 }
219
220
221 static List<String> stopWords = new ArrayList<String>();
222 static List<String> unknownAreas = new ArrayList<String>();
223 static List<String> higherAreas = new ArrayList<String>();
224
225 private String adaptWordsToTdwg(String word){
226 word = word.replace(",", "").replace(";", "");
227 if (! word.contains("U.S.A")){
228 word = word.replace(",", "").replace(".", "").replace(";", "");
229 }else{
230 word = word.replace(",", "").replace(";", "");
231 }
232
233 word = word.trim();
234 if (word.endsWith("Is")){
235 word = word + ".";
236 }
237 if (stopWords.size() == 0){
238 initStopWords();
239 }
240
241 word = word.replace("Russia [North European territory]", "North European Russia");
242 word = word.replace("Russia North European territory", "North European Russia");
243 word = word.replace("Russia: North European territory", "North European Russia");
244 word = word.replace("Russia: North European territory", "North European Russia");
245
246 word = word.replace("Amber", "amber");
247
248
249 word = word.replace("Prince Edward Is.", "Marion-Prince Edward Is.");
250 //or word = word.replace("Prince Edward Is.", "Prince Edward I.");
251 word = word.replace("Bahama Is.", "Bahamas");
252 word = word.replace("Comores Is.", "Comoros");
253 word = word.replace("former Yugoslavia", "Yugoslavia");
254 word = word.replace("former Czechoslovakia", "Czechoslovakia");
255 word = word.replace("Rhodesia", "Zimbabwe");
256 word = word.replace("The Gambia", "Gambia, The");
257
258 if (!word.contains("El Salvador")){
259 word = word.replace("Salvador", "El Salvador");
260 }
261 word = word.replace("Vera Cruz", "Veracruz");
262 word = word.replace("Turkmenia", "Turkmenistan");
263 word = word.replace("Qu\u00E9beck", "Qu\u00E9bec");
264 word = word.replace("Quebeck", "Qu\u00E9bec");
265 word = word.replace("Quebec", "Qu\u00E9bec");
266
267 if (!word.contains("Gambia, The")){
268 word = word.replace("Gambia", "Gambia, The");
269 }
270 word = word.replace("Mariana Is.", "Marianas");
271 word = word.replace("Kenia", "Kenya");
272 word = word.replace("Central Africa", "Central African Republic");
273 word = word.replace("Canal Zone", "");
274 //word = word.replace("Panama", "Panamá");
275 word = word.replace("Panama", "Panam\u00E1");
276 if (! word.contains("New South Wales")){
277 word = word.replace("Wales", "Great Britain");
278 }
279 word = word.replace("Java", "Jawa");
280 word = word.replace("former USSR: North European territory", "North European Russia");
281 word = word.replace("former USSR: South European territory", "South European Russia");
282 word = word.replace("former USSR: Soviet Middle Asia", "Middle Asia");
283
284 word = word.replace("St Kitts-Nevis", "St.Kitts-Nevis");
285
286 word = word.replace("oceanian islands", "Pacific");
287 word = word.replace("Ussuri region", "Primorye");
288 word = word.replace("Galapagos Is.", "Gal\u00E1pagos");
289 word = word.replace("Tarapac\u00E1", "Tarapaca");
290 word = word.replace("Reunion", "R\u00E9union");
291 if (! word.contains("Is.")){
292 word = word.replace("Galapagos", "Gal\u00E1pagos");
293 }
294
295 //word = word.replace("Galapagos Is.", "Galápagos");
296 if (! word.contains("Peninsular")){
297 word = word.replace("Malaysia", "Peninsular Malaysia");
298 }
299 word = word.replace("Polynesic Is.", "South Solomons");
300
301 word = word.replace("Usbek SSR", "Uzbekistan");
302 word = word.replace("Mexican amber", "Mexico");
303 word = word.replace("Marocco", "Morocco");
304 if (! word.contains("Tobago")){
305 word = word.replace("Trinidad", "Trinidad-Tobago");
306 }
307 if (! word.contains("Trinidad")){
308 word = word.replace("Tobago", "Trinidad-Tobago");
309 }
310 word = word.replace("Haiti", "Haiti");
311 word = word.replace("Moluccas", "Maluku");
312 word = word.replace("Belau", "Palau");
313 word = word.replace("Dominican amber", "Dominican Republic");
314 if (! word.contains("Russian")){
315 word = word.replace("Far East", "Russian Far East");
316 }
317 word = word.replace("Tahiti", "Society Is.");
318 word = word.replace("Iraque", "Iraq");
319 word = word.replace("Wake Island", "Wake I.");
320 if (! word.contains("I.")){
321 word = word.replace("Johnston I", "Johnston I.");
322 word = word.replace("Wake I", "Wake I.");
323 word = word.replace("Clipperton I", "Clipperton I.");
324 }
325 if (! word.contains("Provinces")){
326 word = word.replace("Cape Province", "Cape Provinces");
327 }
328 word = word.replace("Eastern Cape Provinces", "Eastern Cape Province");
329 word = word.replace("Western Cape Provinces", "Western Cape Province");
330 if (! word.contains("Barbuda")){
331 word = word.replace("Antigua", "Antigua-Barbuda");
332 }
333 if (! word.contains("St.")){
334 word = word.replace("St Vincent", "St.Vincent");
335 word = word.replace("St Lucia", "St.Lucia");
336 word = word.replace("St Helena", "St.Helena");
337 }
338 word = word.replace("Asia-tropical", "Asia-Tropical");
339 word = word.replace("Society Islands", "Society Is.");
340 word = word.replace("Virgin Islands", "Virgin Is.");
341 word = word.replace("Canary Islands", "Canary Is.");
342 word = word.replace("Rhode Island", "Rhode I.");
343
344
345 word = word.replace("Rodriguez", "Rodrigues");
346 word = word.replace("British Colombia", "British Columbia");
347 word = word.replace("Bermudas", "Bermuda");
348 word = word.replace("Tunesia", "Tunisia");
349 word = word.replace("Santos S\u00E3o Paulo", "S\u00E3o Paulo");
350 word = word.replace("Transvaal", "Northern Provinces");
351 word = word.replace("Tucum\u00E1n", "Tucuman");
352 // if (!word.contains("Netherlands")){
353 //
354 // }
355
356 // unknownAreas.add("Baltic amber");
357 // unknownAreas.add("Arabia");
358
359 for (String stopWord : stopWords){
360 if (stopWord.equals(word)){
361 System.out.println(" STOP: " + word);
362 return "";
363 }
364 }
365 for (String unknownArea : unknownAreas){
366 if (unknownArea.equals(word)){
367 System.out.println(" UNKNOWN: " + word);
368 return "";
369 }
370 }
371 for (String higherArea : higherAreas){
372 if (higherArea.equals(word)){
373 return "";
374 }
375 }
376
377 //higher regions
378
379 return word;
380 }
381
382 private void initStopWords(){
383 stopWords.add("and");
384 stopWords.add("Is");
385 stopWords.add("Is.");
386 stopWords.add("Islands");
387 stopWords.add("Island");
388
389 stopWords.add("of");
390 stopWords.add("areas");
391 stopWords.add("USA");
392 stopWords.add("Australia"); //except for Australia only
393 stopWords.add("Argentina");
394
395 //unknownAreas.add("Panama");
396 unknownAreas.add("South Africa");
397 unknownAreas.add("Chile");
398
399 unknownAreas.add("Baltic amber");
400 unknownAreas.add("Arabia");
401
402
403 higherAreas.add("AF");
404 higherAreas.add("OR");
405 higherAreas.add("PA");
406 higherAreas.add("AU");
407 higherAreas.add("NE");
408
409 higherAreas.add("NT");
410 }
411
412
413 /**
414 * @param args
415 */
416 public static void main(String[] args) {
417 CdmApplicationController app = null;
418 DbSchemaValidation val = DbSchemaValidation.UPDATE;
419 app = CdmApplicationController.NewInstance(cdmDestination, val);
420
421 DipteraDistributionParser dipDist = new DipteraDistributionParser();
422 if (app != null){
423 dipDist.doDistribution(app);
424 }else{
425 logger.warn("No Application Context");
426 }
427 }
428 }