cleanup and add AizoaceaeIdentifierActivator
[cdmlib-apps.git] / app-import / src / main / java / eu / etaxonomy / cdm / app / wp6 / diptera / DipteraDistributionParser.java
1 /**
2 * Copyright (C) 2007 EDIT
3 * European Distributed Institute of Taxonomy
4 * http://www.e-taxonomy.eu
5 *
6 * The contents of this file are subject to the Mozilla Public License Version 1.1
7 * See LICENSE.TXT at the top of this package for the full license terms.
8 */
9
10 /**
11 * Copyright (C) 2007 EDIT
12 * European Distributed Institute of Taxonomy
13 * http://www.e-taxonomy.eu
14 *
15 * The contents of this file are subject to the Mozilla Public License Version 1.1
16 * See LICENSE.TXT at the top of this package for the full license terms.
17 */
18 package eu.etaxonomy.cdm.app.wp6.diptera;
19
20 import java.util.ArrayList;
21 import java.util.HashSet;
22 import java.util.List;
23 import java.util.Set;
24 import java.util.regex.Pattern;
25
26 import org.apache.log4j.Logger;
27 import org.springframework.transaction.TransactionStatus;
28
29 import eu.etaxonomy.cdm.api.application.CdmApplicationController;
30 import eu.etaxonomy.cdm.api.application.ICdmRepository;
31 import eu.etaxonomy.cdm.app.common.CdmDestinations;
32 import eu.etaxonomy.cdm.database.DbSchemaValidation;
33 import eu.etaxonomy.cdm.database.ICdmDataSource;
34 import eu.etaxonomy.cdm.io.common.TdwgAreaProvider;
35 import eu.etaxonomy.cdm.model.common.Language;
36 import eu.etaxonomy.cdm.model.description.DescriptionBase;
37 import eu.etaxonomy.cdm.model.description.DescriptionElementBase;
38 import eu.etaxonomy.cdm.model.description.Distribution;
39 import eu.etaxonomy.cdm.model.description.Feature;
40 import eu.etaxonomy.cdm.model.description.PresenceAbsenceTerm;
41 import eu.etaxonomy.cdm.model.description.TaxonDescription;
42 import eu.etaxonomy.cdm.model.description.TextData;
43 import eu.etaxonomy.cdm.model.location.NamedArea;
44 import eu.etaxonomy.cdm.model.taxon.Taxon;
45 import eu.etaxonomy.cdm.model.taxon.TaxonBase;
46
47 /**
48 * @author a.mueller
49 * @since 17.10.2008
50 */
51 public class DipteraDistributionParser {
52 private static final Logger logger = Logger.getLogger(DipteraDistributionParser.class);
53
54 private static ICdmDataSource cdmDestination = CdmDestinations.localH2();
55
56 final static String epiSplitter = "(\\s+|\\[|\\]|\\(|\\))"; //( ' '+| '(' | ')'| '[' | ']' )
57 static Pattern pattern = null;
58
59 protected void doDistribution(ICdmRepository app){
60 pattern = Pattern.compile(epiSplitter);
61 TransactionStatus txStatus = app.startTransaction();
62 List<TaxonBase> taxa = app.getTaxonService().list(null, null, null, null, null);
63 for (TaxonBase taxon: taxa ){
64 if (taxon instanceof Taxon){
65 // unlazyDescription(app, (Taxon)taxon);
66 Set<TaxonDescription> descriptions = ((Taxon) taxon).getDescriptions();
67 for (DescriptionBase description: descriptions){
68 Set<DescriptionElementBase> descElements = new HashSet<DescriptionElementBase>();
69 descElements.addAll(description.getElements());
70
71 for (DescriptionElementBase descEl: descElements){
72 if (descEl.getFeature().equals(Feature.OCCURRENCE())){
73 if (descEl instanceof TextData){
74 String occString = ((TextData)descEl).getText(Language.ENGLISH());
75 parseOccurenceString(occString, description);
76 //app.getTaxonService().saveTaxon(taxon);
77 }
78 }
79 }
80 }
81 }
82 }
83 System.out.println("Unknowns: ");
84 for (String unknown: unrekognizedStrings){
85 System.out.println(unknown);
86 }
87 System.out.println("Distributions not recognized: " + countNot);
88 System.out.println("Distributions created: " + countYes);
89 app.commitTransaction(txStatus);
90 }
91
92 static Set<String> unrekognizedStrings = new HashSet<>();
93 static int countNot = 0;
94 static int countYes = 0;
95
96 private void parseOccurenceString(String occString, DescriptionBase desc){
97 System.out.println(occString);
98 if (occString != null){
99 String[] words = pattern.split(occString);
100 int i = 0;
101 int countSkip = 0;
102 for (String word: words){
103 if (word.contains("U.S.A")){
104 logger.warn("U.S.A.");
105 }
106 boolean isDoubtful = false;
107 if (countSkip > 0){
108 countSkip--;
109 }else if(word.trim().length() == 0){
110 //skip
111 }else{
112 if (word.endsWith(":") && word.length()<=4){
113 //Higher area
114 //TODO
115 }else{
116 word = word.trim();
117 if (word.contains("?")){
118 isDoubtful = true;
119 word = word.replace("?", "");
120 }
121 word = adaptWordsToTdwg(word);
122
123 if (! "".equals(word) && ! TdwgAreaProvider.isTdwgAreaLabel(word) && ! TdwgAreaProvider.isTdwgAreaAbbreviation(word) && ! isDoubleArea(word)){
124 for (countSkip = 1; countSkip <= 6; countSkip++){
125 word = word.trim();
126 if (! TdwgAreaProvider.isTdwgAreaLabel(word) && ! TdwgAreaProvider.isTdwgAreaAbbreviation(word) && ! isDoubleArea(word)){
127 if (words.length > i + countSkip){
128 word = word + " " + words[i + countSkip];
129 }
130 if (word.contains("?")){
131 isDoubtful = true;
132 word = word.replace("?", "");
133 }
134 word = adaptWordsToTdwg(word);
135 if ("".equals(word)){
136 break;
137 }
138 }else{
139 break;
140 }
141 }
142 }
143 if ("".equals(word)){
144 //countSkip = countSkip;
145 }else if (! TdwgAreaProvider.isTdwgAreaLabel(word) && ! TdwgAreaProvider.isTdwgAreaAbbreviation(word) && ! isDoubleArea(word) ){
146 if (word.contains("?")){
147 logger.warn("XXX");
148 }
149 countNot++;
150 System.out.println(" False:" + countNot + ": " + word);
151 unrekognizedStrings.add(word);
152 countSkip = 0;
153 }else{
154 if (word.equals("Netherlands")){
155 if ( countSkip < 0 && words[i + 1].startsWith("Antilles")){
156 word = "Netherlands Antilles";
157 countSkip=2;
158 }
159 }
160 PresenceAbsenceTerm term = PresenceAbsenceTerm.PRESENT();
161 if (isDoubleArea(word)){
162 NamedArea[] doubleArea = getDoubleArea(word);
163 for (NamedArea area : doubleArea){
164 Distribution distr = Distribution.NewInstance(area, term);
165 desc.addElement(distr);
166 }
167 }else{
168 NamedArea area;
169 if (TdwgAreaProvider.isTdwgAreaLabel(word)){
170 area = TdwgAreaProvider.getAreaByTdwgLabel(word);
171 }else{
172 area = TdwgAreaProvider.getAreaByTdwgAbbreviation(word);
173 }
174 if (isDoubtful){
175 term = PresenceAbsenceTerm.INTRODUCED_PRESENCE_QUESTIONABLE();
176 }
177 Distribution distr = Distribution.NewInstance(area, term);
178 desc.addElement(distr);
179 }
180 countYes++;
181 System.out.println(" True:" + countYes + ": " + word);
182 countSkip--;
183 }
184 }
185 }
186 i++;
187 }
188 }
189 }
190
191 private boolean isDoubleArea(String word){
192 if ("Canary and Madeira Is.".equalsIgnoreCase(word) ||
193 "southern Europe".equalsIgnoreCase(word) ||
194 "former USSR: North and Central European territory".equalsIgnoreCase(word)
195 ){
196 return true;
197 }else{
198 return false;
199 }
200 }
201
202 private NamedArea[] getDoubleArea(String word){
203 NamedArea[] result = new NamedArea[2];
204 if ("Canary and Madeira Is.".equalsIgnoreCase(word)){
205 result[0] = TdwgAreaProvider.getAreaByTdwgAbbreviation("CNY");
206 result[1] = TdwgAreaProvider.getAreaByTdwgAbbreviation("MDR");
207 }else if ("southern Europe".equalsIgnoreCase(word)){
208 result[0] = TdwgAreaProvider.getAreaByTdwgAbbreviation("12");
209 result[1] = TdwgAreaProvider.getAreaByTdwgAbbreviation("13");
210 }else if ("former USSR: North and Central European territory".equalsIgnoreCase(word)){
211 result[0] = TdwgAreaProvider.getAreaByTdwgAbbreviation("RUN-OO");
212 result[1] = TdwgAreaProvider.getAreaByTdwgAbbreviation("RUC-OO");
213 }else{
214 logger.warn("Double area not recognized");
215 }
216 return result;
217 }
218
219
220 static List<String> stopWords = new ArrayList<>();
221 static List<String> unknownAreas = new ArrayList<>();
222 static List<String> higherAreas = new ArrayList<>();
223
224 private String adaptWordsToTdwg(String word){
225 word = word.replace(",", "").replace(";", "");
226 if (! word.contains("U.S.A")){
227 word = word.replace(",", "").replace(".", "").replace(";", "");
228 }else{
229 word = word.replace(",", "").replace(";", "");
230 }
231
232 word = word.trim();
233 if (word.endsWith("Is")){
234 word = word + ".";
235 }
236 if (stopWords.size() == 0){
237 initStopWords();
238 }
239
240 word = word.replace("Russia [North European territory]", "North European Russia");
241 word = word.replace("Russia North European territory", "North European Russia");
242 word = word.replace("Russia: North European territory", "North European Russia");
243 word = word.replace("Russia: North European territory", "North European Russia");
244
245 word = word.replace("Amber", "amber");
246
247
248 word = word.replace("Prince Edward Is.", "Marion-Prince Edward Is.");
249 //or word = word.replace("Prince Edward Is.", "Prince Edward I.");
250 word = word.replace("Bahama Is.", "Bahamas");
251 word = word.replace("Comores Is.", "Comoros");
252 word = word.replace("former Yugoslavia", "Yugoslavia");
253 word = word.replace("former Czechoslovakia", "Czechoslovakia");
254 word = word.replace("Rhodesia", "Zimbabwe");
255 word = word.replace("The Gambia", "Gambia, The");
256
257 if (!word.contains("El Salvador")){
258 word = word.replace("Salvador", "El Salvador");
259 }
260 word = word.replace("Vera Cruz", "Veracruz");
261 word = word.replace("Turkmenia", "Turkmenistan");
262 word = word.replace("Qu\u00E9beck", "Qu\u00E9bec");
263 word = word.replace("Quebeck", "Qu\u00E9bec");
264 word = word.replace("Quebec", "Qu\u00E9bec");
265
266 if (!word.contains("Gambia, The")){
267 word = word.replace("Gambia", "Gambia, The");
268 }
269 word = word.replace("Mariana Is.", "Marianas");
270 word = word.replace("Kenia", "Kenya");
271 word = word.replace("Central Africa", "Central African Republic");
272 word = word.replace("Canal Zone", "");
273 //word = word.replace("Panama", "Panamá");
274 word = word.replace("Panama", "Panam\u00E1");
275 if (! word.contains("New South Wales")){
276 word = word.replace("Wales", "Great Britain");
277 }
278 word = word.replace("Java", "Jawa");
279 word = word.replace("former USSR: North European territory", "North European Russia");
280 word = word.replace("former USSR: South European territory", "South European Russia");
281 word = word.replace("former USSR: Soviet Middle Asia", "Middle Asia");
282
283 word = word.replace("St Kitts-Nevis", "St.Kitts-Nevis");
284
285 word = word.replace("oceanian islands", "Pacific");
286 word = word.replace("Ussuri region", "Primorye");
287 word = word.replace("Galapagos Is.", "Gal\u00E1pagos");
288 word = word.replace("Tarapac\u00E1", "Tarapaca");
289 word = word.replace("Reunion", "R\u00E9union");
290 if (! word.contains("Is.")){
291 word = word.replace("Galapagos", "Gal\u00E1pagos");
292 }
293
294 //word = word.replace("Galapagos Is.", "Galápagos");
295 if (! word.contains("Peninsular")){
296 word = word.replace("Malaysia", "Peninsular Malaysia");
297 }
298 word = word.replace("Polynesic Is.", "South Solomons");
299
300 word = word.replace("Usbek SSR", "Uzbekistan");
301 word = word.replace("Mexican amber", "Mexico");
302 word = word.replace("Marocco", "Morocco");
303 if (! word.contains("Tobago")){
304 word = word.replace("Trinidad", "Trinidad-Tobago");
305 }
306 if (! word.contains("Trinidad")){
307 word = word.replace("Tobago", "Trinidad-Tobago");
308 }
309 word = word.replace("Haiti", "Haiti");
310 word = word.replace("Moluccas", "Maluku");
311 word = word.replace("Belau", "Palau");
312 word = word.replace("Dominican amber", "Dominican Republic");
313 if (! word.contains("Russian")){
314 word = word.replace("Far East", "Russian Far East");
315 }
316 word = word.replace("Tahiti", "Society Is.");
317 word = word.replace("Iraque", "Iraq");
318 word = word.replace("Wake Island", "Wake I.");
319 if (! word.contains("I.")){
320 word = word.replace("Johnston I", "Johnston I.");
321 word = word.replace("Wake I", "Wake I.");
322 word = word.replace("Clipperton I", "Clipperton I.");
323 }
324 if (! word.contains("Provinces")){
325 word = word.replace("Cape Province", "Cape Provinces");
326 }
327 word = word.replace("Eastern Cape Provinces", "Eastern Cape Province");
328 word = word.replace("Western Cape Provinces", "Western Cape Province");
329 if (! word.contains("Barbuda")){
330 word = word.replace("Antigua", "Antigua-Barbuda");
331 }
332 if (! word.contains("St.")){
333 word = word.replace("St Vincent", "St.Vincent");
334 word = word.replace("St Lucia", "St.Lucia");
335 word = word.replace("St Helena", "St.Helena");
336 }
337 word = word.replace("Asia-tropical", "Asia-Tropical");
338 word = word.replace("Society Islands", "Society Is.");
339 word = word.replace("Virgin Islands", "Virgin Is.");
340 word = word.replace("Canary Islands", "Canary Is.");
341 word = word.replace("Rhode Island", "Rhode I.");
342
343
344 word = word.replace("Rodriguez", "Rodrigues");
345 word = word.replace("British Colombia", "British Columbia");
346 word = word.replace("Bermudas", "Bermuda");
347 word = word.replace("Tunesia", "Tunisia");
348 word = word.replace("Santos S\u00E3o Paulo", "S\u00E3o Paulo");
349 word = word.replace("Transvaal", "Northern Provinces");
350 word = word.replace("Tucum\u00E1n", "Tucuman");
351 // if (!word.contains("Netherlands")){
352 //
353 // }
354
355 // unknownAreas.add("Baltic amber");
356 // unknownAreas.add("Arabia");
357
358 for (String stopWord : stopWords){
359 if (stopWord.equals(word)){
360 System.out.println(" STOP: " + word);
361 return "";
362 }
363 }
364 for (String unknownArea : unknownAreas){
365 if (unknownArea.equals(word)){
366 System.out.println(" UNKNOWN: " + word);
367 return "";
368 }
369 }
370 for (String higherArea : higherAreas){
371 if (higherArea.equals(word)){
372 return "";
373 }
374 }
375
376 //higher regions
377
378 return word;
379 }
380
381 private void initStopWords(){
382 stopWords.add("and");
383 stopWords.add("Is");
384 stopWords.add("Is.");
385 stopWords.add("Islands");
386 stopWords.add("Island");
387
388 stopWords.add("of");
389 stopWords.add("areas");
390 stopWords.add("USA");
391 stopWords.add("Australia"); //except for Australia only
392 stopWords.add("Argentina");
393
394 //unknownAreas.add("Panama");
395 unknownAreas.add("South Africa");
396 unknownAreas.add("Chile");
397
398 unknownAreas.add("Baltic amber");
399 unknownAreas.add("Arabia");
400
401
402 higherAreas.add("AF");
403 higherAreas.add("OR");
404 higherAreas.add("PA");
405 higherAreas.add("AU");
406 higherAreas.add("NE");
407
408 higherAreas.add("NT");
409 }
410
411 public static void main(String[] args) {
412 CdmApplicationController app = null;
413 DbSchemaValidation val = DbSchemaValidation.UPDATE;
414 app = CdmApplicationController.NewInstance(cdmDestination, val);
415
416 DipteraDistributionParser dipDist = new DipteraDistributionParser();
417 if (app != null){
418 dipDist.doDistribution(app);
419 }else{
420 logger.warn("No Application Context");
421 }
422 }
423 }