5ff08888b264ee1ba732dc59580580ad0f2d30a0
[cdmlib.git] / app-import / src / main / java / eu / etaxonomy / cdm / app / berlinModelImport / DipteraDistributionParser.java
1 /**
2 * Copyright (C) 2007 EDIT
3 * European Distributed Institute of Taxonomy
4 * http://www.e-taxonomy.eu
5 *
6 * The contents of this file are subject to the Mozilla Public License Version 1.1
7 * See LICENSE.TXT at the top of this package for the full license terms.
8 */
9
10 package eu.etaxonomy.cdm.app.berlinModelImport;
11
12 import java.util.ArrayList;
13 import java.util.HashSet;
14 import java.util.List;
15 import java.util.Set;
16 import java.util.regex.Pattern;
17
18 import org.apache.log4j.Logger;
19 import org.springframework.transaction.TransactionStatus;
20
21 import eu.etaxonomy.cdm.api.application.CdmApplicationController;
22 import eu.etaxonomy.cdm.app.common.CdmDestinations;
23 import eu.etaxonomy.cdm.database.DataSourceNotFoundException;
24 import eu.etaxonomy.cdm.database.DbSchemaValidation;
25 import eu.etaxonomy.cdm.database.ICdmDataSource;
26 import eu.etaxonomy.cdm.model.common.Language;
27 import eu.etaxonomy.cdm.model.common.init.TermNotFoundException;
28 import eu.etaxonomy.cdm.model.description.DescriptionBase;
29 import eu.etaxonomy.cdm.model.description.DescriptionElementBase;
30 import eu.etaxonomy.cdm.model.description.Distribution;
31 import eu.etaxonomy.cdm.model.description.Feature;
32 import eu.etaxonomy.cdm.model.description.PresenceAbsenceTermBase;
33 import eu.etaxonomy.cdm.model.description.PresenceTerm;
34 import eu.etaxonomy.cdm.model.description.TaxonDescription;
35 import eu.etaxonomy.cdm.model.description.TextData;
36 import eu.etaxonomy.cdm.model.location.NamedArea;
37 import eu.etaxonomy.cdm.model.location.TdwgArea;
38 import eu.etaxonomy.cdm.model.taxon.Taxon;
39 import eu.etaxonomy.cdm.model.taxon.TaxonBase;
40
41 /**
42 * @author a.mueller
43 * @created 17.10.2008
44 * @version 1.0
45 */
46 public class DipteraDistributionParser {
47 private static final Logger logger = Logger.getLogger(DipteraDistributionParser.class);
48
49 final static String epiSplitter = "(\\s+|\\[|\\]|\\(|\\))"; //( ' '+| '(' | ')'| '[' | ']' )
50 static Pattern pattern = null;
51
52 protected void doDistribution(CdmApplicationController app){
53 pattern = Pattern.compile(epiSplitter);
54 TransactionStatus txStatus = app.startTransaction();
55 List<TaxonBase> taxa = app.getTaxonService().getAllTaxonBases(1000000, 0);
56 for (TaxonBase taxon: taxa ){
57 if (taxon instanceof Taxon){
58 // unlazyDescription(app, (Taxon)taxon);
59 Set<TaxonDescription> descriptions = ((Taxon) taxon).getDescriptions();
60 for (DescriptionBase description: descriptions){
61 Set<DescriptionElementBase> descElements = new HashSet<DescriptionElementBase>();
62 descElements.addAll(description.getElements());
63
64 for (DescriptionElementBase descEl: descElements){
65 if (descEl.getFeature().equals(Feature.OCCURRENCE())){
66 if (descEl instanceof TextData){
67 String occString = ((TextData)descEl).getText(Language.ENGLISH());
68 parseOccurenceString(occString, description);
69 }
70 }
71 }
72 }
73 }
74 }
75 System.out.println("Unknowns: ");
76 for (String unknown: unrekognizedStrings){
77 System.out.println(unknown);
78 }
79 System.out.println("Distributions not recognized: " + countNot);
80 System.out.println("Distributions created: " + countYes);
81 app.commitTransaction(txStatus);
82 }
83
84 static Set<String> unrekognizedStrings = new HashSet<String>();
85 static int countNot = 0;
86 static int countYes = 0;
87
88 private void parseOccurenceString(String occString, DescriptionBase desc){
89 System.out.println(occString);
90 if (occString != null){
91 String[] words = pattern.split(occString);
92 int i = 0;
93 int countSkip = 0;
94 for (String word: words){
95 boolean isDoubtful = false;
96 if (countSkip > 0){
97 countSkip--;
98 }else if(word.contains("widesp") || word.equals("in")) {
99 //skip
100 }else if(word.trim().length() == 0){
101 //skip
102 }else{
103 if (word.endsWith(":") && word.length()<=4){
104 //Higher area
105 //TODO
106 }else{
107 word = word.trim();
108 if (word.contains("?")){
109 isDoubtful = true;
110 word = word.replace("?", "");
111 }
112 word = adaptWordsToTdwg(word);
113
114 if (! "".equals(word) && ! TdwgArea.isTdwgAreaLabel(word) && ! isDoubleArea(word)){
115 for (countSkip = 1; countSkip <= 6; countSkip++){
116 word = word.trim();
117 if (! TdwgArea.isTdwgAreaLabel(word) && ! isDoubleArea(word)){
118 if (words.length > i + countSkip){
119 word = word + " " + words[i + countSkip];
120 }
121 if (word.contains("?")){
122 isDoubtful = true;
123 word = word.replace("?", "");
124 }
125 word = adaptWordsToTdwg(word);
126 if ("".equals(word)){
127 break;
128 }
129 }else{
130 break;
131 }
132 }
133 }
134 if ("".equals(word)){
135 //countSkip = countSkip;
136 }else if (! TdwgArea.isTdwgAreaLabel(word) && ! isDoubleArea(word) ){
137 if (word.contains("?")){
138 logger.warn("XXX");
139 }
140 countNot++;
141 System.out.println(" False:" + countNot + ": " + word);
142 unrekognizedStrings.add(word);
143 countSkip = 0;
144 }else{
145 PresenceAbsenceTermBase<?> term = PresenceTerm.PRESENT();
146 if (isDoubleArea(word)){
147 NamedArea[] doubleArea = getDoubleArea(word);
148 for (NamedArea area : doubleArea){
149 Distribution distr = Distribution.NewInstance(area, term);
150 desc.addElement(distr);
151 }
152 }else{
153 NamedArea area = TdwgArea.getAreaByTdwgLabel(word);
154 if (isDoubtful){
155 term = PresenceTerm.INTRODUCED_PRESENCE_QUESTIONABLE();
156 }
157 Distribution distr = Distribution.NewInstance(area, term);
158 desc.addElement(distr);
159 }
160 countYes++;
161 System.out.println(" True:" + countYes + ": " + word);
162 countSkip--;
163 }
164 }
165 }
166 i++;
167 }
168 }
169 }
170
171 private boolean isDoubleArea(String word){
172 if ("Canary and Madeira Is.".equalsIgnoreCase(word) ||
173 "southern Europe".equalsIgnoreCase(word) ||
174 "former USSR: North and Central European territory".equalsIgnoreCase(word)
175 ){
176 return true;
177 }else{
178 return false;
179 }
180 }
181
182 private NamedArea[] getDoubleArea(String word){
183 NamedArea[] result = new NamedArea[2];
184 if ("Canary and Madeira Is.".equalsIgnoreCase(word)){
185 result[0] = TdwgArea.getAreaByTdwgAbbreviation("CNY");
186 result[1] = TdwgArea.getAreaByTdwgAbbreviation("MDR");
187 }else if ("southern Europe".equalsIgnoreCase(word)){
188 result[0] = TdwgArea.getAreaByTdwgAbbreviation("12");
189 result[1] = TdwgArea.getAreaByTdwgAbbreviation("13");
190 }else if ("former USSR: North and Central European territory".equalsIgnoreCase(word)){
191 result[0] = TdwgArea.getAreaByTdwgAbbreviation("RUN-OO");
192 result[1] = TdwgArea.getAreaByTdwgAbbreviation("RUC-OO");
193 }else{
194 logger.warn("Double area not recognized");
195 }
196 return result;
197 }
198
199
200 static List<String> stopWords = new ArrayList<String>();
201 static List<String> unknownAreas = new ArrayList<String>();
202 static List<String> higherAreas = new ArrayList<String>();
203
204 private String adaptWordsToTdwg(String word){
205 word = word.replace(",", "").replace(".", "").replace(";", "");
206 word = word.replace("Caronlina", "Carolina");
207
208 word = word.trim();
209 if (word.endsWith("Is")){
210 word = word + ".";
211 }
212 if (stopWords.size() == 0){
213 initStopWords();
214 }
215
216 word = word.replace("Russia [North European territory]", "North European Russia");
217 word = word.replace("Russia North European territory", "North European Russia");
218 word = word.replace("Russia: North European territory", "North European Russia");
219 word = word.replace("Russia: North European territory", "North European Russia");
220
221 word = word.replace("Amber", "amber");
222
223
224 word = word.replace("Prince Edward Is.", "Marion-Prince Edward Is.");
225 //or word = word.replace("Prince Edward Is.", "Prince Edward I.");
226 word = word.replace("Bahama Is.", "Bahamas");
227 word = word.replace("Comores Is.", "Comoros");
228 word = word.replace("former Yugoslavia", "Yugoslavia");
229 word = word.replace("former Czechoslovakia", "Czechoslovakia");
230 word = word.replace("Rhodesia", "Zimbabwe");
231 if (!word.contains("El Salvador")){
232 word = word.replace("Salvador", "El Salvador");
233 }
234 word = word.replace("Vera Cruz", "Veracruz");
235 word = word.replace("Turkmenia", "Turkmenistan");
236 word = word.replace("Quebec", "Québec");
237 //word = word.replace("Quebec", "Qu+®bec");
238 //word = word.replace("Quebec", "Qu├®bec");
239
240 word = word.replace("Gambia", "Gambia, The");
241 word = word.replace("Mariana Is.", "Marianas");
242 word = word.replace("Kenia", "Kenya");
243 word = word.replace("Central Africa", "Central African Republic");
244 word = word.replace("Canal Zone", "");
245 //word = word.replace("Panama", "Panamá");
246 word = word.replace("Panama", "Panamá");
247 if (! word.contains("New South Wales")){
248 word = word.replace("Wales", "Great Britain");
249 }
250 word = word.replace("Java", "Jawa");
251 word = word.replace("former USSR: North European territory", "North European Russia");
252 word = word.replace("former USSR: South European territory", "South European Russia");
253 word = word.replace("former USSR: Soviet Middle Asia", "Middle Asia");
254
255 word = word.replace("oceanian islands", "Pacific");
256 word = word.replace("Ussuri region", "Primorye");
257 word = word.replace("Galapagos Is.", "Galápagos");
258 //word = word.replace("Galapagos Is.", "Galápagos");
259 word = word.replace("Malaysia", "Peninsular Malaysia");
260 word = word.replace("Polynesic Is.", "South Solomons");
261
262 word = word.replace("Usbek SSR", "Uzbekistan");
263 word = word.replace("Mexican amber", "Mexico");
264 word = word.replace("Marocco", "Morocco");
265 word = word.replace("Trinidad", "Trinidad-Tobago");
266 word = word.replace("Haiti", "Haiti");
267 word = word.replace("Moluccas", "Maluku");
268 word = word.replace("Belau", "Palau");
269 word = word.replace("Dominican amber", "Dominican Republic");
270 word = word.replace("Far East", "Russian Far East");
271 word = word.replace("Tahiti", "Society Is.");
272
273
274 // unknownAreas.add("Baltic amber");
275 // unknownAreas.add("Arabia");
276
277 for (String stopWord : stopWords){
278 if (stopWord.equals(word)){
279 System.out.println(" STOP: " + word);
280 return "";
281 }
282 }
283 for (String unknownArea : unknownAreas){
284 if (unknownArea.equals(word)){
285 System.out.println(" UNKNOWN: " + word);
286 return "";
287 }
288 }
289 for (String higherArea : higherAreas){
290 if (higherArea.equals(word)){
291 return "";
292 }
293 }
294
295 //higher regions
296
297 return word;
298 }
299
300 private void initStopWords(){
301 stopWords.add("to");
302 stopWords.add("also");
303 stopWords.add("almost");
304 stopWords.add("and");
305 stopWords.add("cosmopolitan");
306 stopWords.add("s");
307 stopWords.add("Is");
308 stopWords.add("Is.");
309 stopWords.add("of");
310 stopWords.add("bordering areas");
311 stopWords.add("areas");
312 stopWords.add("USA");
313 stopWords.add("Australia"); // except for "widesp. in Australia" !!
314 stopWords.add("&");
315 stopWords.add("part");
316 stopWords.add("excl");
317 // stopWords.add("European territory"); //part of Russian distributions
318 stopWords.add("northern part");
319 stopWords.add("Distr:");
320
321 unknownAreas.add("Argentina");
322 //unknownAreas.add("Panama");
323 unknownAreas.add("South Africa");
324 unknownAreas.add("Indonesia");
325 unknownAreas.add("Chile");
326 // unknownAreas.add("Wales");
327 // unknownAreas.add("Java");
328 // unknownAreas.add("former USSR: North European territory");
329 // unknownAreas.add("former USSR: South European territory");
330 // unknownAreas.add("former USSR: Soviet Middle Asia");
331 // unknownAreas.add("former USSR: North and Central European territory");
332 // unknownAreas.add("oceanian islands");
333 // unknownAreas.add("Ussuri region");
334 // unknownAreas.add("Galapagos Is.");
335 // unknownAreas.add("Malaysia"); // Malaysia Peninsular exists (level 4)
336 unknownAreas.add("West Indies"); //-> as a whole
337 // unknownAreas.add("Canal Zone");
338 // unknownAreas.add("Polynesic Is.");
339 // unknownAreas.add("Usbek SSR");
340 // unknownAreas.add("Mexican amber");
341 // unknownAreas.add("southern Europe"); // ->Southeastern Europe, Southwestern Europe
342 // unknownAreas.add("Marocco");
343 // unknownAreas.add("Trinidad"); //-> Trinidad-Tobago
344 // unknownAreas.add("Haiti");
345 // unknownAreas.add("Moluccas"); //-> Indonesia
346 // unknownAreas.add("Belau");
347 unknownAreas.add("Baltic amber");
348 unknownAreas.add("Arabia");
349 // unknownAreas.add("Dominican amber");
350 // unknownAreas.add("Canary and Madeira Is."); //-> Canary Is. / Madeira
351 // unknownAreas.add("Dominican amber");
352 // unknownAreas.add("Far East");
353 // unknownAreas.add("Tahiti");
354
355 higherAreas.add("AF");
356 higherAreas.add("OR");
357 higherAreas.add("PA");
358 higherAreas.add("AU");
359 higherAreas.add("NE");
360
361 higherAreas.add("NT");
362 }
363
364
365 /**
366 * @param args
367 */
368 public static void main(String[] args) {
369 ICdmDataSource cdmDestination = CdmDestinations.localH2();
370 CdmApplicationController app = null;
371 try {
372 DbSchemaValidation val = DbSchemaValidation.UPDATE;
373 app = CdmApplicationController.NewInstance(cdmDestination, val);
374 } catch (DataSourceNotFoundException e) {
375 e.printStackTrace();
376 } catch (TermNotFoundException e) {
377 e.printStackTrace();
378 }
379 DipteraDistributionParser dipDist = new DipteraDistributionParser();
380 if (app != null){
381 dipDist.doDistribution(app);
382 }else{
383 logger.warn("No Application Context");
384 }
385 }
386 }