/**
 * Copyright (C) 2007 EDIT
 * European Distributed Institute of Taxonomy
 * http://www.e-taxonomy.eu
 *
 * The contents of this file are subject to the Mozilla Public License Version 1.1
 * See LICENSE.TXT at the top of this package for the full license terms.
 */
/**
 * Copyright (C) 2007 EDIT
 * European Distributed Institute of Taxonomy
 * http://www.e-taxonomy.eu
 *
 * The contents of this file are subject to the Mozilla Public License Version 1.1
 * See LICENSE.TXT at the top of this package for the full license terms.
 */
18 package eu
.etaxonomy
.cdm
.app
.wp6
.diptera
;
20 import java
.util
.ArrayList
;
21 import java
.util
.HashSet
;
22 import java
.util
.List
;
24 import java
.util
.regex
.Pattern
;
26 import org
.apache
.log4j
.Logger
;
27 import org
.springframework
.transaction
.TransactionStatus
;
29 import eu
.etaxonomy
.cdm
.api
.application
.CdmApplicationController
;
30 import eu
.etaxonomy
.cdm
.api
.application
.ICdmRepository
;
31 import eu
.etaxonomy
.cdm
.app
.common
.CdmDestinations
;
32 import eu
.etaxonomy
.cdm
.database
.DbSchemaValidation
;
33 import eu
.etaxonomy
.cdm
.database
.ICdmDataSource
;
34 import eu
.etaxonomy
.cdm
.io
.common
.TdwgAreaProvider
;
35 import eu
.etaxonomy
.cdm
.model
.common
.Language
;
36 import eu
.etaxonomy
.cdm
.model
.description
.DescriptionBase
;
37 import eu
.etaxonomy
.cdm
.model
.description
.DescriptionElementBase
;
38 import eu
.etaxonomy
.cdm
.model
.description
.Distribution
;
39 import eu
.etaxonomy
.cdm
.model
.description
.Feature
;
40 import eu
.etaxonomy
.cdm
.model
.description
.PresenceAbsenceTerm
;
41 import eu
.etaxonomy
.cdm
.model
.description
.TaxonDescription
;
42 import eu
.etaxonomy
.cdm
.model
.description
.TextData
;
43 import eu
.etaxonomy
.cdm
.model
.location
.NamedArea
;
44 import eu
.etaxonomy
.cdm
.model
.taxon
.Taxon
;
45 import eu
.etaxonomy
.cdm
.model
.taxon
.TaxonBase
;
52 public class DipteraDistributionParser
{
// Class-wide log4j logger.
53 private static final Logger logger
= Logger
.getLogger(DipteraDistributionParser
.class);
// Data source that main() hands to CdmApplicationController.NewInstance().
55 private static ICdmDataSource cdmDestination
= CdmDestinations
.localH2();
// Regex used to split an occurrence string into words: a run of whitespace,
// or a single square bracket or parenthesis.
57 final static String epiSplitter
= "(\\s+|\\[|\\]|\\(|\\))"; //( ' '+| '(' | ')'| '[' | ']' )
// Compiled form of epiSplitter; remains null until doDistribution() compiles it.
58 static Pattern pattern
= null;
/**
 * Parses the occurrence texts of all taxa into distribution elements.
 * <p>
 * Compiles {@code epiSplitter} into {@code pattern}, starts a transaction, lists the
 * taxa through the taxon service and, for every {@link Taxon}, visits each element of
 * each description. Elements whose feature equals {@code Feature.OCCURRENCE()} and that
 * are {@link TextData} have their English text handed to
 * {@code parseOccurenceString}. Afterwards the collected unknown strings and both
 * counters are printed to standard output and the transaction is committed.
 * <p>
 * NOTE(review): the closing braces of the nested loops are not visible in this excerpt.
 *
 * @param app repository used for the transaction and for the taxon service
 */
60 protected void doDistribution(ICdmRepository app
){
// The pattern must be compiled before parseOccurenceString() calls pattern.split().
61 pattern
= Pattern
.compile(epiSplitter
);
62 TransactionStatus txStatus
= app
.startTransaction();
63 List
<TaxonBase
> taxa
= app
.getTaxonService().list(null, null, null, null, null);
64 for (TaxonBase taxon
: taxa
){
65 if (taxon
instanceof Taxon
){
66 // unlazyDescription(app, (Taxon)taxon);
67 Set
<TaxonDescription
> descriptions
= ((Taxon
) taxon
).getDescriptions();
68 for (DescriptionBase description
: descriptions
){
// Iterate over a copy of the elements: parseOccurenceString() adds new
// elements to the same description while this loop is running.
69 Set
<DescriptionElementBase
> descElements
= new HashSet
<DescriptionElementBase
>();
70 descElements
.addAll(description
.getElements());
72 for (DescriptionElementBase descEl
: descElements
){
73 if (descEl
.getFeature().equals(Feature
.OCCURRENCE())){
74 if (descEl
instanceof TextData
){
75 String occString
= ((TextData
)descEl
).getText(Language
.ENGLISH());
76 parseOccurenceString(occString
, description
);
77 //app.getTaxonService().saveTaxon(taxon);
// Summary output: every string that could not be resolved, then both counters.
84 System
.out
.println("Unknowns: ");
85 for (String unknown
: unrekognizedStrings
){
86 System
.out
.println(unknown
);
88 System
.out
.println("Distributions not recognized: " + countNot
);
89 System
.out
.println("Distributions created: " + countYes
);
90 app
.commitTransaction(txStatus
);
// Distinct strings that parseOccurenceString() could not resolve to an area;
// printed under "Unknowns: " by doDistribution().
93 static Set
<String
> unrekognizedStrings
= new HashSet
<String
>();
// Counter printed in the " False:" lines and in the "Distributions not recognized" summary.
94 static int countNot
= 0;
// Counter printed in the " True:" lines and in the "Distributions created" summary.
95 static int countYes
= 0;
/**
 * Splits an occurrence text into words and adds a {@link Distribution} to the given
 * description for every word (or word sequence) that resolves to an area.
 * <p>
 * The text is echoed to standard output and, if it is not null, split with
 * {@code pattern}. Each word is normalized by {@code adaptWordsToTdwg}. When the word is
 * not a TDWG label, not a TDWG abbreviation and not a "double area", up to six following
 * words are appended one at a time and the combination is tried again. Strings that
 * still cannot be resolved are printed and stored in {@code unrekognizedStrings};
 * resolved ones produce one distribution (two for a double area).
 * <p>
 * NOTE(review): several lines of this method are not visible in this excerpt, among them
 * the declarations of {@code i}, {@code countSkip} and {@code area}, the counter
 * increments and all closing braces. Comments below describe the visible lines only.
 *
 * @param occString occurrence text to parse; null is tolerated
 * @param desc description that receives the created distributions
 */
97 private void parseOccurenceString(String occString
, DescriptionBase desc
){
98 System
.out
.println(occString
);
99 if (occString
!= null){
100 String
[] words
= pattern
.split(occString
);
103 for (String word
: words
){
104 if (word
.contains("U.S.A")){
105 logger
.warn("U.S.A.");
107 boolean isDoubtful
= false;
// Empty tokens come from adjacent separators in the split.
110 }else if(word
.trim().length() == 0){
// Short token ending in ':' (at most four characters).
113 if (word
.endsWith(":") && word
.length()<=4){
// A question mark is removed from the word before the lookup.
// presumably isDoubtful is set in a line not shown here - TODO confirm
118 if (word
.contains("?")){
120 word
= word
.replace("?", "");
122 word
= adaptWordsToTdwg(word
);
// Not resolvable on its own: try to build a multi-word name by look-ahead.
124 if (! "".equals(word
) && ! TdwgAreaProvider
.isTdwgAreaLabel(word
) && ! TdwgAreaProvider
.isTdwgAreaAbbreviation(word
) && ! isDoubleArea(word
)){
// Append up to six following words, one per iteration.
125 for (countSkip
= 1; countSkip
<= 6; countSkip
++){
127 if (! TdwgAreaProvider
.isTdwgAreaLabel(word
) && ! TdwgAreaProvider
.isTdwgAreaAbbreviation(word
) && ! isDoubleArea(word
)){
// Bounds check before reading words[i + countSkip].
128 if (words
.length
> i
+ countSkip
){
129 word
= word
+ " " + words
[i
+ countSkip
];
131 if (word
.contains("?")){
133 word
= word
.replace("?", "");
135 word
= adaptWordsToTdwg(word
);
136 if ("".equals(word
)){
// After the look-ahead: an empty word needs no further handling.
144 if ("".equals(word
)){
145 //countSkip = countSkip;
146 }else if (! TdwgAreaProvider
.isTdwgAreaLabel(word
) && ! TdwgAreaProvider
.isTdwgAreaAbbreviation(word
) && ! isDoubleArea(word
) ){
147 if (word
.contains("?")){
// Report and remember the unresolved string.
151 System
.out
.println(" False:" + countNot
+ ": " + word
);
152 unrekognizedStrings
.add(word
);
155 if (word
.equals("Netherlands")){
// NOTE(review): the only visible assignments to countSkip start the loop at 1, so
// "countSkip < 0" looks unreachable unless a line not shown here sets it negative.
// words[i + 1] is also read without a visible bounds check - confirm.
156 if ( countSkip
< 0 && words
[i
+ 1].startsWith("Antilles")){
157 word
= "Netherlands Antilles";
161 PresenceAbsenceTerm term
= PresenceAbsenceTerm
.PRESENT();
// A double area yields one distribution per area returned by getDoubleArea().
162 if (isDoubleArea(word
)){
163 NamedArea
[] doubleArea
= getDoubleArea(word
);
164 for (NamedArea area
: doubleArea
){
165 Distribution distr
= Distribution
.NewInstance(area
, term
);
166 desc
.addElement(distr
);
// Single area: look it up by label if the word is a label, otherwise by abbreviation.
170 if (TdwgAreaProvider
.isTdwgAreaLabel(word
)){
171 area
= TdwgAreaProvider
.getAreaByTdwgLabel(word
);
173 area
= TdwgAreaProvider
.getAreaByTdwgAbbreviation(word
);
// presumably applied only when isDoubtful is true; the condition is not visible - TODO confirm
176 term
= PresenceAbsenceTerm
.INTRODUCED_PRESENCE_QUESTIONABLE();
178 Distribution distr
= Distribution
.NewInstance(area
, term
);
179 desc
.addElement(distr
);
182 System
.out
.println(" True:" + countYes
+ ": " + word
);
/**
 * Tests whether the word is one of the three labels that getDoubleArea() maps to a
 * pair of areas. The comparison ignores case.
 * <p>
 * NOTE(review): the return statements are not visible in this excerpt; presumably the
 * method returns true when the condition holds and false otherwise - TODO confirm.
 *
 * @param word the (already normalized) area name to test
 */
192 private boolean isDoubleArea(String word
){
193 if ("Canary and Madeira Is.".equalsIgnoreCase(word
) ||
194 "southern Europe".equalsIgnoreCase(word
) ||
195 "former USSR: North and Central European territory".equalsIgnoreCase(word
)
/**
 * Resolves a "double area" label to its two areas, looked up by TDWG abbreviation.
 * <p>
 * The labels handled here are the same three that isDoubleArea() checks. For any other
 * word a warning is logged; in that case neither slot of the two-element array is
 * assigned by the visible code.
 * <p>
 * NOTE(review): the line introducing the warning branch and the return statement are
 * not visible in this excerpt; presumably {@code result} is returned - TODO confirm.
 *
 * @param word the label to resolve, compared ignoring case
 */
203 private NamedArea
[] getDoubleArea(String word
){
204 NamedArea
[] result
= new NamedArea
[2];
205 if ("Canary and Madeira Is.".equalsIgnoreCase(word
)){
206 result
[0] = TdwgAreaProvider
.getAreaByTdwgAbbreviation("CNY");
207 result
[1] = TdwgAreaProvider
.getAreaByTdwgAbbreviation("MDR");
208 }else if ("southern Europe".equalsIgnoreCase(word
)){
209 result
[0] = TdwgAreaProvider
.getAreaByTdwgAbbreviation("12");
210 result
[1] = TdwgAreaProvider
.getAreaByTdwgAbbreviation("13");
211 }else if ("former USSR: North and Central European territory".equalsIgnoreCase(word
)){
212 result
[0] = TdwgAreaProvider
.getAreaByTdwgAbbreviation("RUN-OO");
213 result
[1] = TdwgAreaProvider
.getAreaByTdwgAbbreviation("RUC-OO");
215 logger
.warn("Double area not recognized");
// Word lists that adaptWordsToTdwg() compares its result against; all three are
// filled by initStopWords().
// Words reported with " STOP: " in adaptWordsToTdwg().
221 static List
<String
> stopWords
= new ArrayList
<String
>();
// Names reported with " UNKNOWN: " in adaptWordsToTdwg().
222 static List
<String
> unknownAreas
= new ArrayList
<String
>();
// Two-letter codes checked last in adaptWordsToTdwg().
223 static List
<String
> higherAreas
= new ArrayList
<String
>();
/**
 * Normalizes an area name from the occurrence text before it is looked up through
 * {@code TdwgAreaProvider}.
 * <p>
 * Commas and semicolons are removed (periods too, unless the word contains "U.S.A"),
 * then a fixed sequence of literal replacements is applied (spelling corrections,
 * historical names, abbreviation forms). The result is finally compared with the
 * entries of {@code stopWords}, {@code unknownAreas} and {@code higherAreas}.
 * <p>
 * The replacements are order dependent: several are guarded by a {@code contains}
 * check so that an already converted name is not converted again.
 * <p>
 * NOTE(review): the bodies of several branches and all return statements are not
 * visible in this excerpt. The caller treats an empty string as "nothing to look up",
 * so presumably a match in one of the three lists returns "" - TODO confirm.
 *
 * @param word the raw word or word sequence
 */
225 private String
adaptWordsToTdwg(String word
){
226 word
= word
.replace(",", "").replace(";", "");
// Periods are stripped only when the word does not contain "U.S.A".
227 if (! word
.contains("U.S.A")){
228 word
= word
.replace(",", "").replace(".", "").replace(";", "");
230 word
= word
.replace(",", "").replace(";", "");
234 if (word
.endsWith("Is")){
// The word lists are still empty on first use.
// presumably initStopWords() is called here; the call is not visible - TODO confirm
237 if (stopWords
.size() == 0){
// Literal replacements follow.
241 word
= word
.replace("Russia [North European territory]", "North European Russia");
242 word
= word
.replace("Russia North European territory", "North European Russia");
243 word
= word
.replace("Russia: North European territory", "North European Russia");
// NOTE(review): same replacement as the previous statement.
244 word
= word
.replace("Russia: North European territory", "North European Russia");
246 word
= word
.replace("Amber", "amber");
249 word
= word
.replace("Prince Edward Is.", "Marion-Prince Edward Is.");
250 //or word = word.replace("Prince Edward Is.", "Prince Edward I.");
251 word
= word
.replace("Bahama Is.", "Bahamas");
252 word
= word
.replace("Comores Is.", "Comoros");
253 word
= word
.replace("former Yugoslavia", "Yugoslavia");
254 word
= word
.replace("former Czechoslovakia", "Czechoslovakia");
255 word
= word
.replace("Rhodesia", "Zimbabwe");
256 word
= word
.replace("The Gambia", "Gambia, The");
// Guard: do not turn "El Salvador" into "El El Salvador".
258 if (!word
.contains("El Salvador")){
259 word
= word
.replace("Salvador", "El Salvador");
261 word
= word
.replace("Vera Cruz", "Veracruz");
262 word
= word
.replace("Turkmenia", "Turkmenistan");
263 word
= word
.replace("Qu\u00E9beck", "Qu\u00E9bec");
264 word
= word
.replace("Quebeck", "Qu\u00E9bec");
265 word
= word
.replace("Quebec", "Qu\u00E9bec");
// Guard: skip words that already carry the "Gambia, The" form.
267 if (!word
.contains("Gambia, The")){
268 word
= word
.replace("Gambia", "Gambia, The");
270 word
= word
.replace("Mariana Is.", "Marianas");
271 word
= word
.replace("Kenia", "Kenya");
272 word
= word
.replace("Central Africa", "Central African Republic");
273 word
= word
.replace("Canal Zone", "");
274 //word = word.replace("Panama", "Panamá");
275 word
= word
.replace("Panama", "Panam\u00E1");
// Guard: "New South Wales" must not be rewritten.
276 if (! word
.contains("New South Wales")){
277 word
= word
.replace("Wales", "Great Britain");
279 word
= word
.replace("Java", "Jawa");
280 word
= word
.replace("former USSR: North European territory", "North European Russia");
281 word
= word
.replace("former USSR: South European territory", "South European Russia");
282 word
= word
.replace("former USSR: Soviet Middle Asia", "Middle Asia");
284 word
= word
.replace("St Kitts-Nevis", "St.Kitts-Nevis");
286 word
= word
.replace("oceanian islands", "Pacific");
287 word
= word
.replace("Ussuri region", "Primorye");
288 word
= word
.replace("Galapagos Is.", "Gal\u00E1pagos");
289 word
= word
.replace("Tarapac\u00E1", "Tarapaca");
290 word
= word
.replace("Reunion", "R\u00E9union");
291 if (! word
.contains("Is.")){
292 word
= word
.replace("Galapagos", "Gal\u00E1pagos");
295 //word = word.replace("Galapagos Is.", "Galápagos");
296 if (! word
.contains("Peninsular")){
297 word
= word
.replace("Malaysia", "Peninsular Malaysia");
299 word
= word
.replace("Polynesic Is.", "South Solomons");
301 word
= word
.replace("Usbek SSR", "Uzbekistan");
302 word
= word
.replace("Mexican amber", "Mexico");
303 word
= word
.replace("Marocco", "Morocco");
// "Trinidad" and "Tobago" each map to the combined name unless the other is present.
304 if (! word
.contains("Tobago")){
305 word
= word
.replace("Trinidad", "Trinidad-Tobago");
307 if (! word
.contains("Trinidad")){
308 word
= word
.replace("Tobago", "Trinidad-Tobago");
// NOTE(review): target and replacement are identical, so this statement changes nothing.
310 word
= word
.replace("Haiti", "Haiti");
311 word
= word
.replace("Moluccas", "Maluku");
312 word
= word
.replace("Belau", "Palau");
313 word
= word
.replace("Dominican amber", "Dominican Republic");
314 if (! word
.contains("Russian")){
315 word
= word
.replace("Far East", "Russian Far East");
317 word
= word
.replace("Tahiti", "Society Is.");
318 word
= word
.replace("Iraque", "Iraq");
319 word
= word
.replace("Wake Island", "Wake I.");
// Add the trailing period to island names only when no "I." is present yet.
320 if (! word
.contains("I.")){
321 word
= word
.replace("Johnston I", "Johnston I.");
322 word
= word
.replace("Wake I", "Wake I.");
323 word
= word
.replace("Clipperton I", "Clipperton I.");
325 if (! word
.contains("Provinces")){
326 word
= word
.replace("Cape Province", "Cape Provinces");
// Eastern and Western Cape keep the singular form, undoing the pluralization above.
328 word
= word
.replace("Eastern Cape Provinces", "Eastern Cape Province");
329 word
= word
.replace("Western Cape Provinces", "Western Cape Province");
330 if (! word
.contains("Barbuda")){
331 word
= word
.replace("Antigua", "Antigua-Barbuda");
333 if (! word
.contains("St.")){
334 word
= word
.replace("St Vincent", "St.Vincent");
335 word
= word
.replace("St Lucia", "St.Lucia");
336 word
= word
.replace("St Helena", "St.Helena");
338 word
= word
.replace("Asia-tropical", "Asia-Tropical");
339 word
= word
.replace("Society Islands", "Society Is.");
340 word
= word
.replace("Virgin Islands", "Virgin Is.");
341 word
= word
.replace("Canary Islands", "Canary Is.");
342 word
= word
.replace("Rhode Island", "Rhode I.");
345 word
= word
.replace("Rodriguez", "Rodrigues");
346 word
= word
.replace("British Colombia", "British Columbia");
347 word
= word
.replace("Bermudas", "Bermuda");
348 word
= word
.replace("Tunesia", "Tunisia");
349 word
= word
.replace("Santos S\u00E3o Paulo", "S\u00E3o Paulo");
350 word
= word
.replace("Transvaal", "Northern Provinces");
351 word
= word
.replace("Tucum\u00E1n", "Tucuman");
352 // if (!word.contains("Netherlands")){
356 // unknownAreas.add("Baltic amber");
357 // unknownAreas.add("Arabia");
// Exact-match checks of the normalized word against the three word lists.
359 for (String stopWord
: stopWords
){
360 if (stopWord
.equals(word
)){
361 System
.out
.println(" STOP: " + word
);
365 for (String unknownArea
: unknownAreas
){
366 if (unknownArea
.equals(word
)){
367 System
.out
.println(" UNKNOWN: " + word
);
371 for (String higherArea
: higherAreas
){
372 if (higherArea
.equals(word
)){
/**
 * Fills the three static word lists ({@code stopWords}, {@code unknownAreas},
 * {@code higherAreas}) that adaptWordsToTdwg() compares against.
 * <p>
 * Entries are appended without clearing the lists first, so each call adds them again.
 * <p>
 * NOTE(review): some lines of this method are not visible in this excerpt, so the
 * lists may receive more entries than are shown here.
 */
382 private void initStopWords(){
383 stopWords
.add("and");
385 stopWords
.add("Is.");
386 stopWords
.add("Islands");
387 stopWords
.add("Island");
390 stopWords
.add("areas");
391 stopWords
.add("USA");
392 stopWords
.add("Australia"); //except for Australia only
393 stopWords
.add("Argentina");
395 //unknownAreas.add("Panama");
396 unknownAreas
.add("South Africa");
397 unknownAreas
.add("Chile");
399 unknownAreas
.add("Baltic amber");
400 unknownAreas
.add("Arabia");
// Two-letter codes kept in higherAreas.
403 higherAreas
.add("AF");
404 higherAreas
.add("OR");
405 higherAreas
.add("PA");
406 higherAreas
.add("AU");
407 higherAreas
.add("NE");
409 higherAreas
.add("NT");
/**
 * Command-line entry point: creates a {@link CdmApplicationController} for
 * {@code cdmDestination} with schema validation {@code DbSchemaValidation.UPDATE},
 * then runs {@code doDistribution} on a new parser instance.
 * <p>
 * NOTE(review): the condition guarding the final "No Application Context" warning is
 * not visible in this excerpt.
 *
 * @param args command-line arguments; not read by the visible code
 */
416 public static void main(String
[] args
) {
417 CdmApplicationController app
= null;
418 DbSchemaValidation val
= DbSchemaValidation
.UPDATE
;
419 app
= CdmApplicationController
.NewInstance(cdmDestination
, val
);
421 DipteraDistributionParser dipDist
= new DipteraDistributionParser();
423 dipDist
.doDistribution(app
);
425 logger
.warn("No Application Context");