2 * Copyright (C) 2007 EDIT
3 * European Distributed Institute of Taxonomy
4 * http://www.e-taxonomy.eu
6 * The contents of this file are subject to the Mozilla Public License Version 1.1
7 * See LICENSE.TXT at the top of this package for the full license terms.
11 * Copyright (C) 2007 EDIT
12 * European Distributed Institute of Taxonomy
13 * http://www.e-taxonomy.eu
15 * The contents of this file are subject to the Mozilla Public License Version 1.1
16 * See LICENSE.TXT at the top of this package for the full license terms.
18 package eu
.etaxonomy
.cdm
.app
.wp6
.diptera
;
20 import java
.util
.ArrayList
;
21 import java
.util
.HashSet
;
22 import java
.util
.List
;
24 import java
.util
.regex
.Pattern
;
26 import org
.apache
.log4j
.Logger
;
27 import org
.springframework
.transaction
.TransactionStatus
;
29 import eu
.etaxonomy
.cdm
.api
.application
.CdmApplicationController
;
30 import eu
.etaxonomy
.cdm
.api
.application
.ICdmRepository
;
31 import eu
.etaxonomy
.cdm
.app
.common
.CdmDestinations
;
32 import eu
.etaxonomy
.cdm
.database
.DbSchemaValidation
;
33 import eu
.etaxonomy
.cdm
.database
.ICdmDataSource
;
34 import eu
.etaxonomy
.cdm
.io
.common
.TdwgAreaProvider
;
35 import eu
.etaxonomy
.cdm
.model
.common
.Language
;
36 import eu
.etaxonomy
.cdm
.model
.description
.DescriptionBase
;
37 import eu
.etaxonomy
.cdm
.model
.description
.DescriptionElementBase
;
38 import eu
.etaxonomy
.cdm
.model
.description
.Distribution
;
39 import eu
.etaxonomy
.cdm
.model
.description
.Feature
;
40 import eu
.etaxonomy
.cdm
.model
.description
.PresenceAbsenceTerm
;
41 import eu
.etaxonomy
.cdm
.model
.description
.TaxonDescription
;
42 import eu
.etaxonomy
.cdm
.model
.description
.TextData
;
43 import eu
.etaxonomy
.cdm
.model
.location
.NamedArea
;
44 import eu
.etaxonomy
.cdm
.model
.taxon
.Taxon
;
45 import eu
.etaxonomy
.cdm
.model
.taxon
.TaxonBase
;
51 public class DipteraDistributionParser
{
// log4j logger shared by all methods of this class.
52 private static final Logger logger
= Logger
.getLogger(DipteraDistributionParser
.class);
// Data source that main passes to CdmApplicationController.NewInstance; set to the local H2 destination.
54 private static ICdmDataSource cdmDestination
= CdmDestinations
.localH2();
// Regular expression used to split occurrence strings: a run of whitespace, or one of the characters [ ] ( ).
56 final static String epiSplitter
= "(\\s+|\\[|\\]|\\(|\\))"; //( ' '+| '(' | ')'| '[' | ']' )
// Compiled form of epiSplitter; starts out null and is assigned in doDistribution before it is used for splitting.
57 static Pattern pattern
= null;
/**
 * Walks the taxa returned by the taxon service of {@code app} and hands the
 * English text of occurrence text data to {@link #parseOccurenceString}.
 * <p>
 * The method compiles {@code epiSplitter} into the static {@code pattern},
 * starts a transaction, prints the collected unrecognized strings and the two
 * counters to standard output, and commits the transaction.
 * <p>
 * NOTE(review): the lines that close the nested loops are not visible in this
 * view; confirm where the summary output sits relative to the loops.
 *
 * @param app repository used for the transaction and for the taxon service
 */
59 protected void doDistribution(ICdmRepository app
){
// (Re)compile the splitter; parseOccurenceString reads this static field.
60 pattern
= Pattern
.compile(epiSplitter
);
61 TransactionStatus txStatus
= app
.startTransaction();
// All five arguments of list(...) are null here.
62 List
<TaxonBase
> taxa
= app
.getTaxonService().list(null, null, null, null, null);
63 for (TaxonBase taxon
: taxa
){
// Only Taxon instances are inspected; their descriptions are read below.
64 if (taxon
instanceof Taxon
){
65 // unlazyDescription(app, (Taxon)taxon);
66 Set
<TaxonDescription
> descriptions
= ((Taxon
) taxon
).getDescriptions();
67 for (DescriptionBase description
: descriptions
){
// Iterate over a copy of the elements; parseOccurenceString adds new elements
// to the same description (presumably this avoids modifying the set being iterated).
68 Set
<DescriptionElementBase
> descElements
= new HashSet
<DescriptionElementBase
>();
69 descElements
.addAll(description
.getElements());
71 for (DescriptionElementBase descEl
: descElements
){
// Only elements whose feature equals Feature.OCCURRENCE() and that are TextData are parsed.
72 if (descEl
.getFeature().equals(Feature
.OCCURRENCE())){
73 if (descEl
instanceof TextData
){
74 String occString
= ((TextData
)descEl
).getText(Language
.ENGLISH());
75 parseOccurenceString(occString
, description
);
76 //app.getTaxonService().saveTaxon(taxon);
// Summary output: every unrecognized string, then both counters.
83 System
.out
.println("Unknowns: ");
84 for (String unknown
: unrekognizedStrings
){
85 System
.out
.println(unknown
);
87 System
.out
.println("Distributions not recognized: " + countNot
);
88 System
.out
.println("Distributions created: " + countYes
);
89 app
.commitTransaction(txStatus
);
// Words that parseOccurenceString could not resolve; printed by doDistribution under "Unknowns: ".
92 static Set
<String
> unrekognizedStrings
= new HashSet
<>();
// Counter printed as "Distributions not recognized: " (its increments are not visible in this view).
93 static int countNot
= 0;
// Counter printed as "Distributions created: " (its increments are not visible in this view).
94 static int countYes
= 0;
/**
 * Prints the raw occurrence string and, when it is not null, splits it with the
 * static {@code pattern} and tries to resolve each word, or a word extended by
 * up to six following words, to a TDWG area label, a TDWG abbreviation or one
 * of the composite names known to {@link #isDoubleArea}. Resolved areas are
 * added to {@code desc} as {@code Distribution} elements; unresolved text is
 * printed and stored in {@code unrekognizedStrings}.
 * <p>
 * NOTE(review): the declarations of {@code i}, {@code countSkip} and
 * {@code area}, plus several branch and closing lines, are not visible in this
 * view; the comments below describe only the visible statements.
 *
 * @param occString occurrence text to parse; may be null
 * @param desc description that receives the created distributions
 */
96 private void parseOccurenceString(String occString
, DescriptionBase desc
){
97 System
.out
.println(occString
);
98 if (occString
!= null){
99 String
[] words
= pattern
.split(occString
);
102 for (String word
: words
){
// Words containing "U.S.A" only trigger a warning here.
103 if (word
.contains("U.S.A")){
104 logger
.warn("U.S.A.");
106 boolean isDoubtful
= false;
// Branch for words that are empty after trimming.
109 }else if(word
.trim().length() == 0){
// Short tokens ending in ':' (at most 4 characters) get their own branch; its body is not visible here.
112 if (word
.endsWith(":") && word
.length()<=4){
// Question marks are removed from the word before lookup.
117 if (word
.contains("?")){
119 word
= word
.replace("?", "");
121 word
= adaptWordsToTdwg(word
);
// Not yet resolvable: try extending the word with the following words.
123 if (! "".equals(word
) && ! TdwgAreaProvider
.isTdwgAreaLabel(word
) && ! TdwgAreaProvider
.isTdwgAreaAbbreviation(word
) && ! isDoubleArea(word
)){
// countSkip runs from 1 to 6 and is the offset of the next word to append.
124 for (countSkip
= 1; countSkip
<= 6; countSkip
++){
126 if (! TdwgAreaProvider
.isTdwgAreaLabel(word
) && ! TdwgAreaProvider
.isTdwgAreaAbbreviation(word
) && ! isDoubleArea(word
)){
// Only append when a word exists at index i + countSkip.
127 if (words
.length
> i
+ countSkip
){
128 word
= word
+ " " + words
[i
+ countSkip
];
130 if (word
.contains("?")){
132 word
= word
.replace("?", "");
134 word
= adaptWordsToTdwg(word
);
135 if ("".equals(word
)){
// After the extension attempts: an empty word is ignored, an unresolved word is reported.
143 if ("".equals(word
)){
144 //countSkip = countSkip;
145 }else if (! TdwgAreaProvider
.isTdwgAreaLabel(word
) && ! TdwgAreaProvider
.isTdwgAreaAbbreviation(word
) && ! isDoubleArea(word
) ){
146 if (word
.contains("?")){
150 System
.out
.println(" False:" + countNot
+ ": " + word
);
151 unrekognizedStrings
.add(word
);
// Special case: "Netherlands" followed by a word starting with "Antilles".
// NOTE(review): the visible loop only assigns countSkip values from 1 upwards, so
// verify whether the condition countSkip < 0 can ever hold.
154 if (word
.equals("Netherlands")){
155 if ( countSkip
< 0 && words
[i
+ 1].startsWith("Antilles")){
156 word
= "Netherlands Antilles";
// Default status is PRESENT.
160 PresenceAbsenceTerm term
= PresenceAbsenceTerm
.PRESENT();
// Composite names yield one Distribution per area returned by getDoubleArea.
161 if (isDoubleArea(word
)){
162 NamedArea
[] doubleArea
= getDoubleArea(word
);
163 for (NamedArea area
: doubleArea
){
164 Distribution distr
= Distribution
.NewInstance(area
, term
);
165 desc
.addElement(distr
);
// Single areas are looked up by TDWG label, or by TDWG abbreviation in the other visible assignment.
169 if (TdwgAreaProvider
.isTdwgAreaLabel(word
)){
170 area
= TdwgAreaProvider
.getAreaByTdwgLabel(word
);
172 area
= TdwgAreaProvider
.getAreaByTdwgAbbreviation(word
);
// The guarding condition of this assignment is not visible; presumably it depends on isDoubtful - TODO confirm.
175 term
= PresenceAbsenceTerm
.INTRODUCED_PRESENCE_QUESTIONABLE();
177 Distribution distr
= Distribution
.NewInstance(area
, term
);
178 desc
.addElement(distr
);
181 System
.out
.println(" True:" + countYes
+ ": " + word
);
/**
 * Compares {@code word}, ignoring case, with the composite area names that
 * {@link #getDoubleArea} maps to two TDWG areas.
 * <p>
 * NOTE(review): the end of the condition and the return statements are not
 * visible in this view; presumably a match yields {@code true} - confirm.
 *
 * @param word candidate area name
 */
191 private boolean isDoubleArea(String word
){
192 if ("Canary and Madeira Is.".equalsIgnoreCase(word
) ||
193 "southern Europe".equalsIgnoreCase(word
) ||
194 "former USSR: North and Central European territory".equalsIgnoreCase(word
)
/**
 * Fills a two-element array with the TDWG areas belonging to a composite area
 * name, compared ignoring case. Each area is looked up by TDWG abbreviation.
 * <p>
 * NOTE(review): the keyword introducing the final branch and the return
 * statement are not visible in this view; presumably the warning is logged for
 * any other word and {@code result} is returned - confirm.
 *
 * @param word composite area name
 */
202 private NamedArea
[] getDoubleArea(String word
){
203 NamedArea
[] result
= new NamedArea
[2];
// "Canary and Madeira Is." -> abbreviations CNY and MDR
204 if ("Canary and Madeira Is.".equalsIgnoreCase(word
)){
205 result
[0] = TdwgAreaProvider
.getAreaByTdwgAbbreviation("CNY");
206 result
[1] = TdwgAreaProvider
.getAreaByTdwgAbbreviation("MDR");
// "southern Europe" -> abbreviations 12 and 13
207 }else if ("southern Europe".equalsIgnoreCase(word
)){
208 result
[0] = TdwgAreaProvider
.getAreaByTdwgAbbreviation("12");
209 result
[1] = TdwgAreaProvider
.getAreaByTdwgAbbreviation("13");
// "former USSR: North and Central European territory" -> abbreviations RUN-OO and RUC-OO
210 }else if ("former USSR: North and Central European territory".equalsIgnoreCase(word
)){
211 result
[0] = TdwgAreaProvider
.getAreaByTdwgAbbreviation("RUN-OO");
212 result
[1] = TdwgAreaProvider
.getAreaByTdwgAbbreviation("RUC-OO");
214 logger
.warn("Double area not recognized");
// Words that adaptWordsToTdwg reports with the " STOP: " prefix; filled by initStopWords.
220 static List
<String
> stopWords
= new ArrayList
<>();
// Area names that adaptWordsToTdwg reports with the " UNKNOWN: " prefix; filled by initStopWords.
221 static List
<String
> unknownAreas
= new ArrayList
<>();
// Two-letter codes (e.g. "AF", "PA") that adaptWordsToTdwg compares the word with; filled by initStopWords.
222 static List
<String
> higherAreas
= new ArrayList
<>();
/**
 * Rewrites a word, or a run of words, from an occurrence string so that it is
 * more likely to match a TDWG area label: punctuation is stripped, a fixed
 * sequence of literal replacements is applied, and the result is compared
 * with the static lists {@code stopWords}, {@code unknownAreas} and
 * {@code higherAreas}.
 * <p>
 * The replacements run in the order written and later ones see the output of
 * earlier ones, so the order matters.
 * <p>
 * NOTE(review): the return statements and several branch bodies are not
 * visible in this view. Callers compare the result with the empty string, so
 * list matches presumably return "" - confirm.
 *
 * @param word text to normalise
 */
224 private String
adaptWordsToTdwg(String word
){
// Commas and semicolons are always removed; full stops only when the word does not contain "U.S.A".
225 word
= word
.replace(",", "").replace(";", "");
226 if (! word
.contains("U.S.A")){
227 word
= word
.replace(",", "").replace(".", "").replace(";", "");
229 word
= word
.replace(",", "").replace(";", "");
// Body of this branch is not visible here.
233 if (word
.endsWith("Is")){
// The lists are still empty on first use; presumably initStopWords is called here - TODO confirm.
236 if (stopWords
.size() == 0){
240 word
= word
.replace("Russia [North European territory]", "North European Russia");
241 word
= word
.replace("Russia North European territory", "North European Russia");
242 word
= word
.replace("Russia: North European territory", "North European Russia");
// Same replacement as the previous statement.
243 word
= word
.replace("Russia: North European territory", "North European Russia");
245 word
= word
.replace("Amber", "amber");
248 word
= word
.replace("Prince Edward Is.", "Marion-Prince Edward Is.");
249 //or word = word.replace("Prince Edward Is.", "Prince Edward I.");
250 word
= word
.replace("Bahama Is.", "Bahamas");
251 word
= word
.replace("Comores Is.", "Comoros");
252 word
= word
.replace("former Yugoslavia", "Yugoslavia");
253 word
= word
.replace("former Czechoslovakia", "Czechoslovakia");
254 word
= word
.replace("Rhodesia", "Zimbabwe");
255 word
= word
.replace("The Gambia", "Gambia, The");
// Guard keeps an existing "El Salvador" from being expanded a second time.
257 if (!word
.contains("El Salvador")){
258 word
= word
.replace("Salvador", "El Salvador");
260 word
= word
.replace("Vera Cruz", "Veracruz");
261 word
= word
.replace("Turkmenia", "Turkmenistan");
262 word
= word
.replace("Qu\u00E9beck", "Qu\u00E9bec");
263 word
= word
.replace("Quebeck", "Qu\u00E9bec");
264 word
= word
.replace("Quebec", "Qu\u00E9bec");
// Guard keeps an existing "Gambia, The" from being expanded a second time.
266 if (!word
.contains("Gambia, The")){
267 word
= word
.replace("Gambia", "Gambia, The");
269 word
= word
.replace("Mariana Is.", "Marianas");
270 word
= word
.replace("Kenia", "Kenya");
271 word
= word
.replace("Central Africa", "Central African Republic");
272 word
= word
.replace("Canal Zone", "");
273 //word = word.replace("Panama", "Panamá");
274 word
= word
.replace("Panama", "Panam\u00E1");
// "Wales" is mapped to "Great Britain" unless it is part of "New South Wales".
275 if (! word
.contains("New South Wales")){
276 word
= word
.replace("Wales", "Great Britain");
278 word
= word
.replace("Java", "Jawa");
279 word
= word
.replace("former USSR: North European territory", "North European Russia");
280 word
= word
.replace("former USSR: South European territory", "South European Russia");
281 word
= word
.replace("former USSR: Soviet Middle Asia", "Middle Asia");
283 word
= word
.replace("St Kitts-Nevis", "St.Kitts-Nevis");
285 word
= word
.replace("oceanian islands", "Pacific");
286 word
= word
.replace("Ussuri region", "Primorye");
287 word
= word
.replace("Galapagos Is.", "Gal\u00E1pagos");
288 word
= word
.replace("Tarapac\u00E1", "Tarapaca");
289 word
= word
.replace("Reunion", "R\u00E9union");
290 if (! word
.contains("Is.")){
291 word
= word
.replace("Galapagos", "Gal\u00E1pagos");
294 //word = word.replace("Galapagos Is.", "Galápagos");
// Guard keeps an existing "Peninsular Malaysia" from being expanded a second time.
295 if (! word
.contains("Peninsular")){
296 word
= word
.replace("Malaysia", "Peninsular Malaysia");
298 word
= word
.replace("Polynesic Is.", "South Solomons");
300 word
= word
.replace("Usbek SSR", "Uzbekistan");
301 word
= word
.replace("Mexican amber", "Mexico");
302 word
= word
.replace("Marocco", "Morocco");
// "Trinidad" or "Tobago" on its own is mapped to "Trinidad-Tobago"; the guards skip words that already contain the other name.
303 if (! word
.contains("Tobago")){
304 word
= word
.replace("Trinidad", "Trinidad-Tobago");
306 if (! word
.contains("Trinidad")){
307 word
= word
.replace("Tobago", "Trinidad-Tobago");
// Target and replacement are identical, so this statement leaves the text unchanged.
309 word
= word
.replace("Haiti", "Haiti");
310 word
= word
.replace("Moluccas", "Maluku");
311 word
= word
.replace("Belau", "Palau");
312 word
= word
.replace("Dominican amber", "Dominican Republic");
313 if (! word
.contains("Russian")){
314 word
= word
.replace("Far East", "Russian Far East");
316 word
= word
.replace("Tahiti", "Society Is.");
317 word
= word
.replace("Iraque", "Iraq");
318 word
= word
.replace("Wake Island", "Wake I.");
// Adds the trailing full stop to the "... I" names only when the word contains no "I." yet.
319 if (! word
.contains("I.")){
320 word
= word
.replace("Johnston I", "Johnston I.");
321 word
= word
.replace("Wake I", "Wake I.");
322 word
= word
.replace("Clipperton I", "Clipperton I.");
324 if (! word
.contains("Provinces")){
325 word
= word
.replace("Cape Province", "Cape Provinces");
// Eastern and Western Cape keep the singular form, reverting the plural if it was produced above.
327 word
= word
.replace("Eastern Cape Provinces", "Eastern Cape Province");
328 word
= word
.replace("Western Cape Provinces", "Western Cape Province");
329 if (! word
.contains("Barbuda")){
330 word
= word
.replace("Antigua", "Antigua-Barbuda");
332 if (! word
.contains("St.")){
333 word
= word
.replace("St Vincent", "St.Vincent");
334 word
= word
.replace("St Lucia", "St.Lucia");
335 word
= word
.replace("St Helena", "St.Helena");
337 word
= word
.replace("Asia-tropical", "Asia-Tropical");
338 word
= word
.replace("Society Islands", "Society Is.");
339 word
= word
.replace("Virgin Islands", "Virgin Is.");
340 word
= word
.replace("Canary Islands", "Canary Is.");
341 word
= word
.replace("Rhode Island", "Rhode I.");
344 word
= word
.replace("Rodriguez", "Rodrigues");
345 word
= word
.replace("British Colombia", "British Columbia");
346 word
= word
.replace("Bermudas", "Bermuda");
347 word
= word
.replace("Tunesia", "Tunisia");
348 word
= word
.replace("Santos S\u00E3o Paulo", "S\u00E3o Paulo");
349 word
= word
.replace("Transvaal", "Northern Provinces");
350 word
= word
.replace("Tucum\u00E1n", "Tucuman");
351 // if (!word.contains("Netherlands")){
355 // unknownAreas.add("Baltic amber");
356 // unknownAreas.add("Arabia");
// Exact matches against the three static lists; stop words and unknown areas are printed.
358 for (String stopWord
: stopWords
){
359 if (stopWord
.equals(word
)){
360 System
.out
.println(" STOP: " + word
);
364 for (String unknownArea
: unknownAreas
){
365 if (unknownArea
.equals(word
)){
366 System
.out
.println(" UNKNOWN: " + word
);
// What happens on a higher-area match is not visible in this view.
370 for (String higherArea
: higherAreas
){
371 if (higherArea
.equals(word
)){
/**
 * Adds the fixed entries to the static lists {@code stopWords},
 * {@code unknownAreas} and {@code higherAreas}, which
 * {@link #adaptWordsToTdwg} compares words with.
 * <p>
 * NOTE(review): some lines of this method are not visible in this view, so
 * the lists may receive more entries than shown here.
 */
381 private void initStopWords(){
// Filler words and country names treated as stop words.
382 stopWords
.add("and");
384 stopWords
.add("Is.");
385 stopWords
.add("Islands");
386 stopWords
.add("Island");
389 stopWords
.add("areas");
390 stopWords
.add("USA");
391 stopWords
.add("Australia"); //except for Australia only
392 stopWords
.add("Argentina");
// Area names reported as unknown.
394 //unknownAreas.add("Panama");
395 unknownAreas
.add("South Africa");
396 unknownAreas
.add("Chile");
398 unknownAreas
.add("Baltic amber");
399 unknownAreas
.add("Arabia");
// Two-letter higher-area codes.
402 higherAreas
.add("AF");
403 higherAreas
.add("OR");
404 higherAreas
.add("PA");
405 higherAreas
.add("AU");
406 higherAreas
.add("NE");
408 higherAreas
.add("NT");
/**
 * Command line entry point: creates a {@code CdmApplicationController} for
 * {@code cdmDestination} with schema validation {@code UPDATE}, creates a
 * parser instance and runs {@link #doDistribution} on the controller.
 * <p>
 * NOTE(review): the condition guarding the "No Application Context" warning
 * is not visible in this view; presumably it is logged when no controller is
 * available - confirm.
 *
 * @param args command line arguments; not read by the visible code
 */
411 public static void main(String
[] args
) {
412 CdmApplicationController app
= null;
413 DbSchemaValidation val
= DbSchemaValidation
.UPDATE
;
414 app
= CdmApplicationController
.NewInstance(cdmDestination
, val
);
416 DipteraDistributionParser dipDist
= new DipteraDistributionParser();
418 dipDist
.doDistribution(app
);
420 logger
.warn("No Application Context");