2 * Copyright (C) 2007 EDIT
3 * European Distributed Institute of Taxonomy
4 * http://www.e-taxonomy.eu
6 * The contents of this file are subject to the Mozilla Public License Version 1.1
7 * See LICENSE.TXT at the top of this package for the full license terms.
10 package eu
.etaxonomy
.cdm
.app
.berlinModelImport
;
12 import java
.util
.ArrayList
;
13 import java
.util
.HashSet
;
14 import java
.util
.List
;
16 import java
.util
.regex
.Pattern
;
18 import org
.apache
.log4j
.Logger
;
19 import org
.springframework
.transaction
.TransactionStatus
;
21 import eu
.etaxonomy
.cdm
.api
.application
.CdmApplicationController
;
22 import eu
.etaxonomy
.cdm
.app
.common
.CdmDestinations
;
23 import eu
.etaxonomy
.cdm
.database
.DataSourceNotFoundException
;
24 import eu
.etaxonomy
.cdm
.database
.DbSchemaValidation
;
25 import eu
.etaxonomy
.cdm
.database
.ICdmDataSource
;
26 import eu
.etaxonomy
.cdm
.model
.common
.Language
;
27 import eu
.etaxonomy
.cdm
.model
.common
.init
.TermNotFoundException
;
28 import eu
.etaxonomy
.cdm
.model
.description
.DescriptionBase
;
29 import eu
.etaxonomy
.cdm
.model
.description
.DescriptionElementBase
;
30 import eu
.etaxonomy
.cdm
.model
.description
.Distribution
;
31 import eu
.etaxonomy
.cdm
.model
.description
.Feature
;
32 import eu
.etaxonomy
.cdm
.model
.description
.PresenceAbsenceTermBase
;
33 import eu
.etaxonomy
.cdm
.model
.description
.PresenceTerm
;
34 import eu
.etaxonomy
.cdm
.model
.description
.TaxonDescription
;
35 import eu
.etaxonomy
.cdm
.model
.description
.TextData
;
36 import eu
.etaxonomy
.cdm
.model
.location
.NamedArea
;
37 import eu
.etaxonomy
.cdm
.model
.location
.TdwgArea
;
38 import eu
.etaxonomy
.cdm
.model
.taxon
.Taxon
;
39 import eu
.etaxonomy
.cdm
.model
.taxon
.TaxonBase
;
46 public class DipteraDistributionParser
{
// Log4j logger of this class.
47 private static final Logger logger
= Logger
.getLogger(DipteraDistributionParser
.class);
// Regular expression used to tokenize occurrence strings: a run of whitespace or a
// single square bracket / parenthesis acts as separator.
49 final static String epiSplitter
= "(\\s+|\\[|\\]|\\(|\\))"; //( ' '+| '(' | ')'| '[' | ']' )
// Compiled form of epiSplitter; starts out null and is assigned in doDistribution().
50 static Pattern pattern
= null;
52 protected void doDistribution(CdmApplicationController app
){
53 pattern
= Pattern
.compile(epiSplitter
);
54 TransactionStatus txStatus
= app
.startTransaction();
55 List
<TaxonBase
> taxa
= app
.getTaxonService().getAllTaxonBases(1000000, 0);
56 for (TaxonBase taxon
: taxa
){
57 if (taxon
instanceof Taxon
){
58 // unlazyDescription(app, (Taxon)taxon);
59 Set
<TaxonDescription
> descriptions
= ((Taxon
) taxon
).getDescriptions();
60 for (DescriptionBase description
: descriptions
){
61 Set
<DescriptionElementBase
> descElements
= new HashSet
<DescriptionElementBase
>();
62 descElements
.addAll(description
.getElements());
64 for (DescriptionElementBase descEl
: descElements
){
65 if (descEl
.getFeature().equals(Feature
.OCCURRENCE())){
66 if (descEl
instanceof TextData
){
67 String occString
= ((TextData
)descEl
).getText(Language
.ENGLISH());
68 parseOccurenceString(occString
, description
);
69 //app.getTaxonService().saveTaxon(taxon);
76 System
.out
.println("Unknowns: ");
77 for (String unknown
: unrekognizedStrings
){
78 System
.out
.println(unknown
);
80 System
.out
.println("Distributions not recognized: " + countNot
);
81 System
.out
.println("Distributions created: " + countYes
);
82 app
.commitTransaction(txStatus
);
// Area strings that could not be resolved; filled by parseOccurenceString() and
// printed by doDistribution().
85 static Set
<String
> unrekognizedStrings
= new HashSet
<String
>();
// Counter shown in the " False:" trace line and in the "Distributions not recognized" summary.
86 static int countNot
= 0;
// Counter shown in the " True:" trace line and in the "Distributions created" summary.
87 static int countYes
= 0;
/**
 * Parses one free-text occurrence statement and adds a Distribution element to the
 * given description for every area it recognizes.
 *
 * The text is echoed to standard out, split with the shared pattern and processed
 * token by token. Each token is stripped of '?' and normalized with adaptWordsToTdwg().
 * Tokens that are neither a TDWG label, a TDWG abbreviation nor a "double area" are
 * extended with up to six following tokens. What is still unrecognized is traced with
 * " False:" and stored in unrekognizedStrings; recognized areas are traced with " True:".
 *
 * NOTE(review): several statements of this method (e.g. the declarations of i and
 * countSkip and a number of branch bodies) are not visible in this listing.
 *
 * @param occString the occurrence text; nothing is parsed when it is null
 * @param desc the description that receives the created Distribution elements
 */
89 private void parseOccurenceString(String occString
, DescriptionBase desc
){
90 System
.out
.println(occString
);
91 if (occString
!= null){
// Tokenize: whitespace runs and bracket characters are the separators (see epiSplitter).
92 String
[] words
= pattern
.split(occString
);
95 for (String word
: words
){
96 if (word
.contains("U.S.A")){
97 logger
.warn("U.S.A.");
99 boolean isDoubtful
= false;
// Branch for tokens that are empty after trimming.
102 }else if(word
.trim().length() == 0){
// Tokens of at most 4 characters that end with ':' are tested separately.
105 if (word
.endsWith(":") && word
.length()<=4){
// Question marks are removed before the area lookup.
110 if (word
.contains("?")){
112 word
= word
.replace("?", "");
114 word
= adaptWordsToTdwg(word
);
// Not recognized as a single token: try to build a multi-word area name by
// appending the following tokens one at a time (countSkip runs from 1 to 6).
116 if (! "".equals(word
) && ! TdwgArea
.isTdwgAreaLabel(word
) && ! TdwgArea
.isTdwgAreaAbbreviation(word
) && ! isDoubleArea(word
)){
117 for (countSkip
= 1; countSkip
<= 6; countSkip
++){
119 if (! TdwgArea
.isTdwgAreaLabel(word
) && ! TdwgArea
.isTdwgAreaAbbreviation(word
) && ! isDoubleArea(word
)){
// Only append when a token exists at that offset.
120 if (words
.length
> i
+ countSkip
){
121 word
= word
+ " " + words
[i
+ countSkip
];
123 if (word
.contains("?")){
125 word
= word
.replace("?", "");
127 word
= adaptWordsToTdwg(word
);
128 if ("".equals(word
)){
136 if ("".equals(word
)){
137 //countSkip = countSkip;
// Still no known area: trace the string and remember it for the final report.
138 }else if (! TdwgArea
.isTdwgAreaLabel(word
) && ! TdwgArea
.isTdwgAreaAbbreviation(word
) && ! isDoubleArea(word
) ){
139 if (word
.contains("?")){
143 System
.out
.println(" False:" + countNot
+ ": " + word
);
144 unrekognizedStrings
.add(word
);
// Recognized area: the status term starts out as "present".
147 PresenceAbsenceTermBase
<?
> term
= PresenceTerm
.PRESENT();
// A double area yields one Distribution per area returned by getDoubleArea().
148 if (isDoubleArea(word
)){
149 NamedArea
[] doubleArea
= getDoubleArea(word
);
150 for (NamedArea area
: doubleArea
){
151 Distribution distr
= Distribution
.NewInstance(area
, term
);
152 desc
.addElement(distr
);
// Single area: looked up by TDWG label when the word is a label, by abbreviation otherwise.
156 if (TdwgArea
.isTdwgAreaLabel(word
)){
157 area
= TdwgArea
.getAreaByTdwgLabel(word
);
159 area
= TdwgArea
.getAreaByTdwgAbbreviation(word
);
// NOTE(review): the condition guarding this assignment is not visible here -
// presumably the isDoubtful flag; confirm.
162 term
= PresenceTerm
.INTRODUCED_PRESENCE_QUESTIONABLE();
164 Distribution distr
= Distribution
.NewInstance(area
, term
);
165 desc
.addElement(distr
);
168 System
.out
.println(" True:" + countYes
+ ": " + word
);
178 private boolean isDoubleArea(String word
){
179 if ("Canary and Madeira Is.".equalsIgnoreCase(word
) ||
180 "southern Europe".equalsIgnoreCase(word
) ||
181 "former USSR: North and Central European territory".equalsIgnoreCase(word
)
189 private NamedArea
[] getDoubleArea(String word
){
190 NamedArea
[] result
= new NamedArea
[2];
191 if ("Canary and Madeira Is.".equalsIgnoreCase(word
)){
192 result
[0] = TdwgArea
.getAreaByTdwgAbbreviation("CNY");
193 result
[1] = TdwgArea
.getAreaByTdwgAbbreviation("MDR");
194 }else if ("southern Europe".equalsIgnoreCase(word
)){
195 result
[0] = TdwgArea
.getAreaByTdwgAbbreviation("12");
196 result
[1] = TdwgArea
.getAreaByTdwgAbbreviation("13");
197 }else if ("former USSR: North and Central European territory".equalsIgnoreCase(word
)){
198 result
[0] = TdwgArea
.getAreaByTdwgAbbreviation("RUN-OO");
199 result
[1] = TdwgArea
.getAreaByTdwgAbbreviation("RUC-OO");
201 logger
.warn("Double area not recognized");
// Words that adaptWordsToTdwg() reports as " STOP: "; filled by initStopWords().
207 static List
<String
> stopWords
= new ArrayList
<String
>();
// Area names that adaptWordsToTdwg() reports as " UNKNOWN: "; filled by initStopWords().
208 static List
<String
> unknownAreas
= new ArrayList
<String
>();
// Two-letter codes compared against the word in adaptWordsToTdwg(); filled by initStopWords().
209 static List
<String
> higherAreas
= new ArrayList
<String
>();
/**
 * Normalizes an area name taken from an occurrence string so that it can be matched
 * against TDWG area labels.
 *
 * Punctuation is removed first, then a fixed list of literal replacements (spelling
 * variants, historical names, island notations) is applied in the order written, and
 * finally the result is compared with the stopWords, unknownAreas and higherAreas lists.
 *
 * NOTE(review): the return statements are not visible in this listing. The caller
 * compares the result with the empty string, so a match in one of the three lists
 * presumably yields "" - confirm.
 *
 * @param word the raw token or token sequence
 */
211 private String
adaptWordsToTdwg(String word
){
// Commas and semicolons are always dropped.
212 word
= word
.replace(",", "").replace(";", "");
// Periods are dropped as well, except when the word contains "U.S.A".
213 if (! word
.contains("U.S.A")){
214 word
= word
.replace(",", "").replace(".", "").replace(";", "");
216 word
= word
.replace(",", "").replace(";", "");
// Words ending in "Is" get special treatment; the body of this branch is not visible here.
220 if (word
.endsWith("Is")){
// The lists are empty until initialised - presumably initStopWords() is called here; confirm.
223 if (stopWords
.size() == 0){
227 word
= word
.replace("Russia [North European territory]", "North European Russia");
228 word
= word
.replace("Russia North European territory", "North European Russia");
229 word
= word
.replace("Russia: North European territory", "North European Russia");
// NOTE(review): identical to the previous replacement, so it has no further effect.
230 word
= word
.replace("Russia: North European territory", "North European Russia");
232 word
= word
.replace("Amber", "amber");
235 word
= word
.replace("Prince Edward Is.", "Marion-Prince Edward Is.");
236 //or word = word.replace("Prince Edward Is.", "Prince Edward I.");
237 word
= word
.replace("Bahama Is.", "Bahamas");
238 word
= word
.replace("Comores Is.", "Comoros");
239 word
= word
.replace("former Yugoslavia", "Yugoslavia");
240 word
= word
.replace("former Czechoslovakia", "Czechoslovakia");
241 word
= word
.replace("Rhodesia", "Zimbabwe");
// Guard keeps an existing "El Salvador" from being expanded a second time.
242 if (!word
.contains("El Salvador")){
243 word
= word
.replace("Salvador", "El Salvador");
245 word
= word
.replace("Vera Cruz", "Veracruz");
246 word
= word
.replace("Turkmenia", "Turkmenistan");
247 word
= word
.replace("Québeck", "Québec");
248 word
= word
.replace("Quebeck", "Québec");
249 word
= word
.replace("Quebec", "Québec");
250 //word = word.replace("Quebec", "Qu+®bec");
251 //word = word.replace("Quebec", "Qu├®bec");
253 word
= word
.replace("Gambia", "Gambia, The");
254 word
= word
.replace("Mariana Is.", "Marianas");
255 word
= word
.replace("Kenia", "Kenya");
256 word
= word
.replace("Central Africa", "Central African Republic");
257 word
= word
.replace("Canal Zone", "");
258 //word = word.replace("Panama", "Panamá");
259 word
= word
.replace("Panama", "Panamá");
// "Wales" is mapped to Great Britain unless it is part of "New South Wales".
260 if (! word
.contains("New South Wales")){
261 word
= word
.replace("Wales", "Great Britain");
263 word
= word
.replace("Java", "Jawa");
264 word
= word
.replace("former USSR: North European territory", "North European Russia");
265 word
= word
.replace("former USSR: South European territory", "South European Russia");
266 word
= word
.replace("former USSR: Soviet Middle Asia", "Middle Asia");
268 word
= word
.replace("oceanian islands", "Pacific");
269 word
= word
.replace("Ussuri region", "Primorye");
270 word
= word
.replace("Galapagos Is.", "Galápagos");
271 if (! word
.contains("Is.")){
272 word
= word
.replace("Galapagos", "Galápagos");
275 //word = word.replace("Galapagos Is.", "Galápagos");
276 if (! word
.contains("Peninsular")){
277 word
= word
.replace("Malaysia", "Peninsular Malaysia");
279 word
= word
.replace("Polynesic Is.", "South Solomons");
281 word
= word
.replace("Usbek SSR", "Uzbekistan");
282 word
= word
.replace("Mexican amber", "Mexico");
283 word
= word
.replace("Marocco", "Morocco");
// Either half of "Trinidad-Tobago" alone is expanded to the combined name.
284 if (! word
.contains("Tobago")){
285 word
= word
.replace("Trinidad", "Trinidad-Tobago");
287 if (! word
.contains("Trinidad")){
288 word
= word
.replace("Tobago", "Trinidad-Tobago");
// NOTE(review): target and replacement are the same string; this statement changes nothing.
290 word
= word
.replace("Haiti", "Haiti");
291 word
= word
.replace("Moluccas", "Maluku");
292 word
= word
.replace("Belau", "Palau");
293 word
= word
.replace("Dominican amber", "Dominican Republic");
294 if (! word
.contains("Russian")){
295 word
= word
.replace("Far East", "Russian Far East");
297 word
= word
.replace("Tahiti", "Society Is.");
298 word
= word
.replace("Iraque", "Iraq");
299 word
= word
.replace("Wake Island", "Wake I.");
// The trailing period of "I." is added only when the word does not contain "I." yet.
300 if (! word
.contains("I.")){
301 word
= word
.replace("Johnston I", "Johnston I.");
302 word
= word
.replace("Wake I", "Wake I.");
303 word
= word
.replace("Clipperton I", "Clipperton I.");
305 if (! word
.contains("Provinces")){
306 word
= word
.replace("Cape Province", "Cape Provinces");
308 word
= word
.replace("Eastern Cape Provinces", "Eastern Cape Province");
309 if (! word
.contains("Barbuda")){
310 word
= word
.replace("Antigua", "Antigua-Barbuda");
312 if (! word
.contains("St.")){
313 word
= word
.replace("St Vincent", "St.Vincent");
314 word
= word
.replace("St Lucia", "St.Lucia");
315 word
= word
.replace("St Helena", "St.Helena");
317 word
= word
.replace("Asia-tropical", "Asia-Tropical");
318 word
= word
.replace("Society Islands", "Society Is.");
319 word
= word
.replace("Virgin Islands", "Virgin Is.");
320 word
= word
.replace("Canary Islands", "Canary Is.");
321 word
= word
.replace("Rhode Island", "Rhode I.");
324 word
= word
.replace("Rodriguez", "Rodrigues");
325 word
= word
.replace("British Colombia", "British Columbia");
326 word
= word
.replace("Bermudas", "Bermuda");
327 word
= word
.replace("Tunesia", "Tunisia");
328 word
= word
.replace("Santos São Paulo", "São Paulo");
329 word
= word
.replace("Transvaal", "Northern Provinces");
330 word
= word
.replace("Tucumán", "Tucuman");
333 // unknownAreas.add("Baltic amber");
334 // unknownAreas.add("Arabia");
// The normalized word is compared (exact match) with each of the three lists in turn.
336 for (String stopWord
: stopWords
){
337 if (stopWord
.equals(word
)){
338 System
.out
.println(" STOP: " + word
);
342 for (String unknownArea
: unknownAreas
){
343 if (unknownArea
.equals(word
)){
344 System
.out
.println(" UNKNOWN: " + word
);
348 for (String higherArea
: higherAreas
){
349 if (higherArea
.equals(word
)){
/**
 * Fills the static lists stopWords, unknownAreas and higherAreas with their
 * hard-coded entries. Every call appends the entries again, because the lists are
 * plain ArrayLists and are not cleared here.
 *
 * NOTE(review): some original lines of this method are not visible in this listing,
 * so the lists may receive further entries.
 */
359 private void initStopWords(){
360 stopWords
.add("and");
362 stopWords
.add("Is.");
363 stopWords
.add("Islands");
364 stopWords
.add("Island");
367 stopWords
.add("areas");
368 stopWords
.add("USA");
369 stopWords
.add("Australia"); //except for Australia only
370 stopWords
.add("Argentina");
372 //unknownAreas.add("Panama");
373 unknownAreas
.add("South Africa");
374 unknownAreas
.add("Chile");
376 unknownAreas
.add("Baltic amber");
377 unknownAreas
.add("Arabia");
// Two-letter codes collected in higherAreas.
380 higherAreas
.add("AF");
381 higherAreas
.add("OR");
382 higherAreas
.add("PA");
383 higherAreas
.add("AU");
384 higherAreas
.add("NE");
386 higherAreas
.add("NT");
393 public static void main(String
[] args
) {
394 ICdmDataSource cdmDestination
= CdmDestinations
.cdm_test_andreasM2();
395 CdmApplicationController app
= null;
397 DbSchemaValidation val
= DbSchemaValidation
.UPDATE
;
398 app
= CdmApplicationController
.NewInstance(cdmDestination
, val
);
399 } catch (DataSourceNotFoundException e
) {
401 } catch (TermNotFoundException e
) {
404 DipteraDistributionParser dipDist
= new DipteraDistributionParser();
406 dipDist
.doDistribution(app
);
408 logger
.warn("No Application Context");