2 * Copyright (C) 2007 EDIT
3 * European Distributed Institute of Taxonomy
4 * http://www.e-taxonomy.eu
6 * The contents of this file are subject to the Mozilla Public License Version 1.1
7 * See LICENSE.TXT at the top of this package for the full license terms.
10 package eu
.etaxonomy
.cdm
.app
.berlinModelImport
;
12 import java
.util
.ArrayList
;
13 import java
.util
.HashSet
;
14 import java
.util
.List
;
16 import java
.util
.regex
.Pattern
;
18 import org
.apache
.log4j
.Logger
;
19 import org
.springframework
.transaction
.TransactionStatus
;
21 import eu
.etaxonomy
.cdm
.api
.application
.CdmApplicationController
;
22 import eu
.etaxonomy
.cdm
.app
.common
.CdmDestinations
;
23 import eu
.etaxonomy
.cdm
.database
.DataSourceNotFoundException
;
24 import eu
.etaxonomy
.cdm
.database
.DbSchemaValidation
;
25 import eu
.etaxonomy
.cdm
.database
.ICdmDataSource
;
26 import eu
.etaxonomy
.cdm
.model
.common
.Language
;
27 import eu
.etaxonomy
.cdm
.model
.common
.init
.TermNotFoundException
;
28 import eu
.etaxonomy
.cdm
.model
.description
.DescriptionBase
;
29 import eu
.etaxonomy
.cdm
.model
.description
.DescriptionElementBase
;
30 import eu
.etaxonomy
.cdm
.model
.description
.Distribution
;
31 import eu
.etaxonomy
.cdm
.model
.description
.Feature
;
32 import eu
.etaxonomy
.cdm
.model
.description
.PresenceAbsenceTermBase
;
33 import eu
.etaxonomy
.cdm
.model
.description
.PresenceTerm
;
34 import eu
.etaxonomy
.cdm
.model
.description
.TaxonDescription
;
35 import eu
.etaxonomy
.cdm
.model
.description
.TextData
;
36 import eu
.etaxonomy
.cdm
.model
.location
.NamedArea
;
37 import eu
.etaxonomy
.cdm
.model
.location
.TdwgArea
;
38 import eu
.etaxonomy
.cdm
.model
.taxon
.Taxon
;
39 import eu
.etaxonomy
.cdm
.model
.taxon
.TaxonBase
;
/**
 * Reads the English occurrence text stored on taxon descriptions and adds
 * Distribution elements for the areas it can recognize in that text.
 */
46 public class DipteraDistributionParser
{
// Class-wide log4j logger.
47 private static final Logger logger
= Logger
.getLogger(DipteraDistributionParser
.class);
// Regular expression used to cut an occurrence string into words: it matches
// a run of whitespace or a single '[', ']', '(' or ')'.
49 final static String epiSplitter
= "(\\s+|\\[|\\]|\\(|\\))"; //( ' '+| '(' | ')'| '[' | ']' )
// Compiled form of epiSplitter; stays null until doDistribution() assigns it.
50 static Pattern pattern
= null;
/**
 * Walks over the taxon bases returned by the taxon service and, inside one
 * transaction, hands the English text of every occurrence TextData element
 * to parseOccurenceString(). Afterwards it prints the unrecognized strings
 * and the two counters to standard output and commits the transaction.
 *
 * NOTE(review): the embedded line numbers jump in several places (e.g. from
 * 68 to 75); the skipped lines, presumably closing braces, are not visible
 * in this view.
 *
 * @param app application controller that supplies the transaction and the
 *            taxon service
 */
52 protected void doDistribution(CdmApplicationController app
){
// Compile the word splitter once; parseOccurenceString() reads this static field.
53 pattern
= Pattern
.compile(epiSplitter
);
54 TransactionStatus txStatus
= app
.startTransaction();
// NOTE(review): 1000000 and 0 are presumably a result limit and a start
// index - confirm against the taxon service API.
55 List
<TaxonBase
> taxa
= app
.getTaxonService().getAllTaxonBases(1000000, 0);
56 for (TaxonBase taxon
: taxa
){
// Only Taxon instances are examined here.
57 if (taxon
instanceof Taxon
){
58 // unlazyDescription(app, (Taxon)taxon);
59 Set
<TaxonDescription
> descriptions
= ((Taxon
) taxon
).getDescriptions();
60 for (DescriptionBase description
: descriptions
){
// Iterate over a copy of the elements; presumably because
// parseOccurenceString() adds new elements to the same description.
61 Set
<DescriptionElementBase
> descElements
= new HashSet
<DescriptionElementBase
>();
62 descElements
.addAll(description
.getElements());
64 for (DescriptionElementBase descEl
: descElements
){
// Parse only elements whose feature is OCCURRENCE and which are TextData.
65 if (descEl
.getFeature().equals(Feature
.OCCURRENCE())){
66 if (descEl
instanceof TextData
){
67 String occString
= ((TextData
)descEl
).getText(Language
.ENGLISH());
68 parseOccurenceString(occString
, description
);
// Report everything that could not be matched, then the counters.
75 System
.out
.println("Unknowns: ");
76 for (String unknown
: unrekognizedStrings
){
77 System
.out
.println(unknown
);
79 System
.out
.println("Distributions not recognized: " + countNot
);
80 System
.out
.println("Distributions created: " + countYes
);
81 app
.commitTransaction(txStatus
);
// Words that parseOccurenceString() could not map to an area; printed under
// "Unknowns: " by doDistribution().
84 static Set
<String
> unrekognizedStrings
= new HashSet
<String
>();
// Counter printed as "Distributions not recognized: " by doDistribution().
// NOTE(review): the statement that increments it is not visible in this view.
85 static int countNot
= 0;
// Counter printed as "Distributions created: " by doDistribution().
// NOTE(review): the statement that increments it is not visible in this view.
86 static int countYes
= 0;
/**
 * Splits an occurrence string into words and tries to map each word, or a
 * run of consecutive words, to a TDWG area. Recognized areas are added to
 * the given description as Distribution elements; unrecognized words are
 * collected in unrekognizedStrings.
 *
 * NOTE(review): many original lines of this method are not visible in this
 * view (the embedded line numbers have gaps), among them the declarations of
 * i and countSkip and several branch bodies. The comments below describe the
 * visible lines only.
 *
 * @param occString the occurrence text; the parsing is guarded by a null check
 * @param desc the description that receives the created Distribution elements
 */
88 private void parseOccurenceString(String occString
, DescriptionBase desc
){
// Echo the raw input to standard output.
89 System
.out
.println(occString
);
90 if (occString
!= null){
// Split with the static pattern that doDistribution() compiles from epiSplitter.
91 String
[] words
= pattern
.split(occString
);
94 for (String word
: words
){
95 boolean isDoubtful
= false;
// NOTE(review): the 'if' that this 'else if' chain belongs to is not visible here.
98 }else if(word
.contains("widesp") || word
.equals("in")) {
100 }else if(word
.trim().length() == 0){
// Short words (at most 4 characters) ending in ':' get special treatment;
// the body of this branch is not visible here.
103 if (word
.endsWith(":") && word
.length()<=4){
// A '?' is removed before the lookup; presumably the word is also flagged
// as doubtful on a line that is not visible here.
108 if (word
.contains("?")){
110 word
= word
.replace("?", "");
// Normalize the spelling towards TDWG labels.
112 word
= adaptWordsToTdwg(word
);
// Not recognized yet: extend the word with up to six of the following words
// until it matches a TDWG label or a double area.
114 if (! "".equals(word
) && ! TdwgArea
.isTdwgAreaLabel(word
) && ! isDoubleArea(word
)){
115 for (countSkip
= 1; countSkip
<= 6; countSkip
++){
117 if (! TdwgArea
.isTdwgAreaLabel(word
) && ! isDoubleArea(word
)){
// Append the next word only if one exists at index i + countSkip.
118 if (words
.length
> i
+ countSkip
){
119 word
= word
+ " " + words
[i
+ countSkip
];
121 if (word
.contains("?")){
123 word
= word
.replace("?", "");
125 word
= adaptWordsToTdwg(word
);
126 if ("".equals(word
)){
// Final classification of the (possibly extended) word.
134 if ("".equals(word
)){
135 //countSkip = countSkip;
136 }else if (! TdwgArea
.isTdwgAreaLabel(word
) && ! isDoubleArea(word
) ){
137 if (word
.contains("?")){
// Unrecognized: print it and remember it for the final report.
141 System
.out
.println(" False:" + countNot
+ ": " + word
);
142 unrekognizedStrings
.add(word
);
// Recognized: the default status is PRESENT.
145 PresenceAbsenceTermBase
<?
> term
= PresenceTerm
.PRESENT();
// A double area yields one Distribution per area returned by getDoubleArea().
146 if (isDoubleArea(word
)){
147 NamedArea
[] doubleArea
= getDoubleArea(word
);
148 for (NamedArea area
: doubleArea
){
149 Distribution distr
= Distribution
.NewInstance(area
, term
);
150 desc
.addElement(distr
);
// Otherwise the single area is looked up by its TDWG label.
153 NamedArea area
= TdwgArea
.getAreaByTdwgLabel(word
);
// NOTE(review): the condition guarding this reassignment is not visible;
// presumably it depends on isDoubtful - confirm.
155 term
= PresenceTerm
.INTRODUCED_PRESENCE_QUESTIONABLE();
157 Distribution distr
= Distribution
.NewInstance(area
, term
);
158 desc
.addElement(distr
);
161 System
.out
.println(" True:" + countYes
+ ": " + word
);
/**
 * Compares the given word, ignoring case, with labels that stand for two
 * areas at once ("Canary and Madeira Is.", "southern Europe", "former USSR:
 * North and Central European territory").
 *
 * NOTE(review): the end of the condition and the return statements are not
 * visible in this view; further labels may follow the three shown here.
 *
 * @param word the area label to test
 */
171 private boolean isDoubleArea(String word
){
172 if ("Canary and Madeira Is.".equalsIgnoreCase(word
) ||
173 "southern Europe".equalsIgnoreCase(word
) ||
174 "former USSR: North and Central European territory".equalsIgnoreCase(word
)
/**
 * Fills a two-element array with the TDWG areas that belong to a double-area
 * label, looking each area up by its TDWG abbreviation.
 *
 * NOTE(review): the branch header in front of the logger.warn call and the
 * return statement are not visible in this view; presumably the warning is
 * the final else branch and the array is returned - confirm.
 *
 * @param word the double-area label, compared ignoring case
 */
182 private NamedArea
[] getDoubleArea(String word
){
183 NamedArea
[] result
= new NamedArea
[2];
// Canary Islands and Madeira.
184 if ("Canary and Madeira Is.".equalsIgnoreCase(word
)){
185 result
[0] = TdwgArea
.getAreaByTdwgAbbreviation("CNY");
186 result
[1] = TdwgArea
.getAreaByTdwgAbbreviation("MDR");
// NOTE(review): "12" and "13" are presumably the TDWG codes for Southwestern
// and Southeastern Europe - confirm against the TDWG scheme.
187 }else if ("southern Europe".equalsIgnoreCase(word
)){
188 result
[0] = TdwgArea
.getAreaByTdwgAbbreviation("12");
189 result
[1] = TdwgArea
.getAreaByTdwgAbbreviation("13");
190 }else if ("former USSR: North and Central European territory".equalsIgnoreCase(word
)){
191 result
[0] = TdwgArea
.getAreaByTdwgAbbreviation("RUN-OO");
192 result
[1] = TdwgArea
.getAreaByTdwgAbbreviation("RUC-OO");
194 logger
.warn("Double area not recognized");
// Words that adaptWordsToTdwg() reports as " STOP: "; filled by initStopWords().
200 static List
<String
> stopWords
= new ArrayList
<String
>();
// Area names that adaptWordsToTdwg() reports as " UNKNOWN: "; filled by initStopWords().
201 static List
<String
> unknownAreas
= new ArrayList
<String
>();
// Two-letter codes that adaptWordsToTdwg() compares the word against; filled by initStopWords().
202 static List
<String
> higherAreas
= new ArrayList
<String
>();
/**
 * Rewrites a word (or run of words) taken from an occurrence string towards
 * the label used for TDWG areas: punctuation is stripped, misspellings are
 * corrected and older or alternative names are replaced. Afterwards the word
 * is compared with the stopWords, unknownAreas and higherAreas lists.
 *
 * NOTE(review): the embedded line numbers have many gaps; the bodies of
 * several if/for blocks and the return statement are not visible in this
 * view. The comments below describe the visible lines only.
 *
 * @param word the word, or run of words, to normalize
 */
204 private String
adaptWordsToTdwg(String word
){
// Strip punctuation. This removes every '.', including the one in "Is.".
205 word
= word
.replace(",", "").replace(".", "").replace(";", "");
// Spelling fix.
206 word
= word
.replace("Caronlina", "Carolina");
// NOTE(review): body not visible; presumably re-appends the '.' removed above
// so that the "... Is." replacements below can match - confirm.
209 if (word
.endsWith("Is")){
// NOTE(review): body not visible; presumably fills the lists through
// initStopWords() on first use - confirm.
212 if (stopWords
.size() == 0){
216 word
= word
.replace("Russia [North European territory]", "North European Russia");
217 word
= word
.replace("Russia North European territory", "North European Russia");
218 word
= word
.replace("Russia: North European territory", "North European Russia");
// NOTE(review): identical to the preceding statement; it has no additional effect.
219 word
= word
.replace("Russia: North European territory", "North European Russia");
221 word
= word
.replace("Amber", "amber");
224 word
= word
.replace("Prince Edward Is.", "Marion-Prince Edward Is.");
225 //or word = word.replace("Prince Edward Is.", "Prince Edward I.");
226 word
= word
.replace("Bahama Is.", "Bahamas");
227 word
= word
.replace("Comores Is.", "Comoros");
228 word
= word
.replace("former Yugoslavia", "Yugoslavia");
229 word
= word
.replace("former Czechoslovakia", "Czechoslovakia");
230 word
= word
.replace("Rhodesia", "Zimbabwe");
// Guard so that an existing "El Salvador" does not get a second "El ".
231 if (!word
.contains("El Salvador")){
232 word
= word
.replace("Salvador", "El Salvador");
234 word
= word
.replace("Vera Cruz", "Veracruz");
235 word
= word
.replace("Turkmenia", "Turkmenistan");
236 word
= word
.replace("Quebec", "Québec");
237 //word = word.replace("Quebec", "Qu+®bec");
238 //word = word.replace("Quebec", "Qu├®bec");
240 word
= word
.replace("Gambia", "Gambia, The");
241 word
= word
.replace("Mariana Is.", "Marianas");
242 word
= word
.replace("Kenia", "Kenya");
243 word
= word
.replace("Central Africa", "Central African Republic");
// "Canal Zone" is removed entirely.
244 word
= word
.replace("Canal Zone", "");
245 //word = word.replace("Panama", "Panamá");
246 word
= word
.replace("Panama", "Panamá");
// "Wales" inside "New South Wales" must not be rewritten.
247 if (! word
.contains("New South Wales")){
248 word
= word
.replace("Wales", "Great Britain");
250 word
= word
.replace("Java", "Jawa");
251 word
= word
.replace("former USSR: North European territory", "North European Russia");
252 word
= word
.replace("former USSR: South European territory", "South European Russia");
253 word
= word
.replace("former USSR: Soviet Middle Asia", "Middle Asia");
255 word
= word
.replace("oceanian islands", "Pacific");
256 word
= word
.replace("Ussuri region", "Primorye");
257 word
= word
.replace("Galapagos Is.", "Galápagos");
258 //word = word.replace("Galapagos Is.", "Galápagos");
259 word
= word
.replace("Malaysia", "Peninsular Malaysia");
260 word
= word
.replace("Polynesic Is.", "South Solomons");
262 word
= word
.replace("Usbek SSR", "Uzbekistan");
263 word
= word
.replace("Mexican amber", "Mexico");
264 word
= word
.replace("Marocco", "Morocco");
265 word
= word
.replace("Trinidad", "Trinidad-Tobago");
// NOTE(review): replaces "Haiti" with itself - a no-op.
266 word
= word
.replace("Haiti", "Haiti");
267 word
= word
.replace("Moluccas", "Maluku");
268 word
= word
.replace("Belau", "Palau");
269 word
= word
.replace("Dominican amber", "Dominican Republic");
270 word
= word
.replace("Far East", "Russian Far East");
271 word
= word
.replace("Tahiti", "Society Is.");
274 // unknownAreas.add("Baltic amber");
275 // unknownAreas.add("Arabia");
// Compare the normalized word with each list. Only the console output of
// these branches is visible here; what they return is not.
277 for (String stopWord
: stopWords
){
278 if (stopWord
.equals(word
)){
279 System
.out
.println(" STOP: " + word
);
283 for (String unknownArea
: unknownAreas
){
284 if (unknownArea
.equals(word
)){
285 System
.out
.println(" UNKNOWN: " + word
);
289 for (String higherArea
: higherAreas
){
290 if (higherArea
.equals(word
)){
/**
 * Adds the fixed entries to the static lists stopWords, unknownAreas and
 * higherAreas.
 *
 * NOTE(review): the embedded line numbers have gaps, so further entries may
 * exist on lines that are not visible in this view.
 */
300 private void initStopWords(){
// Entries matched by the " STOP: " check in adaptWordsToTdwg().
302 stopWords
.add("also");
303 stopWords
.add("almost");
304 stopWords
.add("and");
305 stopWords
.add("cosmopolitan");
308 stopWords
.add("Is.");
310 stopWords
.add("bordering areas");
311 stopWords
.add("areas");
312 stopWords
.add("USA");
313 stopWords
.add("Australia"); // except for "widesp. in Australia" !!
315 stopWords
.add("part");
316 stopWords
.add("excl");
317 // stopWords.add("European territory"); //part of Russian distributions
318 stopWords
.add("northern part");
319 stopWords
.add("Distr:");
// Entries matched by the " UNKNOWN: " check in adaptWordsToTdwg(). Most of the
// commented-out names appear to be handled elsewhere (replace() calls in
// adaptWordsToTdwg() or the double-area lookup).
321 unknownAreas
.add("Argentina");
322 //unknownAreas.add("Panama");
323 unknownAreas
.add("South Africa");
324 unknownAreas
.add("Indonesia");
325 unknownAreas
.add("Chile");
326 // unknownAreas.add("Wales");
327 // unknownAreas.add("Java");
328 // unknownAreas.add("former USSR: North European territory");
329 // unknownAreas.add("former USSR: South European territory");
330 // unknownAreas.add("former USSR: Soviet Middle Asia");
331 // unknownAreas.add("former USSR: North and Central European territory");
332 // unknownAreas.add("oceanian islands");
333 // unknownAreas.add("Ussuri region");
334 // unknownAreas.add("Galapagos Is.");
335 // unknownAreas.add("Malaysia"); // Malaysia Peninsular exists (level 4)
336 unknownAreas
.add("West Indies"); //-> as a whole
337 // unknownAreas.add("Canal Zone");
338 // unknownAreas.add("Polynesic Is.");
339 // unknownAreas.add("Usbek SSR");
340 // unknownAreas.add("Mexican amber");
341 // unknownAreas.add("southern Europe"); // ->Southeastern Europe, Southwestern Europe
342 // unknownAreas.add("Marocco");
343 // unknownAreas.add("Trinidad"); //-> Trinidad-Tobago
344 // unknownAreas.add("Haiti");
345 // unknownAreas.add("Moluccas"); //-> Indonesia
346 // unknownAreas.add("Belau");
347 unknownAreas
.add("Baltic amber");
348 unknownAreas
.add("Arabia");
349 // unknownAreas.add("Dominican amber");
350 // unknownAreas.add("Canary and Madeira Is."); //-> Canary Is. / Madeira
351 // unknownAreas.add("Dominican amber");
352 // unknownAreas.add("Far East");
353 // unknownAreas.add("Tahiti");
// Two-letter codes compared against the word in adaptWordsToTdwg().
// NOTE(review): presumably abbreviations of biogeographic regions - confirm.
355 higherAreas
.add("AF");
356 higherAreas
.add("OR");
357 higherAreas
.add("PA");
358 higherAreas
.add("AU");
359 higherAreas
.add("NE");
361 higherAreas
.add("NT");
/**
 * Command-line entry point: creates an application controller for the local
 * H2 destination with schema validation UPDATE and runs doDistribution() on
 * a new parser instance.
 *
 * NOTE(review): the 'try' keyword, the bodies of both catch blocks and the
 * condition around the final warning are not visible in this view;
 * presumably the warning is logged when no controller could be created -
 * confirm.
 *
 * @param args command-line arguments; not read on the visible lines
 */
368 public static void main(String
[] args
) {
369 ICdmDataSource cdmDestination
= CdmDestinations
.localH2();
370 CdmApplicationController app
= null;
372 DbSchemaValidation val
= DbSchemaValidation
.UPDATE
;
373 app
= CdmApplicationController
.NewInstance(cdmDestination
, val
);
374 } catch (DataSourceNotFoundException e
) {
376 } catch (TermNotFoundException e
) {
379 DipteraDistributionParser dipDist
= new DipteraDistributionParser();
381 dipDist
.doDistribution(app
);
383 logger
.warn("No Application Context");