if (descEl instanceof TextData){\r
String occString = ((TextData)descEl).getText(Language.ENGLISH());\r
parseOccurenceString(occString, description);\r
+ //app.getTaxonService().saveTaxon(taxon);\r
}\r
}\r
}\r
int i = 0;\r
int countSkip = 0;\r
for (String word: words){\r
+ if (word.contains("U.S.A")){\r
+ logger.warn("U.S.A.");\r
+ }\r
boolean isDoubtful = false;\r
if (countSkip > 0){\r
countSkip--;\r
- }else if(word.contains("widesp") || word.equals("in")) {\r
- //skip\r
}else if(word.trim().length() == 0){\r
//skip\r
}else{\r
}\r
word = adaptWordsToTdwg(word);\r
\r
- if (! "".equals(word) && ! TdwgArea.isTdwgAreaLabel(word) && ! isDoubleArea(word)){\r
+ if (! "".equals(word) && ! TdwgArea.isTdwgAreaLabel(word) && ! TdwgArea.isTdwgAreaAbbreviation(word) && ! isDoubleArea(word)){\r
for (countSkip = 1; countSkip <= 6; countSkip++){\r
word = word.trim();\r
- if (! TdwgArea.isTdwgAreaLabel(word) && ! isDoubleArea(word)){\r
+ if (! TdwgArea.isTdwgAreaLabel(word) && ! TdwgArea.isTdwgAreaAbbreviation(word) && ! isDoubleArea(word)){\r
if (words.length > i + countSkip){\r
word = word + " " + words[i + countSkip];\r
}\r
}\r
if ("".equals(word)){\r
//countSkip = countSkip;\r
- }else if (! TdwgArea.isTdwgAreaLabel(word) && ! isDoubleArea(word) ){\r
+ }else if (! TdwgArea.isTdwgAreaLabel(word) && ! TdwgArea.isTdwgAreaAbbreviation(word) && ! isDoubleArea(word) ){\r
if (word.contains("?")){\r
logger.warn("XXX");\r
}\r
desc.addElement(distr);\r
}\r
}else{\r
- NamedArea area = TdwgArea.getAreaByTdwgLabel(word);\r
+ NamedArea area;\r
+ if (TdwgArea.isTdwgAreaLabel(word)){\r
+ area = TdwgArea.getAreaByTdwgLabel(word);\r
+ }else{\r
+ area = TdwgArea.getAreaByTdwgAbbreviation(word);\r
+ }\r
if (isDoubtful){\r
term = PresenceTerm.INTRODUCED_PRESENCE_QUESTIONABLE();\r
}\r
static List<String> higherAreas = new ArrayList<String>();\r
\r
private String adaptWordsToTdwg(String word){\r
- word = word.replace(",", "").replace(".", "").replace(";", "");\r
- word = word.replace("Caronlina", "Carolina");\r
+ word = word.replace(",", "").replace(";", "");\r
+ if (! word.contains("U.S.A")){\r
+ word = word.replace(",", "").replace(".", "").replace(";", "");\r
+ }else{\r
+ word = word.replace(",", "").replace(";", "");\r
+ }\r
\r
word = word.trim();\r
if (word.endsWith("Is")){\r
}\r
word = word.replace("Vera Cruz", "Veracruz");\r
word = word.replace("Turkmenia", "Turkmenistan");\r
+ word = word.replace("Québeck", "Québec");\r
+ word = word.replace("Quebeck", "Québec");\r
word = word.replace("Quebec", "Québec");\r
//word = word.replace("Quebec", "Qu+®bec");\r
//word = word.replace("Quebec", "Qu├®bec");\r
word = word.replace("oceanian islands", "Pacific");\r
word = word.replace("Ussuri region", "Primorye");\r
word = word.replace("Galapagos Is.", "Galápagos");\r
+ if (! word.contains("Is.")){\r
+ word = word.replace("Galapagos", "Galápagos");\r
+ }\r
+ \r
//word = word.replace("Galapagos Is.", "Galápagos");\r
- word = word.replace("Malaysia", "Peninsular Malaysia");\r
+ if (! word.contains("Peninsular")){\r
+ word = word.replace("Malaysia", "Peninsular Malaysia");\r
+ }\r
word = word.replace("Polynesic Is.", "South Solomons");\r
\r
word = word.replace("Usbek SSR", "Uzbekistan");\r
word = word.replace("Mexican amber", "Mexico");\r
word = word.replace("Marocco", "Morocco");\r
- word = word.replace("Trinidad", "Trinidad-Tobago");\r
+ if (! word.contains("Tobago")){\r
+ word = word.replace("Trinidad", "Trinidad-Tobago");\r
+ }\r
+ if (! word.contains("Trinidad")){\r
+ word = word.replace("Tobago", "Trinidad-Tobago");\r
+ }\r
word = word.replace("Haiti", "Haiti"); \r
word = word.replace("Moluccas", "Maluku");\r
word = word.replace("Belau", "Palau");\r
word = word.replace("Dominican amber", "Dominican Republic");\r
- word = word.replace("Far East", "Russian Far East");\r
+ if (! word.contains("Russian")){\r
+ word = word.replace("Far East", "Russian Far East");\r
+ }\r
word = word.replace("Tahiti", "Society Is.");\r
+ word = word.replace("Iraque", "Iraq");\r
+ word = word.replace("Wake Island", "Wake I.");\r
+ if (! word.contains("I.")){\r
+ word = word.replace("Johnston I", "Johnston I.");\r
+ word = word.replace("Wake I", "Wake I.");\r
+ word = word.replace("Clipperton I", "Clipperton I.");\r
+ }\r
+ if (! word.contains("Provinces")){\r
+ word = word.replace("Cape Province", "Cape Provinces");\r
+ }\r
+ word = word.replace("Eastern Cape Provinces", "Eastern Cape Province");\r
+ if (! word.contains("Barbuda")){\r
+ word = word.replace("Antigua", "Antigua-Barbuda");\r
+ }\r
+ if (! word.contains("St.")){\r
+ word = word.replace("St Vincent", "St.Vincent");\r
+ word = word.replace("St Lucia", "St.Lucia");\r
+ word = word.replace("St Helena", "St.Helena");\r
+ }\r
+ word = word.replace("Asia-tropical", "Asia-Tropical");\r
+ word = word.replace("Society Islands", "Society Is.");\r
+ word = word.replace("Virgin Islands", "Virgin Is.");\r
+ word = word.replace("Canary Islands", "Canary Is.");\r
+ word = word.replace("Rhode Island", "Rhode I.");\r
+ \r
+ \r
+ word = word.replace("Rodriguez", "Rodrigues");\r
+ word = word.replace("British Colombia", "British Columbia");\r
+ word = word.replace("Bermudas", "Bermuda");\r
+ word = word.replace("Tunesia", "Tunisia");\r
+ word = word.replace("Santos São Paulo", "São Paulo");\r
+ word = word.replace("Transvaal", "Northern Provinces");\r
+ word = word.replace("Tucumán", "Tucuman");\r
\r
\r
// unknownAreas.add("Baltic amber"); \r
}\r
\r
private void initStopWords(){\r
- stopWords.add("to");\r
- stopWords.add("also");\r
- stopWords.add("almost");\r
stopWords.add("and");\r
- stopWords.add("cosmopolitan");\r
- stopWords.add("s");\r
stopWords.add("Is");\r
stopWords.add("Is.");\r
+ stopWords.add("Islands");\r
+ stopWords.add("Island");\r
+ \r
stopWords.add("of");\r
- stopWords.add("bordering areas");\r
stopWords.add("areas");\r
stopWords.add("USA");\r
- stopWords.add("Australia"); // except for "widesp. in Australia" !!\r
- stopWords.add("&");\r
- stopWords.add("part");\r
- stopWords.add("excl");\r
-// stopWords.add("European territory"); //part of Russian distributions\r
- stopWords.add("northern part");\r
- stopWords.add("Distr:");\r
- \r
- unknownAreas.add("Argentina");\r
+ stopWords.add("Australia"); //except for Australia only\r
+ stopWords.add("Argentina"); \r
+\r
//unknownAreas.add("Panama");\r
unknownAreas.add("South Africa");\r
- unknownAreas.add("Indonesia");\r
unknownAreas.add("Chile");\r
-// unknownAreas.add("Wales");\r
-// unknownAreas.add("Java");\r
-// unknownAreas.add("former USSR: North European territory");\r
-// unknownAreas.add("former USSR: South European territory");\r
-// unknownAreas.add("former USSR: Soviet Middle Asia");\r
-// unknownAreas.add("former USSR: North and Central European territory");\r
-// unknownAreas.add("oceanian islands");\r
-// unknownAreas.add("Ussuri region");\r
-// unknownAreas.add("Galapagos Is.");\r
-// unknownAreas.add("Malaysia"); // Malaysia Peninsular exists (level 4)\r
- unknownAreas.add("West Indies"); //-> as a whole\r
-// unknownAreas.add("Canal Zone"); \r
-// unknownAreas.add("Polynesic Is."); \r
-// unknownAreas.add("Usbek SSR"); \r
-// unknownAreas.add("Mexican amber"); \r
-// unknownAreas.add("southern Europe"); // ->Southeastern Europe, Southwestern Europe\r
-// unknownAreas.add("Marocco"); \r
-// unknownAreas.add("Trinidad"); //-> Trinidad-Tobago\r
-// unknownAreas.add("Haiti"); \r
-// unknownAreas.add("Moluccas"); //-> Indonesia \r
-// unknownAreas.add("Belau"); \r
+\r
unknownAreas.add("Baltic amber"); \r
unknownAreas.add("Arabia"); \r
-// unknownAreas.add("Dominican amber"); \r
-// unknownAreas.add("Canary and Madeira Is."); //-> Canary Is. / Madeira \r
-// unknownAreas.add("Dominican amber"); \r
-// unknownAreas.add("Far East"); \r
-// unknownAreas.add("Tahiti"); \r
+\r
\r
higherAreas.add("AF");\r
higherAreas.add("OR");\r
* @param args\r
*/\r
public static void main(String[] args) {\r
- ICdmDataSource cdmDestination = CdmDestinations.localH2();\r
+ ICdmDataSource cdmDestination = CdmDestinations.cdm_test_andreasM2();\r
CdmApplicationController app = null;\r
try {\r
DbSchemaValidation val = DbSchemaValidation.UPDATE;\r