-/**\r
-* Copyright (C) 2007 EDIT\r
-* European Distributed Institute of Taxonomy \r
-* http://www.e-taxonomy.eu\r
-* \r
-* The contents of this file are subject to the Mozilla Public License Version 1.1\r
-* See LICENSE.TXT at the top of this package for the full license terms.\r
-*/\r
-\r
-/**\r
-* Copyright (C) 2007 EDIT\r
-* European Distributed Institute of Taxonomy \r
-* http://www.e-taxonomy.eu\r
-* \r
-* The contents of this file are subject to the Mozilla Public License Version 1.1\r
-* See LICENSE.TXT at the top of this package for the full license terms.\r
-*/\r
-package eu.etaxonomy.cdm.app.wp6.diptera;\r
-\r
-import java.util.ArrayList;\r
-import java.util.HashSet;\r
-import java.util.List;\r
-import java.util.Set;\r
-import java.util.regex.Pattern;\r
-\r
-import org.apache.log4j.Logger;\r
-import org.springframework.transaction.TransactionStatus;\r
-\r
-import eu.etaxonomy.cdm.api.application.CdmApplicationController;\r
-import eu.etaxonomy.cdm.api.application.ICdmApplicationConfiguration;\r
-import eu.etaxonomy.cdm.app.common.CdmDestinations;\r
-import eu.etaxonomy.cdm.database.DbSchemaValidation;\r
-import eu.etaxonomy.cdm.database.ICdmDataSource;\r
-import eu.etaxonomy.cdm.model.common.Language;\r
-import eu.etaxonomy.cdm.model.description.DescriptionBase;\r
-import eu.etaxonomy.cdm.model.description.DescriptionElementBase;\r
-import eu.etaxonomy.cdm.model.description.Distribution;\r
-import eu.etaxonomy.cdm.model.description.Feature;\r
-import eu.etaxonomy.cdm.model.description.PresenceAbsenceTermBase;\r
-import eu.etaxonomy.cdm.model.description.PresenceTerm;\r
-import eu.etaxonomy.cdm.model.description.TaxonDescription;\r
-import eu.etaxonomy.cdm.model.description.TextData;\r
-import eu.etaxonomy.cdm.model.location.NamedArea;\r
-import eu.etaxonomy.cdm.model.location.TdwgArea;\r
-import eu.etaxonomy.cdm.model.taxon.Taxon;\r
-import eu.etaxonomy.cdm.model.taxon.TaxonBase;\r
-\r
-/**\r
- * @author a.mueller\r
- * @created 17.10.2008\r
- * @version 1.0\r
- */\r
-public class DipteraDistributionParser {\r
- private static final Logger logger = Logger.getLogger(DipteraDistributionParser.class);\r
- \r
- private static ICdmDataSource cdmDestination = CdmDestinations.cdm_edit_diptera_preview_B();\r
-\r
- final static String epiSplitter = "(\\s+|\\[|\\]|\\(|\\))"; //( ' '+| '(' | ')'| '[' | ']' )\r
- static Pattern pattern = null;\r
- \r
- protected void doDistribution(ICdmApplicationConfiguration app){\r
- pattern = Pattern.compile(epiSplitter); \r
- TransactionStatus txStatus = app.startTransaction();\r
- List<TaxonBase> taxa = app.getTaxonService().list(null, null, null, null, null);\r
- for (TaxonBase taxon: taxa ){\r
- if (taxon instanceof Taxon){\r
- // unlazyDescription(app, (Taxon)taxon);\r
- Set<TaxonDescription> descriptions = ((Taxon) taxon).getDescriptions();\r
- for (DescriptionBase description: descriptions){\r
- Set<DescriptionElementBase> descElements = new HashSet<DescriptionElementBase>();\r
- descElements.addAll(description.getElements());\r
- \r
- for (DescriptionElementBase descEl: descElements){\r
- if (descEl.getFeature().equals(Feature.OCCURRENCE())){\r
- if (descEl instanceof TextData){\r
- String occString = ((TextData)descEl).getText(Language.ENGLISH());\r
- parseOccurenceString(occString, description);\r
- //app.getTaxonService().saveTaxon(taxon);\r
- }\r
- }\r
- }\r
- }\r
- }\r
- }\r
- System.out.println("Unknowns: ");\r
- for (String unknown: unrekognizedStrings){\r
- System.out.println(unknown);\r
- }\r
- System.out.println("Distributions not recognized: " + countNot);\r
- System.out.println("Distributions created: " + countYes);\r
- app.commitTransaction(txStatus);\r
- }\r
- \r
- static Set<String> unrekognizedStrings = new HashSet<String>();\r
- static int countNot = 0;\r
- static int countYes = 0;\r
- \r
- private void parseOccurenceString(String occString, DescriptionBase desc){\r
- System.out.println(occString);\r
- if (occString != null){\r
- String[] words = pattern.split(occString);\r
- int i = 0;\r
- int countSkip = 0;\r
- for (String word: words){\r
- if (word.contains("U.S.A")){\r
- logger.warn("U.S.A.");\r
- }\r
- boolean isDoubtful = false;\r
- if (countSkip > 0){\r
- countSkip--;\r
- }else if(word.trim().length() == 0){\r
- //skip\r
- }else{\r
- if (word.endsWith(":") && word.length()<=4){\r
- //Higher area\r
- //TODO\r
- }else{\r
- word = word.trim();\r
- if (word.contains("?")){\r
- isDoubtful = true;\r
- word = word.replace("?", "");\r
- }\r
- word = adaptWordsToTdwg(word);\r
- \r
- if (! "".equals(word) && ! TdwgArea.isTdwgAreaLabel(word) && ! TdwgArea.isTdwgAreaAbbreviation(word) && ! isDoubleArea(word)){\r
- for (countSkip = 1; countSkip <= 6; countSkip++){\r
- word = word.trim();\r
- if (! TdwgArea.isTdwgAreaLabel(word) && ! TdwgArea.isTdwgAreaAbbreviation(word) && ! isDoubleArea(word)){\r
- if (words.length > i + countSkip){\r
- word = word + " " + words[i + countSkip];\r
- }\r
- if (word.contains("?")){\r
- isDoubtful = true;\r
- word = word.replace("?", "");\r
- }\r
- word = adaptWordsToTdwg(word);\r
- if ("".equals(word)){\r
- break;\r
- }\r
- }else{\r
- break;\r
- }\r
- }\r
- }\r
- if ("".equals(word)){\r
- //countSkip = countSkip;\r
- }else if (! TdwgArea.isTdwgAreaLabel(word) && ! TdwgArea.isTdwgAreaAbbreviation(word) && ! isDoubleArea(word) ){\r
- if (word.contains("?")){\r
- logger.warn("XXX");\r
- }\r
- countNot++;\r
- System.out.println(" False:" + countNot + ": " + word);\r
- unrekognizedStrings.add(word);\r
- countSkip = 0;\r
- }else{\r
- if (word.equals("Netherlands")){\r
- if ( countSkip < 0 && words[i + 1].startsWith("Antilles")){\r
- word = "Netherlands Antilles";\r
- countSkip=2;\r
- }\r
- }\r
- PresenceAbsenceTermBase<?> term = PresenceTerm.PRESENT();\r
- if (isDoubleArea(word)){\r
- NamedArea[] doubleArea = getDoubleArea(word);\r
- for (NamedArea area : doubleArea){\r
- Distribution distr = Distribution.NewInstance(area, term);\r
- desc.addElement(distr);\r
- }\r
- }else{\r
- NamedArea area;\r
- if (TdwgArea.isTdwgAreaLabel(word)){\r
- area = TdwgArea.getAreaByTdwgLabel(word);\r
- }else{\r
- area = TdwgArea.getAreaByTdwgAbbreviation(word);\r
- }\r
- if (isDoubtful){\r
- term = PresenceTerm.INTRODUCED_PRESENCE_QUESTIONABLE();\r
- }\r
- Distribution distr = Distribution.NewInstance(area, term);\r
- desc.addElement(distr);\r
- }\r
- countYes++;\r
- System.out.println(" True:" + countYes + ": " + word);\r
- countSkip--;\r
- }\r
- }\r
- }\r
- i++;\r
- }\r
- }\r
- }\r
- \r
- private boolean isDoubleArea(String word){\r
- if ("Canary and Madeira Is.".equalsIgnoreCase(word) || \r
- "southern Europe".equalsIgnoreCase(word) ||\r
- "former USSR: North and Central European territory".equalsIgnoreCase(word)\r
- ){\r
- return true;\r
- }else{\r
- return false;\r
- }\r
- }\r
- \r
- private NamedArea[] getDoubleArea(String word){\r
- NamedArea[] result = new NamedArea[2];\r
- if ("Canary and Madeira Is.".equalsIgnoreCase(word)){\r
- result[0] = TdwgArea.getAreaByTdwgAbbreviation("CNY");\r
- result[1] = TdwgArea.getAreaByTdwgAbbreviation("MDR");\r
- }else if ("southern Europe".equalsIgnoreCase(word)){\r
- result[0] = TdwgArea.getAreaByTdwgAbbreviation("12");\r
- result[1] = TdwgArea.getAreaByTdwgAbbreviation("13");\r
- }else if ("former USSR: North and Central European territory".equalsIgnoreCase(word)){\r
- result[0] = TdwgArea.getAreaByTdwgAbbreviation("RUN-OO");\r
- result[1] = TdwgArea.getAreaByTdwgAbbreviation("RUC-OO");\r
- }else{\r
- logger.warn("Double area not recognized");\r
- }\r
- return result;\r
- }\r
- \r
- \r
- static List<String> stopWords = new ArrayList<String>();\r
- static List<String> unknownAreas = new ArrayList<String>();\r
- static List<String> higherAreas = new ArrayList<String>();\r
- \r
- private String adaptWordsToTdwg(String word){\r
- word = word.replace(",", "").replace(";", "");\r
- if (! word.contains("U.S.A")){\r
- word = word.replace(",", "").replace(".", "").replace(";", "");\r
- }else{\r
- word = word.replace(",", "").replace(";", "");\r
- }\r
- \r
- word = word.trim();\r
- if (word.endsWith("Is")){\r
- word = word + ".";\r
- }\r
- if (stopWords.size() == 0){\r
- initStopWords();\r
- }\r
- \r
- word = word.replace("Russia [North European territory]", "North European Russia");\r
- word = word.replace("Russia North European territory", "North European Russia");\r
- word = word.replace("Russia: North European territory", "North European Russia");\r
- word = word.replace("Russia: North European territory", "North European Russia");\r
- \r
- word = word.replace("Amber", "amber");\r
- \r
- \r
- word = word.replace("Prince Edward Is.", "Marion-Prince Edward Is.");\r
- //or word = word.replace("Prince Edward Is.", "Prince Edward I.");\r
- word = word.replace("Bahama Is.", "Bahamas");\r
- word = word.replace("Comores Is.", "Comoros");\r
- word = word.replace("former Yugoslavia", "Yugoslavia");\r
- word = word.replace("former Czechoslovakia", "Czechoslovakia");\r
- word = word.replace("Rhodesia", "Zimbabwe");\r
- word = word.replace("The Gambia", "Gambia, The");\r
-\r
- if (!word.contains("El Salvador")){\r
- word = word.replace("Salvador", "El Salvador"); \r
- }\r
- word = word.replace("Vera Cruz", "Veracruz");\r
- word = word.replace("Turkmenia", "Turkmenistan");\r
- word = word.replace("Qu\u00E9beck", "Qu\u00E9bec");\r
- word = word.replace("Quebeck", "Qu\u00E9bec");\r
- word = word.replace("Quebec", "Qu\u00E9bec");\r
- \r
- if (!word.contains("Gambia, The")){\r
- word = word.replace("Gambia", "Gambia, The");\r
- }\r
- word = word.replace("Mariana Is.", "Marianas");\r
- word = word.replace("Kenia", "Kenya");\r
- word = word.replace("Central Africa", "Central African Republic");\r
- word = word.replace("Canal Zone", "");\r
- //word = word.replace("Panama", "Panamá");\r
- word = word.replace("Panama", "Panam\u00E1");\r
- if (! word.contains("New South Wales")){\r
- word = word.replace("Wales", "Great Britain");\r
- }\r
- word = word.replace("Java", "Jawa");\r
- word = word.replace("former USSR: North European territory", "North European Russia");\r
- word = word.replace("former USSR: South European territory", "South European Russia");\r
- word = word.replace("former USSR: Soviet Middle Asia", "Middle Asia");\r
- \r
- word = word.replace("St Kitts-Nevis", "St.Kitts-Nevis");\r
- \r
- word = word.replace("oceanian islands", "Pacific");\r
- word = word.replace("Ussuri region", "Primorye");\r
- word = word.replace("Galapagos Is.", "Gal\u00E1pagos");\r
- word = word.replace("Tarapac\u00E1", "Tarapaca");\r
- word = word.replace("Reunion", "R\u00E9union");\r
- if (! word.contains("Is.")){\r
- word = word.replace("Galapagos", "Gal\u00E1pagos");\r
- }\r
- \r
- //word = word.replace("Galapagos Is.", "Galápagos");\r
- if (! word.contains("Peninsular")){\r
- word = word.replace("Malaysia", "Peninsular Malaysia");\r
- }\r
- word = word.replace("Polynesic Is.", "South Solomons");\r
- \r
- word = word.replace("Usbek SSR", "Uzbekistan");\r
- word = word.replace("Mexican amber", "Mexico");\r
- word = word.replace("Marocco", "Morocco");\r
- if (! word.contains("Tobago")){\r
- word = word.replace("Trinidad", "Trinidad-Tobago");\r
- }\r
- if (! word.contains("Trinidad")){\r
- word = word.replace("Tobago", "Trinidad-Tobago");\r
- }\r
- word = word.replace("Haiti", "Haiti"); \r
- word = word.replace("Moluccas", "Maluku");\r
- word = word.replace("Belau", "Palau");\r
- word = word.replace("Dominican amber", "Dominican Republic");\r
- if (! word.contains("Russian")){\r
- word = word.replace("Far East", "Russian Far East");\r
- }\r
- word = word.replace("Tahiti", "Society Is.");\r
- word = word.replace("Iraque", "Iraq");\r
- word = word.replace("Wake Island", "Wake I.");\r
- if (! word.contains("I.")){\r
- word = word.replace("Johnston I", "Johnston I.");\r
- word = word.replace("Wake I", "Wake I.");\r
- word = word.replace("Clipperton I", "Clipperton I.");\r
- }\r
- if (! word.contains("Provinces")){\r
- word = word.replace("Cape Province", "Cape Provinces");\r
- }\r
- word = word.replace("Eastern Cape Provinces", "Eastern Cape Province");\r
- word = word.replace("Western Cape Provinces", "Western Cape Province");\r
- if (! word.contains("Barbuda")){\r
- word = word.replace("Antigua", "Antigua-Barbuda");\r
- }\r
- if (! word.contains("St.")){\r
- word = word.replace("St Vincent", "St.Vincent");\r
- word = word.replace("St Lucia", "St.Lucia");\r
- word = word.replace("St Helena", "St.Helena");\r
- }\r
- word = word.replace("Asia-tropical", "Asia-Tropical");\r
- word = word.replace("Society Islands", "Society Is.");\r
- word = word.replace("Virgin Islands", "Virgin Is.");\r
- word = word.replace("Canary Islands", "Canary Is.");\r
- word = word.replace("Rhode Island", "Rhode I.");\r
- \r
- \r
- word = word.replace("Rodriguez", "Rodrigues");\r
- word = word.replace("British Colombia", "British Columbia");\r
- word = word.replace("Bermudas", "Bermuda");\r
- word = word.replace("Tunesia", "Tunisia");\r
- word = word.replace("Santos S\u00E3o Paulo", "S\u00E3o Paulo");\r
- word = word.replace("Transvaal", "Northern Provinces");\r
- word = word.replace("Tucum\u00E1n", "Tucuman");\r
-// if (!word.contains("Netherlands")){\r
-// \r
-// }\r
- \r
-// unknownAreas.add("Baltic amber"); \r
-// unknownAreas.add("Arabia"); \r
- \r
- for (String stopWord : stopWords){\r
- if (stopWord.equals(word)){\r
- System.out.println(" STOP: " + word);\r
- return "";\r
- }\r
- }\r
- for (String unknownArea : unknownAreas){\r
- if (unknownArea.equals(word)){\r
- System.out.println(" UNKNOWN: " + word);\r
- return "";\r
- }\r
- }\r
- for (String higherArea : higherAreas){\r
- if (higherArea.equals(word)){\r
- return "";\r
- }\r
- }\r
- \r
- //higher regions\r
- \r
- return word;\r
- }\r
- \r
- private void initStopWords(){\r
- stopWords.add("and");\r
- stopWords.add("Is");\r
- stopWords.add("Is.");\r
- stopWords.add("Islands");\r
- stopWords.add("Island");\r
- \r
- stopWords.add("of");\r
- stopWords.add("areas");\r
- stopWords.add("USA");\r
- stopWords.add("Australia"); //except for Australia only\r
- stopWords.add("Argentina"); \r
-\r
- //unknownAreas.add("Panama");\r
- unknownAreas.add("South Africa");\r
- unknownAreas.add("Chile");\r
-\r
- unknownAreas.add("Baltic amber"); \r
- unknownAreas.add("Arabia"); \r
-\r
- \r
- higherAreas.add("AF");\r
- higherAreas.add("OR");\r
- higherAreas.add("PA");\r
- higherAreas.add("AU");\r
- higherAreas.add("NE");\r
- \r
- higherAreas.add("NT");\r
- }\r
-\r
- \r
- /**\r
- * @param args\r
- */\r
- public static void main(String[] args) {\r
- CdmApplicationController app = null;\r
- DbSchemaValidation val = DbSchemaValidation.UPDATE;\r
- app = CdmApplicationController.NewInstance(cdmDestination, val);\r
- \r
- DipteraDistributionParser dipDist = new DipteraDistributionParser();\r
- if (app != null){\r
- dipDist.doDistribution(app);\r
- }else{\r
- logger.warn("No Application Context");\r
- }\r
- }\r
-}\r
+/**
+* Copyright (C) 2007 EDIT
+* European Distributed Institute of Taxonomy
+* http://www.e-taxonomy.eu
+*
+* The contents of this file are subject to the Mozilla Public License Version 1.1
+* See LICENSE.TXT at the top of this package for the full license terms.
+*/
+
+/**
+* Copyright (C) 2007 EDIT
+* European Distributed Institute of Taxonomy
+* http://www.e-taxonomy.eu
+*
+* The contents of this file are subject to the Mozilla Public License Version 1.1
+* See LICENSE.TXT at the top of this package for the full license terms.
+*/
+package eu.etaxonomy.cdm.app.wp6.diptera;
+
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+import java.util.regex.Pattern;
+
+import org.apache.log4j.Logger;
+import org.springframework.transaction.TransactionStatus;
+
+import eu.etaxonomy.cdm.api.application.CdmApplicationController;
+import eu.etaxonomy.cdm.api.application.ICdmRepository;
+import eu.etaxonomy.cdm.app.common.CdmDestinations;
+import eu.etaxonomy.cdm.database.DbSchemaValidation;
+import eu.etaxonomy.cdm.database.ICdmDataSource;
+import eu.etaxonomy.cdm.io.common.TdwgAreaProvider;
+import eu.etaxonomy.cdm.model.common.Language;
+import eu.etaxonomy.cdm.model.description.DescriptionBase;
+import eu.etaxonomy.cdm.model.description.DescriptionElementBase;
+import eu.etaxonomy.cdm.model.description.Distribution;
+import eu.etaxonomy.cdm.model.description.Feature;
+import eu.etaxonomy.cdm.model.description.PresenceAbsenceTerm;
+import eu.etaxonomy.cdm.model.description.TaxonDescription;
+import eu.etaxonomy.cdm.model.description.TextData;
+import eu.etaxonomy.cdm.model.location.NamedArea;
+import eu.etaxonomy.cdm.model.taxon.Taxon;
+import eu.etaxonomy.cdm.model.taxon.TaxonBase;
+
+/**
+ * @author a.mueller
+ * @since 17.10.2008
+ * @version 1.0
+ */
+public class DipteraDistributionParser {
+ /** Log4j logger of this class. */
+ private static final Logger logger = Logger.getLogger(DipteraDistributionParser.class);
+ 
+ /** Data source passed to CdmApplicationController.NewInstance() in main(). */
+ private static ICdmDataSource cdmDestination = CdmDestinations.localH2();
+
+ /** Regular expression used to split an occurrence text into tokens: whitespace runs and the characters [ ] ( ). */
+ final static String epiSplitter = "(\\s+|\\[|\\]|\\(|\\))"; //( ' '+| '(' | ')'| '[' | ']' )
+ /** Compiled form of epiSplitter; assigned in doDistribution() and used by parseOccurenceString(). */
+ static Pattern pattern = null;
+
+    /**
+     * Parses the occurrence texts of all taxa into {@link Distribution} elements.
+     * <p>
+     * Inside one transaction all taxon bases are listed; for every {@link Taxon} each
+     * {@link TextData} element with feature {@link Feature#OCCURRENCE()} is read in English and
+     * handed to {@link #parseOccurenceString(String, DescriptionBase)}, which adds the
+     * distributions to the same description. Afterwards the unrecognized strings and the
+     * two counters are printed to standard out and the transaction is committed.
+     *
+     * @param app the repository used to start/commit the transaction and to access the taxon service
+     */
+    protected void doDistribution(ICdmRepository app){
+        pattern = Pattern.compile(epiSplitter);
+        TransactionStatus txStatus = app.startTransaction();
+        List<TaxonBase> taxa = app.getTaxonService().list(null, null, null, null, null);
+        for (TaxonBase taxon : taxa){
+            if (!(taxon instanceof Taxon)){
+                continue;
+            }
+            Set<TaxonDescription> descriptions = ((Taxon) taxon).getDescriptions();
+            for (DescriptionBase description : descriptions){
+                // Iterate over a copy: parseOccurenceString() adds elements to the description.
+                Set<DescriptionElementBase> descElements = new HashSet<DescriptionElementBase>();
+                descElements.addAll(description.getElements());
+
+                for (DescriptionElementBase descEl : descElements){
+                    Feature feature = descEl.getFeature();
+                    // Elements without a feature are skipped instead of raising a NullPointerException.
+                    boolean isOccurrence = feature != null && feature.equals(Feature.OCCURRENCE());
+                    if (isOccurrence && descEl instanceof TextData){
+                        String occString = ((TextData) descEl).getText(Language.ENGLISH());
+                        parseOccurenceString(occString, description);
+                    }
+                }
+            }
+        }
+        System.out.println("Unknowns: ");
+        for (String unknown : unrekognizedStrings){
+            System.out.println(unknown);
+        }
+        System.out.println("Distributions not recognized: " + countNot);
+        System.out.println("Distributions created: " + countYes);
+        app.commitTransaction(txStatus);
+    }
+
+ /** Distinct phrases parseOccurenceString() could not resolve to an area; printed at the end of doDistribution(). */
+ static Set<String> unrekognizedStrings = new HashSet<String>();
+ /** Number of phrases that were not recognized as an area (incremented in parseOccurenceString()). */
+ static int countNot = 0;
+ /** Number of phrases that were recognized and turned into distributions (incremented in parseOccurenceString()). */
+ static int countYes = 0;
+
+    /**
+     * Splits an occurrence text into tokens and adds a {@link Distribution} to the given
+     * description for every token (or run of up to seven consecutive tokens) that
+     * resolves to a TDWG area label, a TDWG area abbreviation or a "double area".
+     * <p>
+     * Tokens are normalized with {@link #adaptWordsToTdwg(String)}. A token that is not
+     * recognized on its own is extended with up to six following tokens until the phrase is
+     * recognized or normalizes to the empty string. Phrases that stay unrecognized are
+     * counted in {@code countNot} and collected in {@code unrekognizedStrings}; recognized
+     * ones are counted in {@code countYes}.
+     *
+     * @param occString the occurrence text; {@code null} is printed and otherwise ignored
+     * @param desc the description the new distributions are added to
+     */
+    private void parseOccurenceString(String occString, DescriptionBase desc){
+        System.out.println(occString);
+        if (occString == null){
+            return;
+        }
+        if (pattern == null){
+            // Normally compiled by doDistribution(); guard against being called before it.
+            pattern = Pattern.compile(epiSplitter);
+        }
+        String[] words = pattern.split(occString);
+        // > 0: number of following tokens already consumed by a multi-token phrase.
+        // May become negative (see the decrement at the end of the loop); only "> 0" is tested.
+        int countSkip = 0;
+        for (int i = 0; i < words.length; i++){
+            String word = words[i];
+            if (word.contains("U.S.A")){
+                logger.warn("U.S.A.");
+            }
+            if (countSkip > 0){
+                countSkip--;
+                continue;
+            }
+            if (word.trim().length() == 0){
+                continue;
+            }
+            if (word.endsWith(":") && word.length() <= 4){
+                //Higher area
+                //TODO
+                continue;
+            }
+
+            boolean isDoubtful = false;
+            word = word.trim();
+            if (word.contains("?")){
+                isDoubtful = true;
+                word = word.replace("?", "");
+            }
+            word = adaptWordsToTdwg(word);
+
+            if (! "".equals(word) && ! isKnownArea(word)){
+                // Not an area on its own: append following tokens one by one.
+                for (countSkip = 1; countSkip <= 6; countSkip++){
+                    word = word.trim();
+                    if (isKnownArea(word)){
+                        break;
+                    }
+                    if (words.length > i + countSkip){
+                        word = word + " " + words[i + countSkip];
+                    }
+                    if (word.contains("?")){
+                        isDoubtful = true;
+                        word = word.replace("?", "");
+                    }
+                    word = adaptWordsToTdwg(word);
+                    if ("".equals(word)){
+                        break;
+                    }
+                }
+            }
+
+            if ("".equals(word)){
+                // Stop word, unknown area or higher area: nothing to create, countSkip is kept.
+                continue;
+            }
+            if (! isKnownArea(word)){
+                if (word.contains("?")){
+                    logger.warn("XXX");
+                }
+                countNot++;
+                System.out.println(" False:" + countNot + ": " + word);
+                unrekognizedStrings.add(word);
+                countSkip = 0;
+                continue;
+            }
+
+            // "Netherlands" is an area itself, so "Netherlands Antilles" must be merged explicitly.
+            // The bounds check replaces the former "countSkip < 0" condition, which depended on
+            // earlier tokens and did not protect the access to words[i + 1].
+            if (word.equals("Netherlands") && i + 1 < words.length && words[i + 1].startsWith("Antilles")){
+                word = "Netherlands Antilles";
+                countSkip = 2;
+            }
+            PresenceAbsenceTerm term = PresenceAbsenceTerm.PRESENT();
+            if (isDoubleArea(word)){
+                // NOTE(review): isDoubtful is not applied to double areas - confirm this is intended.
+                NamedArea[] doubleArea = getDoubleArea(word);
+                for (NamedArea area : doubleArea){
+                    Distribution distr = Distribution.NewInstance(area, term);
+                    desc.addElement(distr);
+                }
+            }else{
+                NamedArea area;
+                if (TdwgAreaProvider.isTdwgAreaLabel(word)){
+                    area = TdwgAreaProvider.getAreaByTdwgLabel(word);
+                }else{
+                    area = TdwgAreaProvider.getAreaByTdwgAbbreviation(word);
+                }
+                if (isDoubtful){
+                    term = PresenceAbsenceTerm.INTRODUCED_PRESENCE_QUESTIONABLE();
+                }
+                Distribution distr = Distribution.NewInstance(area, term);
+                desc.addElement(distr);
+            }
+            countYes++;
+            System.out.println(" True:" + countYes + ": " + word);
+            countSkip--;
+        }
+    }
+
+    /**
+     * Returns whether the phrase is a TDWG area label, a TDWG area abbreviation or one of the
+     * double areas known to {@link #isDoubleArea(String)} (checked in this order).
+     */
+    private boolean isKnownArea(String word){
+        return TdwgAreaProvider.isTdwgAreaLabel(word)
+                || TdwgAreaProvider.isTdwgAreaAbbreviation(word)
+                || isDoubleArea(word);
+    }
+
+    /**
+     * Tells whether the phrase is one of the three hard-coded names that stand for two areas
+     * at once. The comparison ignores case.
+     *
+     * @param word the phrase to test
+     * @return {@code true} if {@link #getDoubleArea(String)} knows the phrase
+     */
+    private boolean isDoubleArea(String word){
+        return "Canary and Madeira Is.".equalsIgnoreCase(word)
+                || "southern Europe".equalsIgnoreCase(word)
+                || "former USSR: North and Central European territory".equalsIgnoreCase(word);
+    }
+
+    /**
+     * Resolves a double-area phrase to its two areas, looked up by TDWG abbreviation.
+     *
+     * @param word the phrase, compared ignoring case
+     * @return an array of length 2; for an unknown phrase a warning is logged and both
+     *         entries are {@code null}
+     */
+    private NamedArea[] getDoubleArea(String word){
+        final String firstAbbrev;
+        final String secondAbbrev;
+        if ("Canary and Madeira Is.".equalsIgnoreCase(word)){
+            firstAbbrev = "CNY";
+            secondAbbrev = "MDR";
+        }else if ("southern Europe".equalsIgnoreCase(word)){
+            firstAbbrev = "12";
+            secondAbbrev = "13";
+        }else if ("former USSR: North and Central European territory".equalsIgnoreCase(word)){
+            firstAbbrev = "RUN-OO";
+            secondAbbrev = "RUC-OO";
+        }else{
+            logger.warn("Double area not recognized");
+            return new NamedArea[2];
+        }
+        NamedArea[] areas = new NamedArea[2];
+        areas[0] = TdwgAreaProvider.getAreaByTdwgAbbreviation(firstAbbrev);
+        areas[1] = TdwgAreaProvider.getAreaByTdwgAbbreviation(secondAbbrev);
+        return areas;
+    }
+
+
+ /** Words that adaptWordsToTdwg() maps to the empty string (with a " STOP: " message); filled by initStopWords(). */
+ static List<String> stopWords = new ArrayList<String>();
+ /** Area names that adaptWordsToTdwg() maps to the empty string (with a " UNKNOWN: " message); filled by initStopWords(). */
+ static List<String> unknownAreas = new ArrayList<String>();
+ /** Abbreviations that adaptWordsToTdwg() silently maps to the empty string; filled by initStopWords(). */
+ static List<String> higherAreas = new ArrayList<String>();
+
+    /**
+     * Normalizes an area phrase towards the spelling expected by the TDWG area lookup.
+     * <p>
+     * Commas and semicolons are always removed, dots only if the phrase does not contain
+     * "U.S.A"; a trailing "Is" gets its dot back. Then a fixed sequence of literal
+     * replacements is applied. The order of the replacements and their guards matters
+     * because later rules work on the result of earlier ones.
+     * <p>
+     * NOTE(review): the method is also called on phrases it has already normalized (see
+     * parseOccurenceString()); a comma introduced by a rule such as "Gambia, The" is stripped
+     * again on such a second pass - confirm this cannot happen with real data.
+     *
+     * @param word the phrase to normalize, must not be {@code null}
+     * @return the normalized phrase, or the empty string if the result equals a stop word,
+     *         an unknown area or a higher area
+     */
+    private String adaptWordsToTdwg(String word){
+        word = word.replace(",", "").replace(";", "");
+        if (! word.contains("U.S.A")){
+            word = word.replace(".", "");
+        }
+
+        word = word.trim();
+        if (word.endsWith("Is")){
+            word = word + ".";
+        }
+        if (stopWords.size() == 0){
+            initStopWords();
+        }
+
+        word = word.replace("Russia [North European territory]", "North European Russia");
+        word = word.replace("Russia North European territory", "North European Russia");
+        word = word.replace("Russia: North European territory", "North European Russia");
+
+        word = word.replace("Amber", "amber");
+
+        word = word.replace("Prince Edward Is.", "Marion-Prince Edward Is.");
+        word = word.replace("Bahama Is.", "Bahamas");
+        word = word.replace("Comores Is.", "Comoros");
+        word = word.replace("former Yugoslavia", "Yugoslavia");
+        word = word.replace("former Czechoslovakia", "Czechoslovakia");
+        word = word.replace("Rhodesia", "Zimbabwe");
+        word = word.replace("The Gambia", "Gambia, The");
+
+        if (! word.contains("El Salvador")){
+            word = word.replace("Salvador", "El Salvador");
+        }
+        word = word.replace("Vera Cruz", "Veracruz");
+        word = word.replace("Turkmenia", "Turkmenistan");
+        word = word.replace("Qu\u00E9beck", "Qu\u00E9bec");
+        word = word.replace("Quebeck", "Qu\u00E9bec");
+        word = word.replace("Quebec", "Qu\u00E9bec");
+
+        if (! word.contains("Gambia, The")){
+            word = word.replace("Gambia", "Gambia, The");
+        }
+        word = word.replace("Mariana Is.", "Marianas");
+        word = word.replace("Kenia", "Kenya");
+        word = word.replace("Central Africa", "Central African Republic");
+        word = word.replace("Canal Zone", "");
+        word = word.replace("Panama", "Panam\u00E1");
+        if (! word.contains("New South Wales")){
+            word = word.replace("Wales", "Great Britain");
+        }
+        word = word.replace("Java", "Jawa");
+        word = word.replace("former USSR: North European territory", "North European Russia");
+        word = word.replace("former USSR: South European territory", "South European Russia");
+        word = word.replace("former USSR: Soviet Middle Asia", "Middle Asia");
+
+        word = word.replace("St Kitts-Nevis", "St.Kitts-Nevis");
+
+        word = word.replace("oceanian islands", "Pacific");
+        word = word.replace("Ussuri region", "Primorye");
+        word = word.replace("Galapagos Is.", "Gal\u00E1pagos");
+        word = word.replace("Tarapac\u00E1", "Tarapaca");
+        word = word.replace("Reunion", "R\u00E9union");
+        if (! word.contains("Is.")){
+            word = word.replace("Galapagos", "Gal\u00E1pagos");
+        }
+
+        if (! word.contains("Peninsular")){
+            word = word.replace("Malaysia", "Peninsular Malaysia");
+        }
+        word = word.replace("Polynesic Is.", "South Solomons");
+
+        word = word.replace("Usbek SSR", "Uzbekistan");
+        word = word.replace("Mexican amber", "Mexico");
+        word = word.replace("Marocco", "Morocco");
+        if (! word.contains("Tobago")){
+            word = word.replace("Trinidad", "Trinidad-Tobago");
+        }
+        if (! word.contains("Trinidad")){
+            word = word.replace("Tobago", "Trinidad-Tobago");
+        }
+        word = word.replace("Moluccas", "Maluku");
+        word = word.replace("Belau", "Palau");
+        word = word.replace("Dominican amber", "Dominican Republic");
+        if (! word.contains("Russian")){
+            word = word.replace("Far East", "Russian Far East");
+        }
+        word = word.replace("Tahiti", "Society Is.");
+        word = word.replace("Iraque", "Iraq");
+        word = word.replace("Wake Island", "Wake I.");
+        if (! word.contains("I.")){
+            word = word.replace("Johnston I", "Johnston I.");
+            word = word.replace("Wake I", "Wake I.");
+            word = word.replace("Clipperton I", "Clipperton I.");
+        }
+        if (! word.contains("Provinces")){
+            word = word.replace("Cape Province", "Cape Provinces");
+        }
+        // The rule above also pluralizes these two; turn them back into the singular.
+        word = word.replace("Eastern Cape Provinces", "Eastern Cape Province");
+        word = word.replace("Western Cape Provinces", "Western Cape Province");
+        if (! word.contains("Barbuda")){
+            word = word.replace("Antigua", "Antigua-Barbuda");
+        }
+        if (! word.contains("St.")){
+            word = word.replace("St Vincent", "St.Vincent");
+            word = word.replace("St Lucia", "St.Lucia");
+            word = word.replace("St Helena", "St.Helena");
+        }
+        word = word.replace("Asia-tropical", "Asia-Tropical");
+        word = word.replace("Society Islands", "Society Is.");
+        word = word.replace("Virgin Islands", "Virgin Is.");
+        word = word.replace("Canary Islands", "Canary Is.");
+        word = word.replace("Rhode Island", "Rhode I.");
+
+        word = word.replace("Rodriguez", "Rodrigues");
+        word = word.replace("British Colombia", "British Columbia");
+        word = word.replace("Bermudas", "Bermuda");
+        word = word.replace("Tunesia", "Tunisia");
+        word = word.replace("Santos S\u00E3o Paulo", "S\u00E3o Paulo");
+        word = word.replace("Transvaal", "Northern Provinces");
+        word = word.replace("Tucum\u00E1n", "Tucuman");
+
+        if (stopWords.contains(word)){
+            System.out.println(" STOP: " + word);
+            return "";
+        }
+        if (unknownAreas.contains(word)){
+            System.out.println(" UNKNOWN: " + word);
+            return "";
+        }
+        if (higherAreas.contains(word)){
+            return "";
+        }
+        return word;
+    }
+
+    /**
+     * Fills the three static word lists ({@code stopWords}, {@code unknownAreas},
+     * {@code higherAreas}) that adaptWordsToTdwg() checks its result against.
+     * Entries are appended on every call; the caller invokes it only while
+     * {@code stopWords} is empty.
+     */
+    private void initStopWords(){
+        String[] stops = {
+                "and", "Is", "Is.", "Islands", "Island",
+                "of", "areas", "USA",
+                "Australia", //except for Australia only
+                "Argentina"
+        };
+        for (String stop : stops){
+            stopWords.add(stop);
+        }
+
+        String[] unknowns = {"South Africa", "Chile", "Baltic amber", "Arabia"};
+        for (String unknown : unknowns){
+            unknownAreas.add(unknown);
+        }
+
+        String[] highers = {"AF", "OR", "PA", "AU", "NE", "NT"};
+        for (String higher : highers){
+            higherAreas.add(higher);
+        }
+    }
+
+
+    /**
+     * Creates an application controller for {@code cdmDestination} with schema validation
+     * {@code UPDATE} and runs {@link #doDistribution(ICdmRepository)} on it. If no
+     * controller is returned, a warning is logged instead.
+     *
+     * @param args not evaluated
+     */
+    public static void main(String[] args) {
+        CdmApplicationController app =
+                CdmApplicationController.NewInstance(cdmDestination, DbSchemaValidation.UPDATE);
+        if (app == null){
+            logger.warn("No Application Context");
+            return;
+        }
+        new DipteraDistributionParser().doDistribution(app);
+    }
+}