cleanup and add AizoaceaeIdentifierActivator
[cdmlib-apps.git] / app-import / src / main / java / eu / etaxonomy / cdm / app / wp6 / diptera / DipteraDistributionParser.java
index 4ab95169b1ef46b15bf897c50e58a97bbc42a863..8d6d3dcd9c791c613f3a582cbbb9b5b4c2867e93 100644 (file)
-/**\r
-* Copyright (C) 2007 EDIT\r
-* European Distributed Institute of Taxonomy \r
-* http://www.e-taxonomy.eu\r
-* \r
-* The contents of this file are subject to the Mozilla Public License Version 1.1\r
-* See LICENSE.TXT at the top of this package for the full license terms.\r
-*/\r
-\r
-/**\r
-* Copyright (C) 2007 EDIT\r
-* European Distributed Institute of Taxonomy \r
-* http://www.e-taxonomy.eu\r
-* \r
-* The contents of this file are subject to the Mozilla Public License Version 1.1\r
-* See LICENSE.TXT at the top of this package for the full license terms.\r
-*/\r
-package eu.etaxonomy.cdm.app.wp6.diptera;\r
-\r
-import java.util.ArrayList;\r
-import java.util.HashSet;\r
-import java.util.List;\r
-import java.util.Set;\r
-import java.util.regex.Pattern;\r
-\r
-import org.apache.log4j.Logger;\r
-import org.springframework.transaction.TransactionStatus;\r
-\r
-import eu.etaxonomy.cdm.api.application.CdmApplicationController;\r
-import eu.etaxonomy.cdm.app.common.CdmDestinations;\r
-import eu.etaxonomy.cdm.database.DbSchemaValidation;\r
-import eu.etaxonomy.cdm.database.ICdmDataSource;\r
-import eu.etaxonomy.cdm.model.common.Language;\r
-import eu.etaxonomy.cdm.model.description.DescriptionBase;\r
-import eu.etaxonomy.cdm.model.description.DescriptionElementBase;\r
-import eu.etaxonomy.cdm.model.description.Distribution;\r
-import eu.etaxonomy.cdm.model.description.Feature;\r
-import eu.etaxonomy.cdm.model.description.PresenceAbsenceTermBase;\r
-import eu.etaxonomy.cdm.model.description.PresenceTerm;\r
-import eu.etaxonomy.cdm.model.description.TaxonDescription;\r
-import eu.etaxonomy.cdm.model.description.TextData;\r
-import eu.etaxonomy.cdm.model.location.NamedArea;\r
-import eu.etaxonomy.cdm.model.location.TdwgArea;\r
-import eu.etaxonomy.cdm.model.taxon.Taxon;\r
-import eu.etaxonomy.cdm.model.taxon.TaxonBase;\r
-\r
-/**\r
- * @author a.mueller\r
- * @created 17.10.2008\r
- * @version 1.0\r
- */\r
-public class DipteraDistributionParser {\r
-       private static final Logger logger = Logger.getLogger(DipteraDistributionParser.class);\r
-       \r
-       private static ICdmDataSource cdmDestination = CdmDestinations.cdm_edit_diptera_a();\r
-\r
-       final static String epiSplitter = "(\\s+|\\[|\\]|\\(|\\))"; //( ' '+| '(' | ')'| '[' | ']' )\r
-       static Pattern pattern = null;\r
-       \r
-       protected void doDistribution(CdmApplicationController app){\r
-               pattern = Pattern.compile(epiSplitter); \r
-           TransactionStatus txStatus = app.startTransaction();\r
-               List<TaxonBase> taxa = app.getTaxonService().list(null, null, null, null, null);\r
-               for (TaxonBase taxon: taxa ){\r
-                       if (taxon instanceof Taxon){\r
-               //              unlazyDescription(app, (Taxon)taxon);\r
-                               Set<TaxonDescription> descriptions = ((Taxon) taxon).getDescriptions();\r
-                               for (DescriptionBase description: descriptions){\r
-                                       Set<DescriptionElementBase> descElements = new HashSet<DescriptionElementBase>();\r
-                                       descElements.addAll(description.getElements());\r
-                                       \r
-                                       for (DescriptionElementBase descEl: descElements){\r
-                                               if (descEl.getFeature().equals(Feature.OCCURRENCE())){\r
-                                                       if (descEl instanceof TextData){\r
-                                                               String occString = ((TextData)descEl).getText(Language.ENGLISH());\r
-                                                               parseOccurenceString(occString, description);\r
-                                                               //app.getTaxonService().saveTaxon(taxon);\r
-                                                       }\r
-                                               }\r
-                                       }\r
-                               }\r
-                       }\r
-               }\r
-               System.out.println("Unknowns: ");\r
-               for (String unknown: unrekognizedStrings){\r
-                       System.out.println(unknown);\r
-               }\r
-               System.out.println("Distributions not recognized: " + countNot);\r
-               System.out.println("Distributions created: " + countYes);\r
-               app.commitTransaction(txStatus);\r
-       }\r
-       \r
-       static Set<String> unrekognizedStrings = new HashSet<String>();\r
-       static int countNot = 0;\r
-       static int countYes = 0;\r
-       \r
-       private void parseOccurenceString(String occString, DescriptionBase desc){\r
-               System.out.println(occString);\r
-               if (occString != null){\r
-                       String[] words = pattern.split(occString);\r
-                       int i = 0;\r
-                       int countSkip = 0;\r
-                       for (String word: words){\r
-                               if (word.contains("U.S.A")){\r
-                                       logger.warn("U.S.A.");\r
-                               }\r
-                               boolean isDoubtful = false;\r
-                               if (countSkip > 0){\r
-                                       countSkip--;\r
-                               }else if(word.trim().length() == 0){\r
-                                       //skip\r
-                               }else{\r
-                                       if (word.endsWith(":") && word.length()<=4){\r
-                                               //Higher area\r
-                                               //TODO\r
-                                       }else{\r
-                                               word = word.trim();\r
-                                               if (word.contains("?")){\r
-                                                       isDoubtful = true;\r
-                                                       word = word.replace("?", "");\r
-                                               }\r
-                                               word = adaptWordsToTdwg(word);\r
-                                               \r
-                                               if (! "".equals(word) && ! TdwgArea.isTdwgAreaLabel(word) && ! TdwgArea.isTdwgAreaAbbreviation(word) && ! isDoubleArea(word)){\r
-                                                       for (countSkip = 1; countSkip <= 6; countSkip++){\r
-                                                               word = word.trim();\r
-                                                               if (! TdwgArea.isTdwgAreaLabel(word) && ! TdwgArea.isTdwgAreaAbbreviation(word) && ! isDoubleArea(word)){\r
-                                                                       if (words.length > i + countSkip){\r
-                                                                               word = word + " " + words[i + countSkip];\r
-                                                                       }\r
-                                                                       if (word.contains("?")){\r
-                                                                               isDoubtful = true;\r
-                                                                               word = word.replace("?", "");\r
-                                                                       }\r
-                                                                       word = adaptWordsToTdwg(word);\r
-                                                                       if ("".equals(word)){\r
-                                                                               break;\r
-                                                                       }\r
-                                                               }else{\r
-                                                                       break;\r
-                                                               }\r
-                                                       }\r
-                                               }\r
-                                               if ("".equals(word)){\r
-                                                       //countSkip = countSkip;\r
-                                               }else if (! TdwgArea.isTdwgAreaLabel(word)  && ! TdwgArea.isTdwgAreaAbbreviation(word) &&  ! isDoubleArea(word)  ){\r
-                                                       if (word.contains("?")){\r
-                                                               logger.warn("XXX");\r
-                                                       }\r
-                                                       countNot++;\r
-                                                       System.out.println("   False:" + countNot + ": " + word);\r
-                                                       unrekognizedStrings.add(word);\r
-                                                       countSkip = 0;\r
-                                               }else{\r
-                                                       if (word.equals("Netherlands")){\r
-                                                               if ( countSkip < 0 && words[i + 1].startsWith("Antilles")){\r
-                                                                       word = "Netherlands Antilles";\r
-                                                                       countSkip=2;\r
-                                                               }\r
-                                                       }\r
-                                                       PresenceAbsenceTermBase<?> term = PresenceTerm.PRESENT();\r
-                                                       if (isDoubleArea(word)){\r
-                                                               NamedArea[] doubleArea = getDoubleArea(word);\r
-                                                               for (NamedArea area : doubleArea){\r
-                                                                       Distribution distr = Distribution.NewInstance(area, term);\r
-                                                                       desc.addElement(distr);\r
-                                                               }\r
-                                                       }else{\r
-                                                               NamedArea area;\r
-                                                               if (TdwgArea.isTdwgAreaLabel(word)){\r
-                                                                       area = TdwgArea.getAreaByTdwgLabel(word);\r
-                                                               }else{\r
-                                                                       area = TdwgArea.getAreaByTdwgAbbreviation(word);\r
-                                                               }\r
-                                                               if (isDoubtful){\r
-                                                                       term = PresenceTerm.INTRODUCED_PRESENCE_QUESTIONABLE();\r
-                                                               }\r
-                                                               Distribution distr = Distribution.NewInstance(area, term);\r
-                                                               desc.addElement(distr);\r
-                                                       }\r
-                                                       countYes++;\r
-                                                       System.out.println("      True:" + countYes + ": " + word);\r
-                                                       countSkip--;\r
-                                               }\r
-                                       }\r
-                               }\r
-                               i++;\r
-                       }\r
-               }\r
-       }\r
-       \r
-       private boolean isDoubleArea(String word){\r
-               if ("Canary and Madeira Is.".equalsIgnoreCase(word) || \r
-                               "southern Europe".equalsIgnoreCase(word) ||\r
-                               "former USSR: North and Central European territory".equalsIgnoreCase(word)\r
-                               ){\r
-                       return true;\r
-               }else{\r
-                       return false;\r
-               }\r
-       }\r
-       \r
-       private NamedArea[] getDoubleArea(String word){\r
-               NamedArea[] result = new NamedArea[2];\r
-               if ("Canary and Madeira Is.".equalsIgnoreCase(word)){\r
-                        result[0] = TdwgArea.getAreaByTdwgAbbreviation("CNY");\r
-                        result[1] = TdwgArea.getAreaByTdwgAbbreviation("MDR");\r
-               }else if ("southern Europe".equalsIgnoreCase(word)){\r
-                        result[0] = TdwgArea.getAreaByTdwgAbbreviation("12");\r
-                        result[1] = TdwgArea.getAreaByTdwgAbbreviation("13");\r
-               }else if ("former USSR: North and Central European territory".equalsIgnoreCase(word)){\r
-                        result[0] = TdwgArea.getAreaByTdwgAbbreviation("RUN-OO");\r
-                        result[1] = TdwgArea.getAreaByTdwgAbbreviation("RUC-OO");\r
-               }else{\r
-                       logger.warn("Double area not recognized");\r
-               }\r
-               return result;\r
-       }\r
-       \r
-       \r
-       static List<String> stopWords = new ArrayList<String>();\r
-       static List<String> unknownAreas = new ArrayList<String>();\r
-       static List<String> higherAreas = new ArrayList<String>();\r
-       \r
-       private String adaptWordsToTdwg(String word){\r
-               word = word.replace(",", "").replace(";", "");\r
-               if (! word.contains("U.S.A")){\r
-                       word = word.replace(",", "").replace(".", "").replace(";", "");\r
-               }else{\r
-                       word = word.replace(",", "").replace(";", "");\r
-               }\r
-               \r
-               word = word.trim();\r
-               if (word.endsWith("Is")){\r
-                       word = word + ".";\r
-               }\r
-               if (stopWords.size() == 0){\r
-                       initStopWords();\r
-               }\r
-               \r
-               word = word.replace("Russia [North European territory]", "North European Russia");\r
-               word = word.replace("Russia North European territory", "North European Russia");\r
-               word = word.replace("Russia: North European territory", "North European Russia");\r
-               word = word.replace("Russia: North European territory", "North European Russia");\r
-                               \r
-               word = word.replace("Amber", "amber");\r
-               \r
-               \r
-               word = word.replace("Prince Edward Is.", "Marion-Prince Edward Is.");\r
-               //or word = word.replace("Prince Edward Is.", "Prince Edward I.");\r
-               word = word.replace("Bahama Is.", "Bahamas");\r
-               word = word.replace("Comores Is.", "Comoros");\r
-               word = word.replace("former Yugoslavia", "Yugoslavia");\r
-               word = word.replace("former Czechoslovakia", "Czechoslovakia");\r
-               word = word.replace("Rhodesia", "Zimbabwe");\r
-               word = word.replace("The Gambia", "Gambia, The");\r
-\r
-               if (!word.contains("El Salvador")){\r
-                       word = word.replace("Salvador", "El Salvador"); \r
-               }\r
-               word = word.replace("Vera Cruz", "Veracruz");\r
-               word = word.replace("Turkmenia", "Turkmenistan");\r
-               word = word.replace("Qu\u00E9beck", "Qu\u00E9bec");\r
-               word = word.replace("Quebeck", "Qu\u00E9bec");\r
-               word = word.replace("Quebec", "Qu\u00E9bec");\r
-               \r
-               if (!word.contains("Gambia, The")){\r
-                       word = word.replace("Gambia", "Gambia, The");\r
-               }\r
-               word = word.replace("Mariana Is.", "Marianas");\r
-               word = word.replace("Kenia", "Kenya");\r
-               word = word.replace("Central Africa", "Central African Republic");\r
-               word = word.replace("Canal Zone", "");\r
-               //word = word.replace("Panama", "Panamá");\r
-               word = word.replace("Panama", "Panam\u00E1");\r
-               if (! word.contains("New South Wales")){\r
-                       word = word.replace("Wales", "Great Britain");\r
-               }\r
-               word = word.replace("Java", "Jawa");\r
-               word = word.replace("former USSR: North European territory", "North European Russia");\r
-               word = word.replace("former USSR: South European territory", "South European Russia");\r
-               word = word.replace("former USSR: Soviet Middle Asia", "Middle Asia");\r
-               \r
-               word = word.replace("St Kitts-Nevis", "St.Kitts-Nevis");\r
-               \r
-               word = word.replace("oceanian islands", "Pacific");\r
-               word = word.replace("Ussuri region", "Primorye");\r
-               word = word.replace("Galapagos Is.", "Gal\u00E1pagos");\r
-               word = word.replace("Tarapac\u00E1", "Tarapaca");\r
-               word = word.replace("Reunion", "R\u00E9union");\r
-               if (! word.contains("Is.")){\r
-                       word = word.replace("Galapagos", "Gal\u00E1pagos");\r
-               }\r
-               \r
-               //word = word.replace("Galapagos Is.", "Galápagos");\r
-               if (! word.contains("Peninsular")){\r
-                       word = word.replace("Malaysia", "Peninsular Malaysia");\r
-               }\r
-               word = word.replace("Polynesic Is.", "South Solomons");\r
-               \r
-               word = word.replace("Usbek SSR", "Uzbekistan");\r
-               word = word.replace("Mexican amber", "Mexico");\r
-               word = word.replace("Marocco", "Morocco");\r
-               if (! word.contains("Tobago")){\r
-                       word = word.replace("Trinidad", "Trinidad-Tobago");\r
-               }\r
-               if (! word.contains("Trinidad")){\r
-                       word = word.replace("Tobago", "Trinidad-Tobago");\r
-               }\r
-               word = word.replace("Haiti", "Haiti");  \r
-               word = word.replace("Moluccas", "Maluku");\r
-               word = word.replace("Belau", "Palau");\r
-               word = word.replace("Dominican amber", "Dominican Republic");\r
-               if (! word.contains("Russian")){\r
-                       word = word.replace("Far East", "Russian Far East");\r
-               }\r
-               word = word.replace("Tahiti", "Society Is.");\r
-               word = word.replace("Iraque", "Iraq");\r
-               word = word.replace("Wake Island", "Wake I.");\r
-               if (! word.contains("I.")){\r
-                       word = word.replace("Johnston I", "Johnston I.");\r
-                       word = word.replace("Wake I", "Wake I.");\r
-                       word = word.replace("Clipperton I", "Clipperton I.");\r
-               }\r
-               if (! word.contains("Provinces")){\r
-                       word = word.replace("Cape Province", "Cape Provinces");\r
-               }\r
-               word = word.replace("Eastern Cape Provinces", "Eastern Cape Province");\r
-               word = word.replace("Western Cape Provinces", "Western Cape Province");\r
-               if (! word.contains("Barbuda")){\r
-                       word = word.replace("Antigua", "Antigua-Barbuda");\r
-               }\r
-               if (! word.contains("St.")){\r
-                       word = word.replace("St Vincent", "St.Vincent");\r
-                       word = word.replace("St Lucia", "St.Lucia");\r
-                       word = word.replace("St Helena", "St.Helena");\r
-               }\r
-               word = word.replace("Asia-tropical", "Asia-Tropical");\r
-               word = word.replace("Society Islands", "Society Is.");\r
-               word = word.replace("Virgin Islands", "Virgin Is.");\r
-               word = word.replace("Canary Islands", "Canary Is.");\r
-               word = word.replace("Rhode Island", "Rhode I.");\r
-               \r
-               \r
-               word = word.replace("Rodriguez", "Rodrigues");\r
-               word = word.replace("British Colombia", "British Columbia");\r
-               word = word.replace("Bermudas", "Bermuda");\r
-               word = word.replace("Tunesia", "Tunisia");\r
-               word = word.replace("Santos S\u00E3o Paulo", "S\u00E3o Paulo");\r
-               word = word.replace("Transvaal", "Northern Provinces");\r
-               word = word.replace("Tucum\u00E1n", "Tucuman");\r
-//             if (!word.contains("Netherlands")){\r
-//                     \r
-//             }\r
-               \r
-//             unknownAreas.add("Baltic amber");  \r
-//             unknownAreas.add("Arabia"); \r
-                                               \r
-               for (String stopWord : stopWords){\r
-                       if (stopWord.equals(word)){\r
-                               System.out.println("         STOP: " + word);\r
-                               return "";\r
-                       }\r
-               }\r
-               for (String unknownArea : unknownAreas){\r
-                       if (unknownArea.equals(word)){\r
-                               System.out.println("         UNKNOWN: " + word);\r
-                               return "";\r
-                       }\r
-               }\r
-               for (String higherArea : higherAreas){\r
-                       if (higherArea.equals(word)){\r
-                               return "";\r
-                       }\r
-               }\r
-               \r
-               //higher regions\r
-               \r
-               return word;\r
-       }\r
-       \r
-       private void initStopWords(){\r
-               stopWords.add("and");\r
-               stopWords.add("Is");\r
-               stopWords.add("Is.");\r
-               stopWords.add("Islands");\r
-               stopWords.add("Island");\r
-               \r
-               stopWords.add("of");\r
-               stopWords.add("areas");\r
-               stopWords.add("USA");\r
-               stopWords.add("Australia"); //except for Australia only\r
-               stopWords.add("Argentina");             \r
-\r
-               //unknownAreas.add("Panama");\r
-               unknownAreas.add("South Africa");\r
-               unknownAreas.add("Chile");\r
-\r
-               unknownAreas.add("Baltic amber");  \r
-               unknownAreas.add("Arabia"); \r
-\r
-                       \r
-               higherAreas.add("AF");\r
-               higherAreas.add("OR");\r
-               higherAreas.add("PA");\r
-               higherAreas.add("AU");\r
-               higherAreas.add("NE");\r
-               \r
-               higherAreas.add("NT");\r
-       }\r
-\r
-       \r
-       /**\r
-        * @param args\r
-        */\r
-       public static void main(String[] args) {\r
-               CdmApplicationController app = null;\r
-               DbSchemaValidation val = DbSchemaValidation.UPDATE;\r
-               app = CdmApplicationController.NewInstance(cdmDestination, val);\r
-               \r
-               DipteraDistributionParser dipDist = new DipteraDistributionParser();\r
-               if (app != null){\r
-                       dipDist.doDistribution(app);\r
-               }else{\r
-                       logger.warn("No Application Context");\r
-               }\r
-       }\r
-}\r
+/**
+* Copyright (C) 2007 EDIT
+* European Distributed Institute of Taxonomy
+* http://www.e-taxonomy.eu
+*
+* The contents of this file are subject to the Mozilla Public License Version 1.1
+* See LICENSE.TXT at the top of this package for the full license terms.
+*/
+
+/**
+* Copyright (C) 2007 EDIT
+* European Distributed Institute of Taxonomy
+* http://www.e-taxonomy.eu
+*
+* The contents of this file are subject to the Mozilla Public License Version 1.1
+* See LICENSE.TXT at the top of this package for the full license terms.
+*/
+package eu.etaxonomy.cdm.app.wp6.diptera;
+
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+import java.util.regex.Pattern;
+
+import org.apache.log4j.Logger;
+import org.springframework.transaction.TransactionStatus;
+
+import eu.etaxonomy.cdm.api.application.CdmApplicationController;
+import eu.etaxonomy.cdm.api.application.ICdmRepository;
+import eu.etaxonomy.cdm.app.common.CdmDestinations;
+import eu.etaxonomy.cdm.database.DbSchemaValidation;
+import eu.etaxonomy.cdm.database.ICdmDataSource;
+import eu.etaxonomy.cdm.io.common.TdwgAreaProvider;
+import eu.etaxonomy.cdm.model.common.Language;
+import eu.etaxonomy.cdm.model.description.DescriptionBase;
+import eu.etaxonomy.cdm.model.description.DescriptionElementBase;
+import eu.etaxonomy.cdm.model.description.Distribution;
+import eu.etaxonomy.cdm.model.description.Feature;
+import eu.etaxonomy.cdm.model.description.PresenceAbsenceTerm;
+import eu.etaxonomy.cdm.model.description.TaxonDescription;
+import eu.etaxonomy.cdm.model.description.TextData;
+import eu.etaxonomy.cdm.model.location.NamedArea;
+import eu.etaxonomy.cdm.model.taxon.Taxon;
+import eu.etaxonomy.cdm.model.taxon.TaxonBase;
+
+/**
+ * @author a.mueller
+ * @since 17.10.2008
+ */
+public class DipteraDistributionParser {
+       private static final Logger logger = Logger.getLogger(DipteraDistributionParser.class);
+
+       private static ICdmDataSource cdmDestination = CdmDestinations.localH2();
+
+       final static String epiSplitter = "(\\s+|\\[|\\]|\\(|\\))"; //( ' '+| '(' | ')'| '[' | ']' )
+       static Pattern pattern = null;
+
+       /**
+        * Walks over all taxa returned by the taxon service and, for every
+        * {@link Taxon}, hands the English text of each occurrence {@link TextData}
+        * element to {@code parseOccurenceString}. The whole run happens inside one
+        * transaction that is committed at the end. Afterwards the collected
+        * unrecognized strings and the two counters are printed to standard out.
+        *
+        * @param app the repository used for transaction handling and for access
+        *            to the taxon service
+        */
+       protected void doDistribution(ICdmRepository app){
+               pattern = Pattern.compile(epiSplitter);
+               TransactionStatus txStatus = app.startTransaction();
+               List<TaxonBase> taxa = app.getTaxonService().list(null, null, null, null, null);
+               for (TaxonBase taxon: taxa ){
+                       if (! (taxon instanceof Taxon)){
+                               //only accepted taxa carry the descriptions handled here
+                               continue;
+                       }
+                       Set<TaxonDescription> descriptions = ((Taxon) taxon).getDescriptions();
+                       for (DescriptionBase description: descriptions){
+                               //Iterate over a snapshot: parsing adds Distribution elements to this
+                               //description, which could otherwise disturb the running iteration.
+                               Set<DescriptionElementBase> descElements = new HashSet<DescriptionElementBase>();
+                               descElements.addAll(description.getElements());
+
+                               for (DescriptionElementBase descEl: descElements){
+                                       //constant-first comparison: an element without a feature is
+                                       //simply skipped instead of causing a NullPointerException
+                                       boolean isOccurrence = Feature.OCCURRENCE().equals(descEl.getFeature());
+                                       if (isOccurrence && descEl instanceof TextData){
+                                               String occString = ((TextData)descEl).getText(Language.ENGLISH());
+                                               parseOccurenceString(occString, description);
+                                       }
+                               }
+                       }
+               }
+               System.out.println("Unknowns: ");
+               for (String unknown: unrekognizedStrings){
+                       System.out.println(unknown);
+               }
+               System.out.println("Distributions not recognized: " + countNot);
+               System.out.println("Distributions created: " + countYes);
+               app.commitTransaction(txStatus);
+       }
+
+       static Set<String> unrekognizedStrings = new HashSet<>();
+       static int countNot = 0;
+       static int countYes = 0;
+
+       private void parseOccurenceString(String occString, DescriptionBase desc){
+               System.out.println(occString);
+               if (occString != null){
+                       String[] words = pattern.split(occString);
+                       int i = 0;
+                       int countSkip = 0;
+                       for (String word: words){
+                               if (word.contains("U.S.A")){
+                                       logger.warn("U.S.A.");
+                               }
+                               boolean isDoubtful = false;
+                               if (countSkip > 0){
+                                       countSkip--;
+                               }else if(word.trim().length() == 0){
+                                       //skip
+                               }else{
+                                       if (word.endsWith(":") && word.length()<=4){
+                                               //Higher area
+                                               //TODO
+                                       }else{
+                                               word = word.trim();
+                                               if (word.contains("?")){
+                                                       isDoubtful = true;
+                                                       word = word.replace("?", "");
+                                               }
+                                               word = adaptWordsToTdwg(word);
+
+                                               if (! "".equals(word) && ! TdwgAreaProvider.isTdwgAreaLabel(word) && ! TdwgAreaProvider.isTdwgAreaAbbreviation(word) && ! isDoubleArea(word)){
+                                                       for (countSkip = 1; countSkip <= 6; countSkip++){
+                                                               word = word.trim();
+                                                               if (! TdwgAreaProvider.isTdwgAreaLabel(word) && ! TdwgAreaProvider.isTdwgAreaAbbreviation(word) && ! isDoubleArea(word)){
+                                                                       if (words.length > i + countSkip){
+                                                                               word = word + " " + words[i + countSkip];
+                                                                       }
+                                                                       if (word.contains("?")){
+                                                                               isDoubtful = true;
+                                                                               word = word.replace("?", "");
+                                                                       }
+                                                                       word = adaptWordsToTdwg(word);
+                                                                       if ("".equals(word)){
+                                                                               break;
+                                                                       }
+                                                               }else{
+                                                                       break;
+                                                               }
+                                                       }
+                                               }
+                                               if ("".equals(word)){
+                                                       //countSkip = countSkip;
+                                               }else if (! TdwgAreaProvider.isTdwgAreaLabel(word)  && ! TdwgAreaProvider.isTdwgAreaAbbreviation(word) &&  ! isDoubleArea(word)  ){
+                                                       if (word.contains("?")){
+                                                               logger.warn("XXX");
+                                                       }
+                                                       countNot++;
+                                                       System.out.println("   False:" + countNot + ": " + word);
+                                                       unrekognizedStrings.add(word);
+                                                       countSkip = 0;
+                                               }else{
+                                                       if (word.equals("Netherlands")){
+                                                               if ( countSkip < 0 && words[i + 1].startsWith("Antilles")){
+                                                                       word = "Netherlands Antilles";
+                                                                       countSkip=2;
+                                                               }
+                                                       }
+                                                       PresenceAbsenceTerm term = PresenceAbsenceTerm.PRESENT();
+                                                       if (isDoubleArea(word)){
+                                                               NamedArea[] doubleArea = getDoubleArea(word);
+                                                               for (NamedArea area : doubleArea){
+                                                                       Distribution distr = Distribution.NewInstance(area, term);
+                                                                       desc.addElement(distr);
+                                                               }
+                                                       }else{
+                                                               NamedArea area;
+                                                               if (TdwgAreaProvider.isTdwgAreaLabel(word)){
+                                                                       area = TdwgAreaProvider.getAreaByTdwgLabel(word);
+                                                               }else{
+                                                                       area = TdwgAreaProvider.getAreaByTdwgAbbreviation(word);
+                                                               }
+                                                               if (isDoubtful){
+                                                                       term = PresenceAbsenceTerm.INTRODUCED_PRESENCE_QUESTIONABLE();
+                                                               }
+                                                               Distribution distr = Distribution.NewInstance(area, term);
+                                                               desc.addElement(distr);
+                                                       }
+                                                       countYes++;
+                                                       System.out.println("      True:" + countYes + ": " + word);
+                                                       countSkip--;
+                                               }
+                                       }
+                               }
+                               i++;
+                       }
+               }
+       }
+
+       /**
+        * Tells whether the given string is one of the three composite area phrases
+        * that {@link #getDoubleArea(String)} resolves to a pair of areas.
+        * The comparison ignores case.
+        *
+        * @param word the area string to check
+        * @return <code>true</code> if the string equals one of the composite phrases
+        */
+       private boolean isDoubleArea(String word){
+               return "Canary and Madeira Is.".equalsIgnoreCase(word)
+                               || "southern Europe".equalsIgnoreCase(word)
+                               || "former USSR: North and Central European territory".equalsIgnoreCase(word);
+       }
+
+       /**
+        * Resolves a composite area phrase to its two areas, each looked up by
+        * TDWG abbreviation. The phrase is compared ignoring case.
+        *
+        * @param word the composite area phrase
+        * @return an array of length 2 holding both areas; if the phrase is not
+        *         recognized a warning is logged and both elements are <code>null</code>
+        */
+       private NamedArea[] getDoubleArea(String word){
+               String firstAbbrev;
+               String secondAbbrev;
+               if ("Canary and Madeira Is.".equalsIgnoreCase(word)){
+                       firstAbbrev = "CNY";
+                       secondAbbrev = "MDR";
+               }else if ("southern Europe".equalsIgnoreCase(word)){
+                       firstAbbrev = "12";
+                       secondAbbrev = "13";
+               }else if ("former USSR: North and Central European territory".equalsIgnoreCase(word)){
+                       firstAbbrev = "RUN-OO";
+                       secondAbbrev = "RUC-OO";
+               }else{
+                       logger.warn("Double area not recognized");
+                       //nothing to look up, hand back the empty pair
+                       return new NamedArea[2];
+               }
+
+               NamedArea[] areas = new NamedArea[2];
+               areas[0] = TdwgAreaProvider.getAreaByTdwgAbbreviation(firstAbbrev);
+               areas[1] = TdwgAreaProvider.getAreaByTdwgAbbreviation(secondAbbrev);
+               return areas;
+       }
+
+
+       //Lookup lists used by adaptWordsToTdwg(); all three are filled by initStopWords(),
+       //which adaptWordsToTdwg() calls while stopWords is still empty.
+       //A normalized string equal to an entry of any of these lists is returned as "".
+
+       //exact matches are reported on stdout as "STOP"
+       static List<String> stopWords = new ArrayList<>();
+       //exact matches are reported on stdout as "UNKNOWN"
+       static List<String> unknownAreas = new ArrayList<>();
+       //exact matches are dropped without any output
+       static List<String> higherAreas = new ArrayList<>();
+
+       private String adaptWordsToTdwg(String word){
+               word = word.replace(",", "").replace(";", "");
+               if (! word.contains("U.S.A")){
+                       word = word.replace(",", "").replace(".", "").replace(";", "");
+               }else{
+                       word = word.replace(",", "").replace(";", "");
+               }
+
+               word = word.trim();
+               if (word.endsWith("Is")){
+                       word = word + ".";
+               }
+               if (stopWords.size() == 0){
+                       initStopWords();
+               }
+
+               word = word.replace("Russia [North European territory]", "North European Russia");
+               word = word.replace("Russia North European territory", "North European Russia");
+               word = word.replace("Russia: North European territory", "North European Russia");
+               word = word.replace("Russia: North European territory", "North European Russia");
+
+               word = word.replace("Amber", "amber");
+
+
+               word = word.replace("Prince Edward Is.", "Marion-Prince Edward Is.");
+               //or word = word.replace("Prince Edward Is.", "Prince Edward I.");
+               word = word.replace("Bahama Is.", "Bahamas");
+               word = word.replace("Comores Is.", "Comoros");
+               word = word.replace("former Yugoslavia", "Yugoslavia");
+               word = word.replace("former Czechoslovakia", "Czechoslovakia");
+               word = word.replace("Rhodesia", "Zimbabwe");
+               word = word.replace("The Gambia", "Gambia, The");
+
+               if (!word.contains("El Salvador")){
+                       word = word.replace("Salvador", "El Salvador");
+               }
+               word = word.replace("Vera Cruz", "Veracruz");
+               word = word.replace("Turkmenia", "Turkmenistan");
+               word = word.replace("Qu\u00E9beck", "Qu\u00E9bec");
+               word = word.replace("Quebeck", "Qu\u00E9bec");
+               word = word.replace("Quebec", "Qu\u00E9bec");
+
+               if (!word.contains("Gambia, The")){
+                       word = word.replace("Gambia", "Gambia, The");
+               }
+               word = word.replace("Mariana Is.", "Marianas");
+               word = word.replace("Kenia", "Kenya");
+               word = word.replace("Central Africa", "Central African Republic");
+               word = word.replace("Canal Zone", "");
+               //word = word.replace("Panama", "Panamá");
+               word = word.replace("Panama", "Panam\u00E1");
+               if (! word.contains("New South Wales")){
+                       word = word.replace("Wales", "Great Britain");
+               }
+               word = word.replace("Java", "Jawa");
+               word = word.replace("former USSR: North European territory", "North European Russia");
+               word = word.replace("former USSR: South European territory", "South European Russia");
+               word = word.replace("former USSR: Soviet Middle Asia", "Middle Asia");
+
+               word = word.replace("St Kitts-Nevis", "St.Kitts-Nevis");
+
+               word = word.replace("oceanian islands", "Pacific");
+               word = word.replace("Ussuri region", "Primorye");
+               word = word.replace("Galapagos Is.", "Gal\u00E1pagos");
+               word = word.replace("Tarapac\u00E1", "Tarapaca");
+               word = word.replace("Reunion", "R\u00E9union");
+               if (! word.contains("Is.")){
+                       word = word.replace("Galapagos", "Gal\u00E1pagos");
+               }
+
+               //word = word.replace("Galapagos Is.", "Galápagos");
+               if (! word.contains("Peninsular")){
+                       word = word.replace("Malaysia", "Peninsular Malaysia");
+               }
+               word = word.replace("Polynesic Is.", "South Solomons");
+
+               word = word.replace("Usbek SSR", "Uzbekistan");
+               word = word.replace("Mexican amber", "Mexico");
+               word = word.replace("Marocco", "Morocco");
+               if (! word.contains("Tobago")){
+                       word = word.replace("Trinidad", "Trinidad-Tobago");
+               }
+               if (! word.contains("Trinidad")){
+                       word = word.replace("Tobago", "Trinidad-Tobago");
+               }
+               word = word.replace("Haiti", "Haiti");
+               word = word.replace("Moluccas", "Maluku");
+               word = word.replace("Belau", "Palau");
+               word = word.replace("Dominican amber", "Dominican Republic");
+               if (! word.contains("Russian")){
+                       word = word.replace("Far East", "Russian Far East");
+               }
+               word = word.replace("Tahiti", "Society Is.");
+               word = word.replace("Iraque", "Iraq");
+               word = word.replace("Wake Island", "Wake I.");
+               if (! word.contains("I.")){
+                       word = word.replace("Johnston I", "Johnston I.");
+                       word = word.replace("Wake I", "Wake I.");
+                       word = word.replace("Clipperton I", "Clipperton I.");
+               }
+               if (! word.contains("Provinces")){
+                       word = word.replace("Cape Province", "Cape Provinces");
+               }
+               word = word.replace("Eastern Cape Provinces", "Eastern Cape Province");
+               word = word.replace("Western Cape Provinces", "Western Cape Province");
+               if (! word.contains("Barbuda")){
+                       word = word.replace("Antigua", "Antigua-Barbuda");
+               }
+               if (! word.contains("St.")){
+                       word = word.replace("St Vincent", "St.Vincent");
+                       word = word.replace("St Lucia", "St.Lucia");
+                       word = word.replace("St Helena", "St.Helena");
+               }
+               word = word.replace("Asia-tropical", "Asia-Tropical");
+               word = word.replace("Society Islands", "Society Is.");
+               word = word.replace("Virgin Islands", "Virgin Is.");
+               word = word.replace("Canary Islands", "Canary Is.");
+               word = word.replace("Rhode Island", "Rhode I.");
+
+
+               word = word.replace("Rodriguez", "Rodrigues");
+               word = word.replace("British Colombia", "British Columbia");
+               word = word.replace("Bermudas", "Bermuda");
+               word = word.replace("Tunesia", "Tunisia");
+               word = word.replace("Santos S\u00E3o Paulo", "S\u00E3o Paulo");
+               word = word.replace("Transvaal", "Northern Provinces");
+               word = word.replace("Tucum\u00E1n", "Tucuman");
+//             if (!word.contains("Netherlands")){
+//
+//             }
+
+//             unknownAreas.add("Baltic amber");
+//             unknownAreas.add("Arabia");
+
+               for (String stopWord : stopWords){
+                       if (stopWord.equals(word)){
+                               System.out.println("         STOP: " + word);
+                               return "";
+                       }
+               }
+               for (String unknownArea : unknownAreas){
+                       if (unknownArea.equals(word)){
+                               System.out.println("         UNKNOWN: " + word);
+                               return "";
+                       }
+               }
+               for (String higherArea : higherAreas){
+                       if (higherArea.equals(word)){
+                               return "";
+                       }
+               }
+
+               //higher regions
+
+               return word;
+       }
+
+       /**
+        * Appends the fixed entries to the three static lookup lists
+        * <code>stopWords</code>, <code>unknownAreas</code> and <code>higherAreas</code>,
+        * in that order.
+        */
+       private void initStopWords(){
+               String[] stops = {
+                               "and", "Is", "Is.", "Islands", "Island",
+                               "of", "areas", "USA",
+                               "Australia", //except for Australia only
+                               "Argentina"
+               };
+               for (String stop : stops){
+                       stopWords.add(stop);
+               }
+
+               String[] unknowns = {"South Africa", "Chile", "Baltic amber", "Arabia"};
+               for (String unknown : unknowns){
+                       unknownAreas.add(unknown);
+               }
+
+               String[] highers = {"AF", "OR", "PA", "AU", "NE", "NT"};
+               for (String higher : highers){
+                       higherAreas.add(higher);
+               }
+       }
+
+       public static void main(String[] args) {
+               CdmApplicationController app = null;
+               DbSchemaValidation val = DbSchemaValidation.UPDATE;
+               app = CdmApplicationController.NewInstance(cdmDestination, val);
+
+               DipteraDistributionParser dipDist = new DipteraDistributionParser();
+               if (app != null){
+                       dipDist.doDistribution(app);
+               }else{
+                       logger.warn("No Application Context");
+               }
+       }
+}