Diptera distribution parser
author Andreas Müller <a.mueller@bgbm.org>
Mon, 17 Nov 2008 18:11:42 +0000 (18:11 +0000)
committer Andreas Müller <a.mueller@bgbm.org>
Mon, 17 Nov 2008 18:11:42 +0000 (18:11 +0000)
app-import/src/main/java/eu/etaxonomy/cdm/app/berlinModelImport/DipteraDistributionParser.java

index 5ff08888b264ee1ba732dc59580580ad0f2d30a0..32b0eb3c896c3ebe9301f79e6e2a9ac56315d0b1 100644 (file)
@@ -66,6 +66,7 @@ public class DipteraDistributionParser {
                                                        if (descEl instanceof TextData){\r
                                                                String occString = ((TextData)descEl).getText(Language.ENGLISH());\r
                                                                parseOccurenceString(occString, description);\r
+                                                               //app.getTaxonService().saveTaxon(taxon);\r
                                                        }\r
                                                }\r
                                        }\r
@@ -92,11 +93,12 @@ public class DipteraDistributionParser {
                        int i = 0;\r
                        int countSkip = 0;\r
                        for (String word: words){\r
+                               if (word.contains("U.S.A")){\r
+                                       logger.warn("U.S.A.");\r
+                               }\r
                                boolean isDoubtful = false;\r
                                if (countSkip > 0){\r
                                        countSkip--;\r
-                               }else if(word.contains("widesp") || word.equals("in")) {\r
-                                       //skip\r
                                }else if(word.trim().length() == 0){\r
                                        //skip\r
                                }else{\r
@@ -111,10 +113,10 @@ public class DipteraDistributionParser {
                                                }\r
                                                word = adaptWordsToTdwg(word);\r
                                                \r
-                                               if (! "".equals(word) && ! TdwgArea.isTdwgAreaLabel(word) && ! isDoubleArea(word)){\r
+                                               if (! "".equals(word) && ! TdwgArea.isTdwgAreaLabel(word) && ! TdwgArea.isTdwgAreaAbbreviation(word) && ! isDoubleArea(word)){\r
                                                        for (countSkip = 1; countSkip <= 6; countSkip++){\r
                                                                word = word.trim();\r
-                                                               if (! TdwgArea.isTdwgAreaLabel(word) && ! isDoubleArea(word)){\r
+                                                               if (! TdwgArea.isTdwgAreaLabel(word) && ! TdwgArea.isTdwgAreaAbbreviation(word) && ! isDoubleArea(word)){\r
                                                                        if (words.length > i + countSkip){\r
                                                                                word = word + " " + words[i + countSkip];\r
                                                                        }\r
@@ -133,7 +135,7 @@ public class DipteraDistributionParser {
                                                }\r
                                                if ("".equals(word)){\r
                                                        //countSkip = countSkip;\r
-                                               }else if (! TdwgArea.isTdwgAreaLabel(word) && ! isDoubleArea(word)  ){\r
+                                               }else if (! TdwgArea.isTdwgAreaLabel(word)  && ! TdwgArea.isTdwgAreaAbbreviation(word) &&  ! isDoubleArea(word)  ){\r
                                                        if (word.contains("?")){\r
                                                                logger.warn("XXX");\r
                                                        }\r
@@ -150,7 +152,12 @@ public class DipteraDistributionParser {
                                                                        desc.addElement(distr);\r
                                                                }\r
                                                        }else{\r
-                                                               NamedArea area = TdwgArea.getAreaByTdwgLabel(word);\r
+                                                               NamedArea area;\r
+                                                               if (TdwgArea.isTdwgAreaLabel(word)){\r
+                                                                       area = TdwgArea.getAreaByTdwgLabel(word);\r
+                                                               }else{\r
+                                                                       area = TdwgArea.getAreaByTdwgAbbreviation(word);\r
+                                                               }\r
                                                                if (isDoubtful){\r
                                                                        term = PresenceTerm.INTRODUCED_PRESENCE_QUESTIONABLE();\r
                                                                }\r
@@ -202,8 +209,12 @@ public class DipteraDistributionParser {
        static List<String> higherAreas = new ArrayList<String>();\r
        \r
        private String adaptWordsToTdwg(String word){\r
-               word = word.replace(",", "").replace(".", "").replace(";", "");\r
-               word = word.replace("Caronlina", "Carolina");\r
+               word = word.replace(",", "").replace(";", "");\r
+               if (! word.contains("U.S.A")){\r
+                       word = word.replace(",", "").replace(".", "").replace(";", "");\r
+               }else{\r
+                       word = word.replace(",", "").replace(";", "");\r
+               }\r
                \r
                word = word.trim();\r
                if (word.endsWith("Is")){\r
@@ -233,6 +244,8 @@ public class DipteraDistributionParser {
                }\r
                word = word.replace("Vera Cruz", "Veracruz");\r
                word = word.replace("Turkmenia", "Turkmenistan");\r
+               word = word.replace("Québeck", "Québec");\r
+               word = word.replace("Quebeck", "Québec");\r
                word = word.replace("Quebec", "Québec");\r
                //word = word.replace("Quebec", "Qu+®bec");\r
                //word = word.replace("Quebec", "Qu├®bec");\r
@@ -255,20 +268,66 @@ public class DipteraDistributionParser {
                word = word.replace("oceanian islands", "Pacific");\r
                word = word.replace("Ussuri region", "Primorye");\r
                word = word.replace("Galapagos Is.", "Galápagos");\r
+               if (! word.contains("Is.")){\r
+                       word = word.replace("Galapagos", "Galápagos");\r
+               }\r
+               \r
                //word = word.replace("Galapagos Is.", "Galápagos");\r
-               word = word.replace("Malaysia", "Peninsular Malaysia");\r
+               if (! word.contains("Peninsular")){\r
+                       word = word.replace("Malaysia", "Peninsular Malaysia");\r
+               }\r
                word = word.replace("Polynesic Is.", "South Solomons");\r
                \r
                word = word.replace("Usbek SSR", "Uzbekistan");\r
                word = word.replace("Mexican amber", "Mexico");\r
                word = word.replace("Marocco", "Morocco");\r
-               word = word.replace("Trinidad", "Trinidad-Tobago");\r
+               if (! word.contains("Tobago")){\r
+                       word = word.replace("Trinidad", "Trinidad-Tobago");\r
+               }\r
+               if (! word.contains("Trinidad")){\r
+                       word = word.replace("Tobago", "Trinidad-Tobago");\r
+               }\r
                word = word.replace("Haiti", "Haiti");  \r
                word = word.replace("Moluccas", "Maluku");\r
                word = word.replace("Belau", "Palau");\r
                word = word.replace("Dominican amber", "Dominican Republic");\r
-               word = word.replace("Far East", "Russian Far East");\r
+               if (! word.contains("Russian")){\r
+                       word = word.replace("Far East", "Russian Far East");\r
+               }\r
                word = word.replace("Tahiti", "Society Is.");\r
+               word = word.replace("Iraque", "Iraq");\r
+               word = word.replace("Wake Island", "Wake I.");\r
+               if (! word.contains("I.")){\r
+                       word = word.replace("Johnston I", "Johnston I.");\r
+                       word = word.replace("Wake I", "Wake I.");\r
+                       word = word.replace("Clipperton I", "Clipperton I.");\r
+               }\r
+               if (! word.contains("Provinces")){\r
+                       word = word.replace("Cape Province", "Cape Provinces");\r
+               }\r
+               word = word.replace("Eastern Cape Provinces", "Eastern Cape Province");\r
+               if (! word.contains("Barbuda")){\r
+                       word = word.replace("Antigua", "Antigua-Barbuda");\r
+               }\r
+               if (! word.contains("St.")){\r
+                       word = word.replace("St Vincent", "St.Vincent");\r
+                       word = word.replace("St Lucia", "St.Lucia");\r
+                       word = word.replace("St Helena", "St.Helena");\r
+               }\r
+               word = word.replace("Asia-tropical", "Asia-Tropical");\r
+               word = word.replace("Society Islands", "Society Is.");\r
+               word = word.replace("Virgin Islands", "Virgin Is.");\r
+               word = word.replace("Canary Islands", "Canary Is.");\r
+               word = word.replace("Rhode Island", "Rhode I.");\r
+               \r
+               \r
+               word = word.replace("Rodriguez", "Rodrigues");\r
+               word = word.replace("British Colombia", "British Columbia");\r
+               word = word.replace("Bermudas", "Bermuda");\r
+               word = word.replace("Tunesia", "Tunisia");\r
+               word = word.replace("Santos São Paulo", "São Paulo");\r
+               word = word.replace("Transvaal", "Northern Provinces");\r
+               word = word.replace("Tucumán", "Tucuman");\r
                \r
                \r
 //             unknownAreas.add("Baltic amber");  \r
@@ -298,59 +357,25 @@ public class DipteraDistributionParser {
        }\r
        \r
        private void initStopWords(){\r
-               stopWords.add("to");\r
-               stopWords.add("also");\r
-               stopWords.add("almost");\r
                stopWords.add("and");\r
-               stopWords.add("cosmopolitan");\r
-               stopWords.add("s");\r
                stopWords.add("Is");\r
                stopWords.add("Is.");\r
+               stopWords.add("Islands");\r
+               stopWords.add("Island");\r
+               \r
                stopWords.add("of");\r
-               stopWords.add("bordering areas");\r
                stopWords.add("areas");\r
                stopWords.add("USA");\r
-               stopWords.add("Australia"); // except for "widesp. in Australia" !!\r
-               stopWords.add("&");\r
-               stopWords.add("part");\r
-               stopWords.add("excl");\r
-//             stopWords.add("European territory");  //part of Russian distributions\r
-               stopWords.add("northern part");\r
-               stopWords.add("Distr:");\r
-               \r
-               unknownAreas.add("Argentina");\r
+               stopWords.add("Australia"); //except for Australia only\r
+               stopWords.add("Argentina");             \r
+\r
                //unknownAreas.add("Panama");\r
                unknownAreas.add("South Africa");\r
-               unknownAreas.add("Indonesia");\r
                unknownAreas.add("Chile");\r
-//             unknownAreas.add("Wales");\r
-//             unknownAreas.add("Java");\r
-//             unknownAreas.add("former USSR: North European territory");\r
-//             unknownAreas.add("former USSR: South European territory");\r
-//             unknownAreas.add("former USSR: Soviet Middle Asia");\r
-//             unknownAreas.add("former USSR: North and Central European territory");\r
-//             unknownAreas.add("oceanian islands");\r
-//             unknownAreas.add("Ussuri region");\r
-//             unknownAreas.add("Galapagos Is.");\r
-//             unknownAreas.add("Malaysia");  // Malaysia Peninsular exists (level 4)\r
-               unknownAreas.add("West Indies");  //-> as a whole\r
-//             unknownAreas.add("Canal Zone");  \r
-//             unknownAreas.add("Polynesic Is.");  \r
-//             unknownAreas.add("Usbek SSR");  \r
-//             unknownAreas.add("Mexican amber");  \r
-//             unknownAreas.add("southern Europe");  // ->Southeastern Europe, Southwestern Europe\r
-//             unknownAreas.add("Marocco");  \r
-//             unknownAreas.add("Trinidad");  //-> Trinidad-Tobago\r
-//             unknownAreas.add("Haiti");  \r
-//             unknownAreas.add("Moluccas");  //-> Indonesia  \r
-//             unknownAreas.add("Belau");  \r
+\r
                unknownAreas.add("Baltic amber");  \r
                unknownAreas.add("Arabia"); \r
-//             unknownAreas.add("Dominican amber"); \r
-//             unknownAreas.add("Canary and Madeira Is.");  //-> Canary Is. / Madeira \r
-//             unknownAreas.add("Dominican amber"); \r
-//             unknownAreas.add("Far East"); \r
-//             unknownAreas.add("Tahiti"); \r
+\r
                        \r
                higherAreas.add("AF");\r
                higherAreas.add("OR");\r
@@ -366,7 +391,7 @@ public class DipteraDistributionParser {
         * @param args\r
         */\r
        public static void main(String[] args) {\r
-               ICdmDataSource cdmDestination = CdmDestinations.localH2();\r
+               ICdmDataSource cdmDestination = CdmDestinations.cdm_test_andreasM2();\r
                CdmApplicationController app = null;\r
                try {\r
                        DbSchemaValidation val = DbSchemaValidation.UPDATE;\r