DipteraDistribution and Cichorieae Feature Tree
author Andreas Müller <a.mueller@bgbm.org>
Tue, 4 Nov 2008 16:37:31 +0000 (16:37 +0000)
committer Andreas Müller <a.mueller@bgbm.org>
Tue, 4 Nov 2008 16:37:31 +0000 (16:37 +0000)
.gitattributes
app-import/src/main/java/eu/etaxonomy/cdm/app/berlinModelImport/CichorieaeActivator.java
app-import/src/main/java/eu/etaxonomy/cdm/app/berlinModelImport/DipteraActivator.java
app-import/src/main/java/eu/etaxonomy/cdm/app/berlinModelImport/DipteraDistributionParser.java [new file with mode: 0644]

index aee1d2912fab6f417dc5c04c7aa0510dc007b61f..8c17273d1b1f66c0c561a5d496f95f46b75a26a6 100644 (file)
@@ -8,6 +8,7 @@ app-import/src/main/java/eu/etaxonomy/cdm/app/berlinModelImport/BerlinModelImpor
 app-import/src/main/java/eu/etaxonomy/cdm/app/berlinModelImport/BerlinModelSources.java -text
 app-import/src/main/java/eu/etaxonomy/cdm/app/berlinModelImport/CichorieaeActivator.java -text
 app-import/src/main/java/eu/etaxonomy/cdm/app/berlinModelImport/DipteraActivator.java -text
+app-import/src/main/java/eu/etaxonomy/cdm/app/berlinModelImport/DipteraDistributionParser.java -text
 app-import/src/main/java/eu/etaxonomy/cdm/app/berlinModelImport/ErmsActivator.java -text
 app-import/src/main/java/eu/etaxonomy/cdm/app/berlinModelImport/EuroMedActivator.java -text
 app-import/src/main/java/eu/etaxonomy/cdm/app/berlinModelImport/SalvadorActivator.java -text
index 8c62d0205c37c0621484512a6dca21d265db0da3..41f85d402bf2dc3c43d9489f9f400493fe5afa11 100644 (file)
@@ -52,7 +52,7 @@ public class CichorieaeActivator {
        static final int sourceSecId = 7800000;\r
        \r
        static final UUID featureTreeUuid = UUID.fromString("ae9615b8-bc60-4ed0-ad96-897f9226d568");\r
-       static final Object[] featureKeyList = new Integer[]{5,10,11,12};       \r
+       static final Object[] featureKeyList = new Integer[]{1, 4, 5, 10, 11, 12};      \r
        \r
        static final String mediaUrlString = "http://wp5.e-taxonomy.eu/dataportal/cichorieae/media/protolog/";\r
        //Mac\r
index c920a3ceb11443ec19bc89b4b3b360c0f3760cde..7157c4ced76afe2ef7b7aaf56f987906a513f389 100644 (file)
@@ -22,7 +22,6 @@ import eu.etaxonomy.cdm.io.common.CdmDefaultImport;
 import eu.etaxonomy.cdm.io.common.Source;\r
 import eu.etaxonomy.cdm.io.common.IImportConfigurator.CHECK;\r
 import eu.etaxonomy.cdm.io.common.IImportConfigurator.DO_REFERENCES;\r
-import eu.etaxonomy.cdm.io.tcs.TcsImportConfigurator;\r
 import eu.etaxonomy.cdm.model.common.ISourceable;\r
 import eu.etaxonomy.cdm.model.description.FeatureTree;\r
 import eu.etaxonomy.cdm.model.name.NomenclaturalCode;\r
@@ -137,7 +136,11 @@ public class DipteraActivator {
                        CdmApplicationController app = bmImportConfigurator.getCdmAppController();\r
                        ISourceable obj = app.getCommonService().getSourcedObjectByIdInSource(ZoologicalName.class, "1000027", null);\r
                        logger.info(obj);\r
-                       \r
+               \r
+                       //parse distributions\r
+                       DipteraDistributionParser dipDist = new DipteraDistributionParser();\r
+                       dipDist.doDistribution(app);\r
+                                               \r
                        //make feature tree\r
                        FeatureTree tree = TreeCreator.flatTree(featureTreeUuid, bmImportConfigurator.getFeatureMap(), featureKeyList);\r
                        app = bmImportConfigurator.getCdmAppController();\r
diff --git a/app-import/src/main/java/eu/etaxonomy/cdm/app/berlinModelImport/DipteraDistributionParser.java b/app-import/src/main/java/eu/etaxonomy/cdm/app/berlinModelImport/DipteraDistributionParser.java
new file mode 100644 (file)
index 0000000..fa3e377
--- /dev/null
@@ -0,0 +1,385 @@
+/**\r
+* Copyright (C) 2007 EDIT\r
+* European Distributed Institute of Taxonomy \r
+* http://www.e-taxonomy.eu\r
+* \r
+* The contents of this file are subject to the Mozilla Public License Version 1.1\r
+* See LICENSE.TXT at the top of this package for the full license terms.\r
+*/\r
+\r
+package eu.etaxonomy.cdm.app.berlinModelImport;\r
+\r
+import java.util.ArrayList;\r
+import java.util.HashSet;\r
+import java.util.List;\r
+import java.util.Set;\r
+import java.util.regex.Pattern;\r
+\r
+import org.apache.log4j.Logger;\r
+import org.springframework.transaction.TransactionStatus;\r
+\r
+import eu.etaxonomy.cdm.api.application.CdmApplicationController;\r
+import eu.etaxonomy.cdm.app.common.CdmDestinations;\r
+import eu.etaxonomy.cdm.database.DataSourceNotFoundException;\r
+import eu.etaxonomy.cdm.database.DbSchemaValidation;\r
+import eu.etaxonomy.cdm.database.ICdmDataSource;\r
+import eu.etaxonomy.cdm.model.common.Language;\r
+import eu.etaxonomy.cdm.model.common.init.TermNotFoundException;\r
+import eu.etaxonomy.cdm.model.description.DescriptionBase;\r
+import eu.etaxonomy.cdm.model.description.DescriptionElementBase;\r
+import eu.etaxonomy.cdm.model.description.Distribution;\r
+import eu.etaxonomy.cdm.model.description.Feature;\r
+import eu.etaxonomy.cdm.model.description.PresenceAbsenceTermBase;\r
+import eu.etaxonomy.cdm.model.description.PresenceTerm;\r
+import eu.etaxonomy.cdm.model.description.TaxonDescription;\r
+import eu.etaxonomy.cdm.model.description.TextData;\r
+import eu.etaxonomy.cdm.model.location.NamedArea;\r
+import eu.etaxonomy.cdm.model.location.TdwgArea;\r
+import eu.etaxonomy.cdm.model.taxon.Taxon;\r
+import eu.etaxonomy.cdm.model.taxon.TaxonBase;\r
+\r
+/**\r
+ * @author a.mueller\r
+ * @created 17.10.2008\r
+ * @version 1.0\r
+ */\r
+public class DipteraDistributionParser {\r
+       private static final Logger logger = Logger.getLogger(DipteraDistributionParser.class);\r
+       \r
+       /** Separators of an occurrence string: whitespace runs, '[', ']', '(' and ')'. */\r
+       final static String epiSplitter = "(\\s+|\\[|\\]|\\(|\\))"; //( ' '+| '(' | ')'| '[' | ']' )\r
+       /** Compiled form of {@link #epiSplitter}; it never changes, so it is compiled only once. */\r
+       static final Pattern pattern = Pattern.compile(epiSplitter);\r
+       \r
+       /**\r
+        * Parses the occurrence texts of the stored taxa and adds the recognized areas as distributions.\r
+        * <p>\r
+        * Inside one transaction the taxon bases are requested from the taxon service. For every\r
+        * {@link Taxon}, each description element that is a {@link TextData} with feature\r
+        * {@link Feature#OCCURRENCE()} has its English text handed to\r
+        * {@link #parseOccurenceString(String, DescriptionBase)}. A summary of the run is printed to\r
+        * standard out before the transaction is committed.\r
+        *\r
+        * @param app the application controller used for transactions and for access to the taxon service\r
+        */\r
+       protected void doDistribution(CdmApplicationController app){\r
+               //the statistics are static, reset them so that the summary below covers this run only\r
+               unrekognizedStrings.clear();\r
+               countNot = 0;\r
+               countYes = 0;\r
+               \r
+               TransactionStatus txStatus = app.startTransaction();\r
+               //presumably (limit, start): large enough to fetch all taxa at once - TODO confirm\r
+               List<TaxonBase> taxa = app.getTaxonService().getAllTaxonBases(1000000, 0);\r
+               for (TaxonBase taxon: taxa ){\r
+                       if (! (taxon instanceof Taxon)){\r
+                               continue;\r
+                       }\r
+                       Set<TaxonDescription> descriptions = ((Taxon) taxon).getDescriptions();\r
+                       for (DescriptionBase description: descriptions){\r
+                               //iterate over a copy, parsing adds new elements to the description\r
+                               Set<DescriptionElementBase> descElements = new HashSet<DescriptionElementBase>();\r
+                               descElements.addAll(description.getElements());\r
+                               \r
+                               for (DescriptionElementBase descEl: descElements){\r
+                                       //constant first, an element without feature must not throw a NullPointerException\r
+                                       boolean isOccurrence = Feature.OCCURRENCE().equals(descEl.getFeature());\r
+                                       if (isOccurrence && descEl instanceof TextData){\r
+                                               String occString = ((TextData)descEl).getText(Language.ENGLISH());\r
+                                               parseOccurenceString(occString, description);\r
+                                       }\r
+                               }\r
+                       }\r
+               }\r
+               System.out.println("Unknowns: ");\r
+               for (String unknown: unrekognizedStrings){\r
+                       System.out.println(unknown);\r
+               }\r
+               System.out.println("Distributions not recognized: " + countNot);\r
+               System.out.println("Distributions created: " + countYes);\r
+               app.commitTransaction(txStatus);\r
+       }\r
+       \r
+       /** Strings that could not be matched to an area during the current run. */\r
+       static Set<String> unrekognizedStrings = new HashSet<String>();\r
+       /** Number of strings that were not recognized as an area during the current run. */\r
+       static int countNot = 0;\r
+       /** Number of strings that were recognized as an area during the current run. */\r
+       static int countYes = 0;\r
+       \r
+       /**\r
+        * Splits an occurrence text into tokens and adds a {@link Distribution} to the description for\r
+        * every token (or run of consecutive tokens) that is recognized as an area.\r
+        * <p>\r
+        * A string counts as recognized if {@link TdwgArea#isTdwgAreaLabel(String)} or\r
+        * {@link #isDoubleArea(String)} accepts it after normalization by {@link #adaptWordsToTdwg(String)}.\r
+        * If a single token is not recognized, up to 6 following tokens are appended one by one until the\r
+        * combined string is recognized or normalizes to the empty string. Recognized strings increase\r
+        * <code>countYes</code>, all others increase <code>countNot</code> and are collected in\r
+        * <code>unrekognizedStrings</code>. The text and every decision are echoed to standard out.\r
+        *\r
+        * @param occString the occurrence text, <code>null</code> is ignored\r
+        * @param desc the description the created distributions are added to\r
+        */\r
+       private void parseOccurenceString(String occString, DescriptionBase desc){\r
+               System.out.println(occString);\r
+               if (occString != null){\r
+                       String[] words = pattern.split(occString);\r
+                       //index of the current token within words\r
+                       int i = 0;\r
+                       //number of following tokens already consumed by a multi token name, they are skipped\r
+                       int countSkip = 0;\r
+                       for (String word: words){\r
+                               boolean isDoubtful = false;\r
+                               if (countSkip > 0){\r
+                                       countSkip--;\r
+                               }else if(word.contains("widesp") || word.equals("in")) {\r
+                                       //skip\r
+                               }else if(word.trim().length() == 0){\r
+                                       //skip\r
+                               }else{\r
+                                       //a short token (at most 4 characters) ending with ':' is taken as higher area marker\r
+                                       if (word.endsWith(":") && word.length()<=4){\r
+                                               //Higher area\r
+                                               //TODO\r
+                                       }else{\r
+                                               word = word.trim();\r
+                                               //a question mark flags the record as doubtful and is removed from the name\r
+                                               if (word.contains("?")){\r
+                                                       isDoubtful = true;\r
+                                                       word = word.replace("?", "");\r
+                                               }\r
+                                               word = adaptWordsToTdwg(word);\r
+                                               \r
+                                               //single token not recognized: try to build a multi token name\r
+                                               if (! "".equals(word) && ! TdwgArea.isTdwgAreaLabel(word) && ! isDoubleArea(word)){\r
+                                                       for (countSkip = 1; countSkip <= 6; countSkip++){\r
+                                                               word = word.trim();\r
+                                                               if (! TdwgArea.isTdwgAreaLabel(word) && ! isDoubleArea(word)){\r
+                                                                       //append the next token if there is one left\r
+                                                                       if (words.length > i + countSkip){\r
+                                                                               word = word + " " + words[i + countSkip];\r
+                                                                       }\r
+                                                                       if (word.contains("?")){\r
+                                                                               isDoubtful = true;\r
+                                                                               word = word.replace("?", "");\r
+                                                                       }\r
+                                                                       word = adaptWordsToTdwg(word);\r
+                                                                       //empty result: the combined string is a stop word, unknown or higher area\r
+                                                                       if ("".equals(word)){\r
+                                                                               break;\r
+                                                                       }\r
+                                                               }else{\r
+                                                                       //match found; countSkip is now one ahead of the number of appended tokens\r
+                                                                       break;\r
+                                                               }\r
+                                                       }\r
+                                               }\r
+                                               if ("".equals(word)){\r
+                                                       //countSkip = countSkip;\r
+                                               }else if (! TdwgArea.isTdwgAreaLabel(word) && ! isDoubleArea(word)  ){\r
+                                                       if (word.contains("?")){\r
+                                                               logger.warn("XXX");\r
+                                                       }\r
+                                                       countNot++;\r
+                                                       System.out.println("   False:" + countNot + ": " + word);\r
+                                                       unrekognizedStrings.add(word);\r
+                                                       //nothing was matched, so the following tokens are parsed on their own again\r
+                                                       countSkip = 0;\r
+                                               }else{\r
+                                                       PresenceAbsenceTermBase<?> term = PresenceTerm.PRESENT();\r
+                                                       if (isDoubleArea(word)){\r
+                                                               //NOTE(review): isDoubtful is not evaluated for double areas - confirm this is intended\r
+                                                               NamedArea[] doubleArea = getDoubleArea(word);\r
+                                                               for (NamedArea area : doubleArea){\r
+                                                                       Distribution distr = Distribution.NewInstance(area, term);\r
+                                                                       desc.addElement(distr);\r
+                                                               }\r
+                                                       }else{\r
+                                                               NamedArea area = TdwgArea.getAreaByTdwgLabel(word);\r
+                                                               //NOTE(review): doubtful records get the "introduced" questionable term - confirm the term\r
+                                                               if (isDoubtful){\r
+                                                                       term = PresenceTerm.INTRODUCED_PRESENCE_QUESTIONABLE();\r
+                                                               }\r
+                                                               Distribution distr = Distribution.NewInstance(area, term);\r
+                                                               desc.addElement(distr);\r
+                                                       }\r
+                                                       countYes++;\r
+                                                       System.out.println("      True:" + countYes + ": " + word);\r
+                                                       //corrects the loop counter to the number of appended tokens (-1 if no loop ran, which skips nothing)\r
+                                                       countSkip--;\r
+                                               }\r
+                                       }\r
+                               }\r
+                               i++;\r
+                       }\r
+               }\r
+       }\r
+       \r
+       /**\r
+        * Tells whether the given string is one of the compound area names that stand for two areas.\r
+        * The comparison ignores case.\r
+        *\r
+        * @param word the string to test, may be <code>null</code>\r
+        * @return <code>true</code> if the string is a known compound area name\r
+        */\r
+       private boolean isDoubleArea(String word){\r
+               final String[] doubleAreaNames = {\r
+                               "Canary and Madeira Is.",\r
+                               "southern Europe",\r
+                               "former USSR: North and Central European territory"\r
+               };\r
+               for (String doubleAreaName : doubleAreaNames){\r
+                       if (doubleAreaName.equalsIgnoreCase(word)){\r
+                               return true;\r
+                       }\r
+               }\r
+               return false;\r
+       }\r
+       \r
+       /**\r
+        * Resolves a compound area name (see {@link #isDoubleArea(String)}) to the areas it stands for.\r
+        * The areas are looked up by their TDWG labels; the comparison of the name ignores case.\r
+        *\r
+        * @param word the compound area name\r
+        * @return the areas covered by the name, an empty array if the name is not a known compound area\r
+        */\r
+       private NamedArea[] getDoubleArea(String word){\r
+               //labels as named in the TDWG geographical scheme - TODO verify against the loaded TDWG vocabulary\r
+               String[] labels;\r
+               if ("Canary and Madeira Is.".equalsIgnoreCase(word)){\r
+                       labels = new String[]{"Canary Is.", "Madeira"};\r
+               }else if ("southern Europe".equalsIgnoreCase(word)){\r
+                       labels = new String[]{"Southwestern Europe", "Southeastern Europe"};\r
+               }else if ("former USSR: North and Central European territory".equalsIgnoreCase(word)){\r
+                       labels = new String[]{"North European Russia", "Central European Russia"};\r
+               }else{\r
+                       logger.warn("Double area not recognized: " + word);\r
+                       //empty instead of null filled, the caller creates one distribution per array entry\r
+                       return new NamedArea[0];\r
+               }\r
+               NamedArea[] result = new NamedArea[labels.length];\r
+               for (int i = 0; i < labels.length; i++){\r
+                       result[i] = TdwgArea.getAreaByTdwgLabel(labels[i]);\r
+               }\r
+               return result;\r
+       }\r
+       \r
+       \r
+       /** Strings that carry no area information; filled by {@link #initStopWords()}. */\r
+       static final List<String> stopWords = new ArrayList<String>();\r
+       /** Area names that are deliberately not mapped to an area; filled by {@link #initStopWords()}. */\r
+       static final List<String> unknownAreas = new ArrayList<String>();\r
+       /** Abbreviations that are treated as higher areas; filled by {@link #initStopWords()}. */\r
+       static final List<String> higherAreas = new ArrayList<String>();\r
+       \r
+       /**\r
+        * Normalizes an area name taken from an occurrence text so that it can be compared with TDWG labels.\r
+        * <p>\r
+        * Commas, dots and semicolons are removed (the dot of a trailing "Is" is added again), known\r
+        * spelling variants and historical names are replaced, and stop words, unknown areas and higher\r
+        * areas are mapped to the empty string. The method is called repeatedly on growing strings, so\r
+        * replacements whose result contains the search string are skipped if the result is already present.\r
+        *\r
+        * @param word the raw area name, must not be <code>null</code>\r
+        * @return the normalized name, or the empty string if the name is to be ignored\r
+        */\r
+       private String adaptWordsToTdwg(String word){\r
+               word = word.replace(",", "").replace(".", "").replace(";", "");\r
+               word = word.replace("Caronlina", "Carolina");\r
+               \r
+               word = word.trim();\r
+               if (word.endsWith("Is")){\r
+                       //restore the abbreviation dot that was removed above\r
+                       word = word + ".";\r
+               }\r
+               if (stopWords.size() == 0){\r
+                       initStopWords();\r
+               }\r
+               \r
+               word = word.replace("Russia [North European territory]", "North European Russia");\r
+               word = word.replace("Russia North European territory", "North European Russia");\r
+               word = word.replace("Russia: North European territory", "North European Russia");\r
+               \r
+               word = word.replace("Amber", "amber");\r
+               \r
+               word = replaceUnlessPresent(word, "Prince Edward Is.", "Marion-Prince Edward Is.");\r
+               //or word = word.replace("Prince Edward Is.", "Prince Edward I.");\r
+               word = word.replace("Bahama Is.", "Bahamas");\r
+               word = word.replace("Comores Is.", "Comoros");\r
+               word = word.replace("former Yugoslavia", "Yugoslavia");\r
+               word = word.replace("former Czechoslovakia", "Czechoslovakia");\r
+               word = word.replace("Rhodesia", "Zimbabwe");\r
+               word = replaceUnlessPresent(word, "Salvador", "El Salvador");\r
+               word = word.replace("Vera Cruz", "Veracruz");\r
+               word = word.replace("Turkmenia", "Turkmenistan");\r
+               word = word.replace("Quebec", "Québec");\r
+               //no guard needed: the comma of an earlier result has been removed at the top of this method\r
+               word = word.replace("Gambia", "Gambia, The");\r
+               word = word.replace("Mariana Is.", "Marianas");\r
+               word = word.replace("Kenia", "Kenya");\r
+               word = replaceUnlessPresent(word, "Central Africa", "Central African Republic");\r
+               word = word.replace("Panama", "Panamá");\r
+               if (! word.contains("New South Wales")){\r
+                       //"New South Wales" must be kept as it is\r
+                       word = word.replace("Wales", "Great Britain");\r
+               }\r
+               word = word.replace("Java", "Jawa");\r
+               word = word.replace("former USSR: North European territory", "North European Russia");\r
+               word = word.replace("former USSR: South European territory", "South European Russia");\r
+               word = word.replace("former USSR: Soviet Middle Asia", "Middle Asia");\r
+               \r
+               word = word.replace("oceanian islands", "Pacific");\r
+               word = word.replace("Primorye", "Ussuri region");\r
+               word = word.replace("Galapagos Is.", "Galápagos");\r
+               word = replaceUnlessPresent(word, "Malaysia", "Peninsular Malaysia");\r
+               word = word.replace("Canal Zone", "Panamá");\r
+               word = word.replace("Polynesic Is.", "South Solomons");\r
+               \r
+               word = word.replace("Usbek SSR", "Uzbekistan");\r
+               word = word.replace("Mexican amber", "Mexico");\r
+               //NOTE(review): looks like a copy of the Primorye mapping; it keeps "southern Europe" from\r
+               //ever being handled by isDoubleArea - confirm and remove\r
+               word = word.replace("southern Europe", "Ussuri region");\r
+               word = word.replace("Marocco", "Morocco");\r
+               word = replaceUnlessPresent(word, "Trinidad", "Trinidad-Tobago");\r
+               word = word.replace("Moluccas", "Maluku");\r
+               word = word.replace("Belau", "Palau");\r
+               word = word.replace("Dominican amber", "Dominican Republic");\r
+               word = replaceUnlessPresent(word, "Far East", "Russian Far East");\r
+               word = word.replace("Tahiti", "Society Is.");\r
+               \r
+               if (stopWords.contains(word)){\r
+                       System.out.println("         STOP: " + word);\r
+                       return "";\r
+               }\r
+               //NOTE(review): "former USSR: North and Central European territory" is listed as unknown area\r
+               //and therefore never reaches isDoubleArea - confirm which handling is wanted\r
+               if (unknownAreas.contains(word)){\r
+                       System.out.println("         UNKNOWN: " + word);\r
+                       return "";\r
+               }\r
+               if (higherAreas.contains(word)){\r
+                       return "";\r
+               }\r
+               \r
+               return word;\r
+       }\r
+       \r
+       /**\r
+        * Replaces all occurrences of target by replacement, unless the word already contains the\r
+        * replacement. Guards replacements whose result contains the search string against being\r
+        * applied twice.\r
+        */\r
+       private static String replaceUnlessPresent(String word, String target, String replacement){\r
+               if (word.contains(replacement)){\r
+                       return word;\r
+               }\r
+               return word.replace(target, replacement);\r
+       }\r
+       \r
+       /**\r
+        * Fills the lists of stop words, unknown areas and higher areas that are evaluated by\r
+        * {@link #adaptWordsToTdwg(String)}. Every call appends all entries again.\r
+        */\r
+       private void initStopWords(){\r
+               //"Australia" is a stop word except for "widesp. in Australia" !!\r
+               final String[] stopWordEntries = {\r
+                               "to", "also", "almost", "and", "cosmopolitan", "s", "Is", "Is.", "of",\r
+                               "bordering areas", "areas", "USA", "Australia", "&", "part", "excl",\r
+                               "northern part", "Distr:"\r
+               };\r
+               for (String stopWordEntry : stopWordEntries){\r
+                       stopWords.add(stopWordEntry);\r
+               }\r
+               \r
+               //"West Indies" is only known as a whole\r
+               final String[] unknownAreaEntries = {\r
+                               "Argentina", "South Africa", "Indonesia", "Chile",\r
+                               "former USSR: North and Central European territory",\r
+                               "West Indies", "Baltic amber", "Arabia"\r
+               };\r
+               for (String unknownAreaEntry : unknownAreaEntries){\r
+                       unknownAreas.add(unknownAreaEntry);\r
+               }\r
+               \r
+               final String[] higherAreaEntries = {"AF", "OR", "PA", "AU", "NE", "NT"};\r
+               for (String higherAreaEntry : higherAreaEntries){\r
+                       higherAreas.add(higherAreaEntry);\r
+               }\r
+       }\r
+\r
+       \r
+       /**\r
+        * Runs the distribution parser against the local H2 destination with schema validation\r
+        * {@link DbSchemaValidation#UPDATE}. If the application controller can not be created the\r
+        * cause is logged and nothing is parsed.\r
+        *\r
+        * @param args not used\r
+        */\r
+       public static void main(String[] args) {\r
+               ICdmDataSource cdmDestination = CdmDestinations.localH2();\r
+               CdmApplicationController app = null;\r
+               try {\r
+                       DbSchemaValidation val = DbSchemaValidation.UPDATE;\r
+                       app = CdmApplicationController.NewInstance(cdmDestination, val);\r
+               } catch (DataSourceNotFoundException e) {\r
+                       //log through the configured logger instead of printing the stack trace to stderr\r
+                       logger.error("Data source not found", e);\r
+               } catch (TermNotFoundException e) {\r
+                       logger.error("Term not found while starting the application controller", e);\r
+               }\r
+               if (app == null){\r
+                       logger.warn("No Application Context");\r
+                       return;\r
+               }\r
+               DipteraDistributionParser dipDist = new DipteraDistributionParser();\r
+               dipDist.doDistribution(app);\r
+       }\r
+}\r