--- /dev/null
+/**\r
+* Copyright (C) 2007 EDIT\r
+* European Distributed Institute of Taxonomy \r
+* http://www.e-taxonomy.eu\r
+* \r
+* The contents of this file are subject to the Mozilla Public License Version 1.1\r
+* See LICENSE.TXT at the top of this package for the full license terms.\r
+*/\r
+\r
+package eu.etaxonomy.cdm.app.berlinModelImport;\r
+\r
+import java.util.ArrayList;\r
+import java.util.HashSet;\r
+import java.util.List;\r
+import java.util.Set;\r
+import java.util.regex.Pattern;\r
+\r
+import org.apache.log4j.Logger;\r
+import org.springframework.transaction.TransactionStatus;\r
+\r
+import eu.etaxonomy.cdm.api.application.CdmApplicationController;\r
+import eu.etaxonomy.cdm.app.common.CdmDestinations;\r
+import eu.etaxonomy.cdm.database.DataSourceNotFoundException;\r
+import eu.etaxonomy.cdm.database.DbSchemaValidation;\r
+import eu.etaxonomy.cdm.database.ICdmDataSource;\r
+import eu.etaxonomy.cdm.model.common.Language;\r
+import eu.etaxonomy.cdm.model.common.init.TermNotFoundException;\r
+import eu.etaxonomy.cdm.model.description.DescriptionBase;\r
+import eu.etaxonomy.cdm.model.description.DescriptionElementBase;\r
+import eu.etaxonomy.cdm.model.description.Distribution;\r
+import eu.etaxonomy.cdm.model.description.Feature;\r
+import eu.etaxonomy.cdm.model.description.PresenceAbsenceTermBase;\r
+import eu.etaxonomy.cdm.model.description.PresenceTerm;\r
+import eu.etaxonomy.cdm.model.description.TaxonDescription;\r
+import eu.etaxonomy.cdm.model.description.TextData;\r
+import eu.etaxonomy.cdm.model.location.NamedArea;\r
+import eu.etaxonomy.cdm.model.location.TdwgArea;\r
+import eu.etaxonomy.cdm.model.taxon.Taxon;\r
+import eu.etaxonomy.cdm.model.taxon.TaxonBase;\r
+\r
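+/**
+ * Converts the free-text occurrence data (OCCURRENCE {@link TextData}) found in the
+ * taxon descriptions of a CDM database into {@link Distribution} elements, by matching
+ * the words of each text against TDWG area labels.
+ *
+ * @author a.mueller
+ * @created 17.10.2008
+ * @version 1.0
+ */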
+public class DipteraDistributionParser {\r
+ private static final Logger logger = Logger.getLogger(DipteraDistributionParser.class);
+ 
+ /** Separators between the tokens of an occurrence string: whitespace runs and the characters [ ] ( ). */
+ final static String epiSplitter = "(\\s+|\\[|\\]|\\(|\\))"; //( ' '+| '(' | ')'| '[' | ']' )
+ /**
+  * Compiled form of {@link #epiSplitter}. Compiled once at class initialisation so that it is
+  * never null, regardless of whether {@link #doDistribution(CdmApplicationController)} ran before.
+  */
+ static final Pattern pattern = Pattern.compile(epiSplitter);
+ 
+ /**
+  * Walks over the taxon bases returned by the taxon service and, for every {@link Taxon},
+  * feeds the English text of each OCCURRENCE {@link TextData} element of its descriptions
+  * to {@link #parseOccurenceString(String, DescriptionBase)}. Afterwards the unrecognized
+  * strings and the two counters are printed to standard out and the transaction is committed.
+  *
+  * @param app the application controller giving access to the services and the transaction
+  */
+ protected void doDistribution(CdmApplicationController app){
+     TransactionStatus txStatus = app.startTransaction();
+     // NOTE(review): arguments are presumably page size and start offset - confirm against the service API
+     List<TaxonBase> taxa = app.getTaxonService().getAllTaxonBases(1000000, 0);
+     for (TaxonBase taxon: taxa ){
+         if (! (taxon instanceof Taxon)){
+             continue;
+         }
+         // unlazyDescription(app, (Taxon)taxon);
+         Set<TaxonDescription> descriptions = ((Taxon) taxon).getDescriptions();
+         for (DescriptionBase description: descriptions){
+             // iterate over a copy: parsing adds new elements to the description itself
+             Set<DescriptionElementBase> descElements = new HashSet<DescriptionElementBase>();
+             descElements.addAll(description.getElements());
+ 
+             for (DescriptionElementBase descEl: descElements){
+                 // elements without a feature are skipped instead of raising a NullPointerException
+                 if (descEl.getFeature() == null || ! descEl.getFeature().equals(Feature.OCCURRENCE())){
+                     continue;
+                 }
+                 if (descEl instanceof TextData){
+                     String occString = ((TextData)descEl).getText(Language.ENGLISH());
+                     parseOccurenceString(occString, description);
+                 }
+             }
+         }
+     }
+     System.out.println("Unknowns: ");
+     for (String unknown: unrekognizedStrings){
+         System.out.println(unknown);
+     }
+     System.out.println("Distributions not recognized: " + countNot);
+     System.out.println("Distributions created: " + countYes);
+     app.commitTransaction(txStatus);
+ }
+ \r
+ /** Distinct text fragments that could not be matched to an area. */
+ static Set<String> unrekognizedStrings = new HashSet<String>();
+ /** Number of fragments that could not be matched to an area. */
+ static int countNot = 0;
+ /** Number of fragments that were matched to an area (a double area counts once). */
+ static int countYes = 0;
+ 
+ /**
+  * Splits an occurrence text into tokens and tries to turn each token, or a run of up to
+  * seven consecutive tokens, into {@link Distribution} elements that are added to the
+  * given description. Tokens containing "widesp", the token "in", blank tokens and short
+  * tokens ending with ":" (higher areas, not handled yet) are ignored. A "?" inside a
+  * matched run marks the resulting distribution(s) as questionable.
+  *
+  * @param occString the occurrence text, may be <code>null</code> (then nothing is parsed)
+  * @param desc the description receiving the created distributions
+  */
+ private void parseOccurenceString(String occString, DescriptionBase desc){
+     System.out.println(occString);
+     if (occString == null){
+         return;
+     }
+     String[] words = pattern.split(occString);
+     int countSkip = 0;
+     for (int i = 0; i < words.length; i++){
+         if (countSkip > 0){
+             //token was already consumed as part of the preceding multi-word name
+             countSkip--;
+             continue;
+         }
+         String word = words[i];
+         if (word.contains("widesp") || word.equals("in") || word.trim().length() == 0){
+             continue;
+         }
+         if (word.endsWith(":") && word.length() <= 4){
+             //Higher area
+             //TODO
+             continue;
+         }
+ 
+         boolean isDoubtful = false;
+         word = word.trim();
+         if (word.contains("?")){
+             isDoubtful = true;
+             word = word.replace("?", "");
+         }
+         word = adaptWordsToTdwg(word);
+ 
+         if (! "".equals(word) && ! isKnownArea(word)){
+             //extend the candidate with the following tokens until it is an area, is filtered out or 6 tokens were tried
+             for (countSkip = 1; countSkip <= 6; countSkip++){
+                 word = word.trim();
+                 if (isKnownArea(word)){
+                     break;
+                 }
+                 if (words.length > i + countSkip){
+                     word = word + " " + words[i + countSkip];
+                 }
+                 if (word.contains("?")){
+                     isDoubtful = true;
+                     word = word.replace("?", "");
+                 }
+                 word = adaptWordsToTdwg(word);
+                 if ("".equals(word)){
+                     break;
+                 }
+             }
+         }
+ 
+         if ("".equals(word)){
+             //filtered out (stop word, unknown or higher area); countSkip keeps skipping the appended tokens
+         }else if (! isKnownArea(word)){
+             countNot++;
+             System.out.println(" False:" + countNot + ": " + word);
+             unrekognizedStrings.add(word);
+             //re-examine the appended tokens one by one
+             countSkip = 0;
+         }else{
+             PresenceAbsenceTermBase<?> term = PresenceTerm.PRESENT();
+             if (isDoubtful){
+                 //applies to single and double areas alike
+                 term = PresenceTerm.INTRODUCED_PRESENCE_QUESTIONABLE();
+             }
+             if (isDoubleArea(word)){
+                 for (NamedArea area : getDoubleArea(word)){
+                     if (area == null){
+                         //never store a distribution without an area
+                         logger.warn("Area of double area could not be resolved: " + word);
+                         continue;
+                     }
+                     desc.addElement(Distribution.NewInstance(area, term));
+                 }
+             }else{
+                 NamedArea area = TdwgArea.getAreaByTdwgLabel(word);
+                 desc.addElement(Distribution.NewInstance(area, term));
+             }
+             countYes++;
+             System.out.println(" True:" + countYes + ": " + word);
+             //the loop counter is one ahead of the number of appended tokens; never go below 0
+             countSkip = Math.max(0, countSkip - 1);
+         }
+     }
+ }
+ 
+ /**
+  * Tells whether the word is a TDWG area label or one of the double areas.
+  */
+ private boolean isKnownArea(String word){
+     return TdwgArea.isTdwgAreaLabel(word) || isDoubleArea(word);
+ }
+ \r
+ /**
+  * Checks (ignoring case) whether the word is one of the three expressions that stand
+  * for two areas at once.
+  *
+  * @param word the candidate expression
+  * @return <code>true</code> if the word is a double area expression
+  */
+ private boolean isDoubleArea(String word){
+     return "Canary and Madeira Is.".equalsIgnoreCase(word)
+         || "southern Europe".equalsIgnoreCase(word)
+         || "former USSR: North and Central European territory".equalsIgnoreCase(word);
+ }
+ \r
+ /**
+  * Resolves a double area expression (see {@link #isDoubleArea(String)}) to its two areas,
+  * looked up by their TDWG abbreviations.
+  *
+  * @param word the double area expression, compared ignoring case
+  * @return an array of length 2; the entries stay <code>null</code> if the word is not a
+  * double area expression (a warning is logged in that case)
+  */
+ private NamedArea[] getDoubleArea(String word){
+     NamedArea[] result = new NamedArea[2];
+     if ("Canary and Madeira Is.".equalsIgnoreCase(word)){
+         result[0] = TdwgArea.getAreaByTdwgAbbreviation("CNY");  //Canary Is.
+         result[1] = TdwgArea.getAreaByTdwgAbbreviation("MDR");  //Madeira
+     }else if ("southern Europe".equalsIgnoreCase(word)){
+         result[0] = TdwgArea.getAreaByTdwgAbbreviation("12");  //Southwestern Europe
+         result[1] = TdwgArea.getAreaByTdwgAbbreviation("13");  //Southeastern Europe
+     }else if ("former USSR: North and Central European territory".equalsIgnoreCase(word)){
+         result[0] = TdwgArea.getAreaByTdwgAbbreviation("RUN");  //North European Russia
+         result[1] = TdwgArea.getAreaByTdwgAbbreviation("RUC");  //Central European Russia
+     }else{
+         logger.warn("Double area not recognized");
+     }
+     return result;
+ }
+ \r
+ \r
+ /** Words that never denote an area; filled lazily by {@link #initStopWords()}. */
+ static List<String> stopWords = new ArrayList<String>();
+ /** Area names without a matching TDWG area; filled by {@link #initStopWords()}. */
+ static List<String> unknownAreas = new ArrayList<String>();
+ /** Abbreviations of higher areas; filled by {@link #initStopWords()}. */
+ static List<String> higherAreas = new ArrayList<String>();
+ 
+ /**
+  * Normalizes a word (or run of words) towards the spelling of the TDWG area labels:
+  * removes "," "." and ";", restores the trailing dot of "Is.", and rewrites a fixed set
+  * of names. A rewrite whose result contains its own search text is only applied if the
+  * result is not present yet, so the method can safely be applied again to its own output.
+  *
+  * @param word the word to normalize, must not be <code>null</code>
+  * @return the normalized word, or the empty string if the word is a stop word, an
+  * unknown area or a higher area
+  */
+ private String adaptWordsToTdwg(String word){
+     word = word.replace(",", "").replace(".", "").replace(";", "");
+     word = word.replace("Caronlina", "Carolina");
+ 
+     word = word.trim();
+     if (word.endsWith("Is")){
+         word = word + ".";
+     }
+     if (stopWords.size() == 0){
+         initStopWords();
+     }
+ 
+     word = word.replace("Russia [North European territory]", "North European Russia");
+     word = word.replace("Russia North European territory", "North European Russia");
+     word = word.replace("Russia: North European territory", "North European Russia");
+ 
+     word = word.replace("Amber", "amber");
+ 
+     word = replaceUnlessPresent(word, "Prince Edward Is.", "Marion-Prince Edward Is.");
+     //or word = word.replace("Prince Edward Is.", "Prince Edward I.");
+     word = word.replace("Bahama Is.", "Bahamas");
+     word = word.replace("Comores Is.", "Comoros");
+     word = word.replace("former Yugoslavia", "Yugoslavia");
+     word = word.replace("former Czechoslovakia", "Czechoslovakia");
+     word = word.replace("Rhodesia", "Zimbabwe");
+     word = replaceUnlessPresent(word, "Salvador", "El Salvador");
+     word = word.replace("Vera Cruz", "Veracruz");
+     word = word.replace("Turkmenia", "Turkmenistan");
+     word = word.replace("Quebec", "Québec");
+     word = word.replace("Gambia", "Gambia, The");
+     word = word.replace("Mariana Is.", "Marianas");
+     word = word.replace("Kenia", "Kenya");
+     if (! word.contains("Central African")){
+         //leave "Central African" alone so that "Central African Republic" can still be completed
+         word = word.replace("Central Africa", "Central African Republic");
+     }
+     word = word.replace("Panama", "Panamá");
+     if (! word.contains("New South Wales")){
+         //New South Wales is an Australian state, not part of Great Britain
+         word = word.replace("Wales", "Great Britain");
+     }
+     word = word.replace("Java", "Jawa");
+     word = word.replace("former USSR: North European territory", "North European Russia");
+     word = word.replace("former USSR: South European territory", "South European Russia");
+     word = word.replace("former USSR: Soviet Middle Asia", "Middle Asia");
+ 
+     word = word.replace("oceanian islands", "Pacific");
+     word = word.replace("Primorye", "Ussuri region");
+     word = word.replace("Galapagos Is.", "Galápagos");
+     word = replaceUnlessPresent(word, "Malaysia", "Peninsular Malaysia");
+     word = word.replace("Canal Zone", "Panamá");
+     word = word.replace("Polynesic Is.", "South Solomons");
+ 
+     word = word.replace("Usbek SSR", "Uzbekistan");
+     word = word.replace("Mexican amber", "Mexico");
+     //"southern Europe" is deliberately not rewritten: it is a double area, see isDoubleArea(String)
+     word = word.replace("Marocco", "Morocco");
+     word = replaceUnlessPresent(word, "Trinidad", "Trinidad-Tobago");
+     word = word.replace("Moluccas", "Maluku");
+     word = word.replace("Belau", "Palau");
+     word = word.replace("Dominican amber", "Dominican Republic");
+     word = replaceUnlessPresent(word, "Far East", "Russian Far East");
+     word = word.replace("Tahiti", "Society Is.");
+ 
+     //the filter lists are filled once by initStopWords(), they must not grow with every call
+     if (stopWords.contains(word)){
+         System.out.println(" STOP: " + word);
+         return "";
+     }
+     if (unknownAreas.contains(word)){
+         System.out.println(" UNKNOWN: " + word);
+         return "";
+     }
+     if (higherAreas.contains(word)){
+         return "";
+     }
+ 
+     //higher regions
+ 
+     return word;
+ }
+ 
+ /**
+  * Replaces all occurrences of <code>from</code> by <code>to</code>, unless the word already
+  * contains <code>to</code>.
+  */
+ private String replaceUnlessPresent(String word, String from, String to){
+     return word.contains(to) ? word : word.replace(from, to);
+ }
+ \r
+ /**
+  * Fills the three static filter lists (stop words, unknown areas, higher areas) with
+  * their fixed entries. Entries are appended, so the method is meant to be called once.
+  */
+ private void initStopWords(){
+     String[] stops = {
+         "to", "also", "almost", "and", "cosmopolitan", "s", "Is", "Is.", "of",
+         "bordering areas", "areas", "USA",
+         "Australia",  // except for "widesp. in Australia" !!
+         "&", "part", "excl",
+         // "European territory" is not a stop word: it is part of the Russian distributions
+         "northern part", "Distr:"
+     };
+     for (String stop : stops){
+         stopWords.add(stop);
+     }
+ 
+     //names that are rewritten in adaptWordsToTdwg (e.g. Wales, Java, Malaysia, Trinidad) are not listed here
+     String[] unknowns = {
+         "Argentina", "South Africa", "Indonesia", "Chile",
+         "former USSR: North and Central European territory",
+         "West Indies",  //-> as a whole
+         "Baltic amber", "Arabia"
+     };
+     for (String unknown : unknowns){
+         unknownAreas.add(unknown);
+     }
+ 
+     String[] highers = {"AF", "OR", "PA", "AU", "NE", "NT"};
+     for (String higher : highers){
+         higherAreas.add(higher);
+     }
+ }
+\r
+ \r
+ /**
+  * Starts an application controller on the local H2 destination (schema validation
+  * UPDATE) and runs the distribution parsing on it. If the controller cannot be
+  * created the stack trace is printed and a warning is logged.
+  *
+  * @param args not used
+  */
+ public static void main(String[] args) {
+     ICdmDataSource cdmDestination = CdmDestinations.localH2();
+     CdmApplicationController app = null;
+     try {
+         app = CdmApplicationController.NewInstance(cdmDestination, DbSchemaValidation.UPDATE);
+     } catch (DataSourceNotFoundException e) {
+         e.printStackTrace();
+     } catch (TermNotFoundException e) {
+         e.printStackTrace();
+     }
+     if (app == null){
+         logger.warn("No Application Context");
+         return;
+     }
+     new DipteraDistributionParser().doDistribution(app);
+ }
+}\r