app-import/src/main/java/eu/etaxonomy/cdm/app/wp6/diptera/DipteraDistributionParser.java

   1 /**
   2 * Copyright (C) 2007 EDIT
   3 * European Distributed Institute of Taxonomy
   4 * http://www.e-taxonomy.eu
   5 *
   6 * The contents of this file are subject to the Mozilla Public License Version 1.1
   7 * See LICENSE.TXT at the top of this package for the full license terms.
   8 */
   9
  10 /**
  11 * Copyright (C) 2007 EDIT
  12 * European Distributed Institute of Taxonomy
  13 * http://www.e-taxonomy.eu
  14 *
  15 * The contents of this file are subject to the Mozilla Public License Version 1.1
  16 * See LICENSE.TXT at the top of this package for the full license terms.
  17 */
  18 package eu.etaxonomy.cdm.app.wp6.diptera;
  19
  20 import java.util.ArrayList;
  21 import java.util.HashSet;
  22 import java.util.List;
  23 import java.util.Set;
  24 import java.util.regex.Pattern;
  25
  26 import org.apache.log4j.Logger;
  27 import org.springframework.transaction.TransactionStatus;
  28
  29 import eu.etaxonomy.cdm.api.application.CdmApplicationController;
  30 import eu.etaxonomy.cdm.api.application.ICdmRepository;
  31 import eu.etaxonomy.cdm.app.common.CdmDestinations;
  32 import eu.etaxonomy.cdm.database.DbSchemaValidation;
  33 import eu.etaxonomy.cdm.database.ICdmDataSource;
  34 import eu.etaxonomy.cdm.io.common.TdwgAreaProvider;
  35 import eu.etaxonomy.cdm.model.common.Language;
  36 import eu.etaxonomy.cdm.model.description.DescriptionBase;
  37 import eu.etaxonomy.cdm.model.description.DescriptionElementBase;
  38 import eu.etaxonomy.cdm.model.description.Distribution;
  39 import eu.etaxonomy.cdm.model.description.Feature;
  40 import eu.etaxonomy.cdm.model.description.PresenceAbsenceTerm;
  41 import eu.etaxonomy.cdm.model.description.TaxonDescription;
  42 import eu.etaxonomy.cdm.model.description.TextData;
  43 import eu.etaxonomy.cdm.model.location.NamedArea;
  44 import eu.etaxonomy.cdm.model.taxon.Taxon;
  45 import eu.etaxonomy.cdm.model.taxon.TaxonBase;
  46
  47 /**
  48  * @author a.mueller
  49  * @created 17.10.2008
  50  * @version 1.0
  51  */
  52 public class DipteraDistributionParser {
  53         private static final Logger logger = Logger.getLogger(DipteraDistributionParser.class);
  54
  55         private static ICdmDataSource cdmDestination = CdmDestinations.localH2();
  56
  57         final static String epiSplitter = "(\\s+|\\[|\\]|\\(|\\))"; //( ' '+| '(' | ')'| '[' | ']' )
  58         static Pattern pattern = null;
  59
  60         protected void doDistribution(ICdmRepository app){
  61                 pattern = Pattern.compile(epiSplitter);
  62             TransactionStatus txStatus = app.startTransaction();
  63                 List<TaxonBase> taxa = app.getTaxonService().list(null, null, null, null, null);
  64                 for (TaxonBase taxon: taxa ){
  65                         if (taxon instanceof Taxon){
  66                 //              unlazyDescription(app, (Taxon)taxon);
  67                                 Set<TaxonDescription> descriptions = ((Taxon) taxon).getDescriptions();
  68                                 for (DescriptionBase description: descriptions){
  69                                         Set<DescriptionElementBase> descElements = new HashSet<DescriptionElementBase>();
  70                                         descElements.addAll(description.getElements());
  71
  72                                         for (DescriptionElementBase descEl: descElements){
  73                                                 if (descEl.getFeature().equals(Feature.OCCURRENCE())){
  74                                                         if (descEl instanceof TextData){
  75                                                                 String occString = ((TextData)descEl).getText(Language.ENGLISH());
  76                                                                 parseOccurenceString(occString, description);
  77                                                                 //app.getTaxonService().saveTaxon(taxon);
  78                                                         }
  79                                                 }
  80                                         }
  81                                 }
  82                         }
  83                 }
  84                 System.out.println("Unknowns: ");
  85                 for (String unknown: unrekognizedStrings){
  86                         System.out.println(unknown);
  87                 }
  88                 System.out.println("Distributions not recognized: " + countNot);
  89                 System.out.println("Distributions created: " + countYes);
  90                 app.commitTransaction(txStatus);
  91         }
  92
  93         static Set<String> unrekognizedStrings = new HashSet<String>();
  94         static int countNot = 0;
  95         static int countYes = 0;
  96
  97         private void parseOccurenceString(String occString, DescriptionBase desc){
  98                 System.out.println(occString);
  99                 if (occString != null){
 100                         String[] words = pattern.split(occString);
 101                         int i = 0;
 102                         int countSkip = 0;
 103                         for (String word: words){
 104                                 if (word.contains("U.S.A")){
 105                                         logger.warn("U.S.A.");
 106                                 }
 107                                 boolean isDoubtful = false;
 108                                 if (countSkip > 0){
 109                                         countSkip--;
 110                                 }else if(word.trim().length() == 0){
 111                                         //skip
 112                                 }else{
 113                                         if (word.endsWith(":") && word.length()<=4){
 114                                                 //Higher area
 115                                                 //TODO
 116                                         }else{
 117                                                 word = word.trim();
 118                                                 if (word.contains("?")){
 119                                                         isDoubtful = true;
 120                                                         word = word.replace("?", "");
 121                                                 }
 122                                                 word = adaptWordsToTdwg(word);
 123
 124                                                 if (! "".equals(word) && ! TdwgAreaProvider.isTdwgAreaLabel(word) && ! TdwgAreaProvider.isTdwgAreaAbbreviation(word) && ! isDoubleArea(word)){
 125                                                         for (countSkip = 1; countSkip <= 6; countSkip++){
 126                                                                 word = word.trim();
 127                                                                 if (! TdwgAreaProvider.isTdwgAreaLabel(word) && ! TdwgAreaProvider.isTdwgAreaAbbreviation(word) && ! isDoubleArea(word)){
 128                                                                         if (words.length > i + countSkip){
 129                                                                                 word = word + " " + words[i + countSkip];
 130                                                                         }
 131                                                                         if (word.contains("?")){
 132                                                                                 isDoubtful = true;
 133                                                                                 word = word.replace("?", "");
 134                                                                         }
 135                                                                         word = adaptWordsToTdwg(word);
 136                                                                         if ("".equals(word)){
 137                                                                                 break;
 138                                                                         }
 139                                                                 }else{
 140                                                                         break;
 141                                                                 }
 142                                                         }
 143                                                 }
 144                                                 if ("".equals(word)){
 145                                                         //countSkip = countSkip;
 146                                                 }else if (! TdwgAreaProvider.isTdwgAreaLabel(word)  && ! TdwgAreaProvider.isTdwgAreaAbbreviation(word) &&  ! isDoubleArea(word)  ){
 147                                                         if (word.contains("?")){
 148                                                                 logger.warn("XXX");
 149                                                         }
 150                                                         countNot++;
 151                                                         System.out.println("   False:" + countNot + ": " + word);
 152                                                         unrekognizedStrings.add(word);
 153                                                         countSkip = 0;
 154                                                 }else{
 155                                                         if (word.equals("Netherlands")){
 156                                                                 if ( countSkip < 0 && words[i + 1].startsWith("Antilles")){
 157                                                                         word = "Netherlands Antilles";
 158                                                                         countSkip=2;
 159                                                                 }
 160                                                         }
 161                                                         PresenceAbsenceTerm term = PresenceAbsenceTerm.PRESENT();
 162                                                         if (isDoubleArea(word)){
 163                                                                 NamedArea[] doubleArea = getDoubleArea(word);
 164                                                                 for (NamedArea area : doubleArea){
 165                                                                         Distribution distr = Distribution.NewInstance(area, term);
 166                                                                         desc.addElement(distr);
 167                                                                 }
 168                                                         }else{
 169                                                                 NamedArea area;
 170                                                                 if (TdwgAreaProvider.isTdwgAreaLabel(word)){
 171                                                                         area = TdwgAreaProvider.getAreaByTdwgLabel(word);
 172                                                                 }else{
 173                                                                         area = TdwgAreaProvider.getAreaByTdwgAbbreviation(word);
 174                                                                 }
 175                                                                 if (isDoubtful){
 176                                                                         term = PresenceAbsenceTerm.INTRODUCED_PRESENCE_QUESTIONABLE();
 177                                                                 }
 178                                                                 Distribution distr = Distribution.NewInstance(area, term);
 179                                                                 desc.addElement(distr);
 180                                                         }
 181                                                         countYes++;
 182                                                         System.out.println("      True:" + countYes + ": " + word);
 183                                                         countSkip--;
 184                                                 }
 185                                         }
 186                                 }
 187                                 i++;
 188                         }
 189                 }
 190         }
 191
 192         private boolean isDoubleArea(String word){
 193                 if ("Canary and Madeira Is.".equalsIgnoreCase(word) ||
 194                                 "southern Europe".equalsIgnoreCase(word) ||
 195                                 "former USSR: North and Central European territory".equalsIgnoreCase(word)
 196                                 ){
 197                         return true;
 198                 }else{
 199                         return false;
 200                 }
 201         }
 202
 203         private NamedArea[] getDoubleArea(String word){
 204                 NamedArea[] result = new NamedArea[2];
 205                 if ("Canary and Madeira Is.".equalsIgnoreCase(word)){
 206                          result[0] = TdwgAreaProvider.getAreaByTdwgAbbreviation("CNY");
 207                          result[1] = TdwgAreaProvider.getAreaByTdwgAbbreviation("MDR");
 208                 }else if ("southern Europe".equalsIgnoreCase(word)){
 209                          result[0] = TdwgAreaProvider.getAreaByTdwgAbbreviation("12");
 210                          result[1] = TdwgAreaProvider.getAreaByTdwgAbbreviation("13");
 211                 }else if ("former USSR: North and Central European territory".equalsIgnoreCase(word)){
 212                          result[0] = TdwgAreaProvider.getAreaByTdwgAbbreviation("RUN-OO");
 213                          result[1] = TdwgAreaProvider.getAreaByTdwgAbbreviation("RUC-OO");
 214                 }else{
 215                         logger.warn("Double area not recognized");
 216                 }
 217                 return result;
 218         }
 219
 220
 221         static List<String> stopWords = new ArrayList<String>();
 222         static List<String> unknownAreas = new ArrayList<String>();
 223         static List<String> higherAreas = new ArrayList<String>();
 224
 225         private String adaptWordsToTdwg(String word){
 226                 word = word.replace(",", "").replace(";", "");
 227                 if (! word.contains("U.S.A")){
 228                         word = word.replace(",", "").replace(".", "").replace(";", "");
 229                 }else{
 230                         word = word.replace(",", "").replace(";", "");
 231                 }
 232
 233                 word = word.trim();
 234                 if (word.endsWith("Is")){
 235                         word = word + ".";
 236                 }
 237                 if (stopWords.size() == 0){
 238                         initStopWords();
 239                 }
 240
 241                 word = word.replace("Russia [North European territory]", "North European Russia");
 242                 word = word.replace("Russia North European territory", "North European Russia");
 243                 word = word.replace("Russia: North European territory", "North European Russia");
 244                 word = word.replace("Russia: North European territory", "North European Russia");
 245
 246                 word = word.replace("Amber", "amber");
 247
 248
 249                 word = word.replace("Prince Edward Is.", "Marion-Prince Edward Is.");
 250                 //or word = word.replace("Prince Edward Is.", "Prince Edward I.");
 251                 word = word.replace("Bahama Is.", "Bahamas");
 252                 word = word.replace("Comores Is.", "Comoros");
 253                 word = word.replace("former Yugoslavia", "Yugoslavia");
 254                 word = word.replace("former Czechoslovakia", "Czechoslovakia");
 255                 word = word.replace("Rhodesia", "Zimbabwe");
 256                 word = word.replace("The Gambia", "Gambia, The");
 257
 258                 if (!word.contains("El Salvador")){
 259                         word = word.replace("Salvador", "El Salvador");
 260                 }
 261                 word = word.replace("Vera Cruz", "Veracruz");
 262                 word = word.replace("Turkmenia", "Turkmenistan");
 263                 word = word.replace("Qu\u00E9beck", "Qu\u00E9bec");
 264                 word = word.replace("Quebeck", "Qu\u00E9bec");
 265                 word = word.replace("Quebec", "Qu\u00E9bec");
 266
 267                 if (!word.contains("Gambia, The")){
 268                         word = word.replace("Gambia", "Gambia, The");
 269                 }
 270                 word = word.replace("Mariana Is.", "Marianas");
 271                 word = word.replace("Kenia", "Kenya");
 272                 word = word.replace("Central Africa", "Central African Republic");
 273                 word = word.replace("Canal Zone", "");
 274                 //word = word.replace("Panama", "PanamÃ¡");
 275                 word = word.replace("Panama", "Panam\u00E1");
 276                 if (! word.contains("New South Wales")){
 277                         word = word.replace("Wales", "Great Britain");
 278                 }
 279                 word = word.replace("Java", "Jawa");
 280                 word = word.replace("former USSR: North European territory", "North European Russia");
 281                 word = word.replace("former USSR: South European territory", "South European Russia");
 282                 word = word.replace("former USSR: Soviet Middle Asia", "Middle Asia");
 283
 284                 word = word.replace("St Kitts-Nevis", "St.Kitts-Nevis");
 285
 286                 word = word.replace("oceanian islands", "Pacific");
 287                 word = word.replace("Ussuri region", "Primorye");
 288                 word = word.replace("Galapagos Is.", "Gal\u00E1pagos");
 289                 word = word.replace("Tarapac\u00E1", "Tarapaca");
 290                 word = word.replace("Reunion", "R\u00E9union");
 291                 if (! word.contains("Is.")){
 292                         word = word.replace("Galapagos", "Gal\u00E1pagos");
 293                 }
 294
 295                 //word = word.replace("Galapagos Is.", "GalÃ¡pagos");
 296                 if (! word.contains("Peninsular")){
 297                         word = word.replace("Malaysia", "Peninsular Malaysia");
 298                 }
 299                 word = word.replace("Polynesic Is.", "South Solomons");
 300
 301                 word = word.replace("Usbek SSR", "Uzbekistan");
 302                 word = word.replace("Mexican amber", "Mexico");
 303                 word = word.replace("Marocco", "Morocco");
 304                 if (! word.contains("Tobago")){
 305                         word = word.replace("Trinidad", "Trinidad-Tobago");
 306                 }
 307                 if (! word.contains("Trinidad")){
 308                         word = word.replace("Tobago", "Trinidad-Tobago");
 309                 }
 310                 word = word.replace("Haiti", "Haiti");
 311                 word = word.replace("Moluccas", "Maluku");
 312                 word = word.replace("Belau", "Palau");
 313                 word = word.replace("Dominican amber", "Dominican Republic");
 314                 if (! word.contains("Russian")){
 315                         word = word.replace("Far East", "Russian Far East");
 316                 }
 317                 word = word.replace("Tahiti", "Society Is.");
 318                 word = word.replace("Iraque", "Iraq");
 319                 word = word.replace("Wake Island", "Wake I.");
 320                 if (! word.contains("I.")){
 321                         word = word.replace("Johnston I", "Johnston I.");
 322                         word = word.replace("Wake I", "Wake I.");
 323                         word = word.replace("Clipperton I", "Clipperton I.");
 324                 }
 325                 if (! word.contains("Provinces")){
 326                         word = word.replace("Cape Province", "Cape Provinces");
 327                 }
 328                 word = word.replace("Eastern Cape Provinces", "Eastern Cape Province");
 329                 word = word.replace("Western Cape Provinces", "Western Cape Province");
 330                 if (! word.contains("Barbuda")){
 331                         word = word.replace("Antigua", "Antigua-Barbuda");
 332                 }
 333                 if (! word.contains("St.")){
 334                         word = word.replace("St Vincent", "St.Vincent");
 335                         word = word.replace("St Lucia", "St.Lucia");
 336                         word = word.replace("St Helena", "St.Helena");
 337                 }
 338                 word = word.replace("Asia-tropical", "Asia-Tropical");
 339                 word = word.replace("Society Islands", "Society Is.");
 340                 word = word.replace("Virgin Islands", "Virgin Is.");
 341                 word = word.replace("Canary Islands", "Canary Is.");
 342                 word = word.replace("Rhode Island", "Rhode I.");
 343
 344
 345                 word = word.replace("Rodriguez", "Rodrigues");
 346                 word = word.replace("British Colombia", "British Columbia");
 347                 word = word.replace("Bermudas", "Bermuda");
 348                 word = word.replace("Tunesia", "Tunisia");
 349                 word = word.replace("Santos S\u00E3o Paulo", "S\u00E3o Paulo");
 350                 word = word.replace("Transvaal", "Northern Provinces");
 351                 word = word.replace("Tucum\u00E1n", "Tucuman");
 352 //              if (!word.contains("Netherlands")){
 353 //
 354 //              }
 355
 356 //              unknownAreas.add("Baltic amber");
 357 //              unknownAreas.add("Arabia");
 358
 359                 for (String stopWord : stopWords){
 360                         if (stopWord.equals(word)){
 361                                 System.out.println("         STOP: " + word);
 362                                 return "";
 363                         }
 364                 }
 365                 for (String unknownArea : unknownAreas){
 366                         if (unknownArea.equals(word)){
 367                                 System.out.println("         UNKNOWN: " + word);
 368                                 return "";
 369                         }
 370                 }
 371                 for (String higherArea : higherAreas){
 372                         if (higherArea.equals(word)){
 373                                 return "";
 374                         }
 375                 }
 376
 377                 //higher regions
 378
 379                 return word;
 380         }
 381
 382         private void initStopWords(){
 383                 stopWords.add("and");
 384                 stopWords.add("Is");
 385                 stopWords.add("Is.");
 386                 stopWords.add("Islands");
 387                 stopWords.add("Island");
 388
 389                 stopWords.add("of");
 390                 stopWords.add("areas");
 391                 stopWords.add("USA");
 392                 stopWords.add("Australia"); //except for Australia only
 393                 stopWords.add("Argentina");
 394
 395                 //unknownAreas.add("Panama");
 396                 unknownAreas.add("South Africa");
 397                 unknownAreas.add("Chile");
 398
 399                 unknownAreas.add("Baltic amber");
 400                 unknownAreas.add("Arabia");
 401
 402
 403                 higherAreas.add("AF");
 404                 higherAreas.add("OR");
 405                 higherAreas.add("PA");
 406                 higherAreas.add("AU");
 407                 higherAreas.add("NE");
 408
 409                 higherAreas.add("NT");
 410         }
 411
 412
 413         /**
 414          * @param args
 415          */
 416         public static void main(String[] args) {
 417                 CdmApplicationController app = null;
 418                 DbSchemaValidation val = DbSchemaValidation.UPDATE;
 419                 app = CdmApplicationController.NewInstance(cdmDestination, val);
 420
 421                 DipteraDistributionParser dipDist = new DipteraDistributionParser();
 422                 if (app != null){
 423                         dipDist.doDistribution(app);
 424                 }else{
 425                         logger.warn("No Application Context");
 426                 }
 427         }
 428 }