Fauna Europaea Import
author a.babadshanjan <a.babadshanjan@localhost>
Wed, 1 Jul 2009 16:05:24 +0000 (16:05 +0000)
committer a.babadshanjan <a.babadshanjan@localhost>
Wed, 1 Jul 2009 16:05:24 +0000 (16:05 +0000)
cdmlib-io/src/main/java/eu/etaxonomy/cdm/io/faunaEuropaea/FaunaEuropaeaImportBase.java
cdmlib-io/src/main/java/eu/etaxonomy/cdm/io/faunaEuropaea/FaunaEuropaeaImportState.java
cdmlib-io/src/main/java/eu/etaxonomy/cdm/io/faunaEuropaea/FaunaEuropaeaNameImport.java
cdmlib-io/src/main/java/eu/etaxonomy/cdm/io/faunaEuropaea/FaunaEuropaeaRelTaxonIncludeImport.java

index 8c6bbd5f62d812743a296131943b83eac7a07c76..57ed27050556c4331bdf42e6d6142aa8af5949e6 100644 (file)
@@ -137,14 +137,16 @@ implements ICdmImport<FaunaEuropaeaImportConfigurator,FaunaEuropaeaImportState>
                                        + ", n = " + n); \r
                }\r
 \r
-               // save taxa in chunks of <=limit\r
+               // save taxa in blocks of <=limit\r
                \r
                for (int j = 1; j <= n + 1; j++)\r
                {\r
                        int offset = j - 1;\r
                        int start = offset * limit;\r
 \r
-                       if(logger.isInfoEnabled()) { logger.info("Saving taxa: " + start + " - " + (start + limit - 1)); }\r
+                       if(logger.isInfoEnabled()) { \r
+                               logger.info("Saving taxa: " + start + " - " + (start + limit - 1)); \r
+                       }\r
 \r
                        if(logger.isInfoEnabled()) { \r
                                logger.info("index = " + j \r
@@ -154,7 +156,9 @@ implements ICdmImport<FaunaEuropaeaImportConfigurator,FaunaEuropaeaImportState>
                        \r
                        if (j == n + 1) {\r
                                limit = nbrOfTaxa - n * limit;\r
-                               if(logger.isInfoEnabled()) { logger.info("n = " + n + ", limit = " + limit); }\r
+                               if(logger.isInfoEnabled()) { \r
+                                       logger.info("n = " + n + ", limit = " + limit); \r
+                               }\r
                        }\r
 \r
                TransactionStatus txStatus = startTransaction();\r
@@ -162,7 +166,7 @@ implements ICdmImport<FaunaEuropaeaImportConfigurator,FaunaEuropaeaImportState>
                        Collection<TaxonBase> taxonMapPart = taxonStore.objects(start, limit);\r
                        getTaxonService().saveTaxonAll(taxonMapPart);\r
                        taxonMapPart = null;\r
-                       //taxonStore.removeObjects(start, limit);\r
+                       taxonStore.removeObjects(start, limit);\r
                        \r
                        commitTransaction(txStatus);\r
 \r
index 3708651d442c88c16fe57795ee13f8a271502475..9358c0a9c467aa099aa08c89f9e70b2f45c71d70 100644 (file)
@@ -12,6 +12,7 @@ package eu.etaxonomy.cdm.io.faunaEuropaea;
 \r
 import java.util.HashMap;\r
 import java.util.Map;\r
+import java.util.UUID;\r
 \r
 import org.apache.log4j.Logger;\r
 \r
@@ -32,6 +33,8 @@ public class FaunaEuropaeaImportState extends ImportStateBase<FaunaEuropaeaImpor
        }\r
        \r
        private Map<Integer, FaunaEuropaeaTaxon> fauEuTaxonMap = new HashMap();\r
+       private Map<UUID, UUID> childParentMap = new HashMap();\r
+       \r
        /* Highest taxon index in the FauEu database */\r
        private int highestTaxonIndex = 0;\r
 \r
@@ -63,6 +66,20 @@ public class FaunaEuropaeaImportState extends ImportStateBase<FaunaEuropaeaImpor
        public void setFauEuTaxonMap(Map<Integer, FaunaEuropaeaTaxon> fauEuTaxonMap) {\r
                this.fauEuTaxonMap = fauEuTaxonMap;\r
        }\r
+\r
+       /**\r
+        * Returns the map that relates child taxon UUIDs (keys) to parent taxon UUIDs (values).\r
+        * The internal map instance is returned, not a copy.\r
+        *\r
+        * @return the childParentMap\r
+        */\r
+       public Map<UUID, UUID> getChildParentMap() {\r
+               return childParentMap;\r
+       }\r
+\r
+       /**\r
+        * Replaces the map that relates child taxon UUIDs (keys) to parent taxon UUIDs (values).\r
+        * The given map instance is stored as is, not copied.\r
+        *\r
+        * @param childParentMap the childParentMap to set\r
+        */\r
+       public void setChildParentMap(Map<UUID, UUID> childParentMap) {\r
+               this.childParentMap = childParentMap;\r
+       }\r
        \r
 //     /* (non-Javadoc)\r
 //      * @see eu.etaxonomy.cdm.io.common.IoStateBase#initialize(eu.etaxonomy.cdm.io.common.IoConfiguratorBase)\r
index 5740513d6708164c3b2cee8adc19010852b47a1a..807ced4d43bd2ce4e508e26db1f4af0215758496 100644 (file)
@@ -78,7 +78,7 @@ public class FaunaEuropaeaNameImport extends FaunaEuropaeaImportBase  {
        private static final Logger logger = Logger.getLogger(FaunaEuropaeaNameImport.class);\r
 \r
        /* Max number of taxa to retrieve (for test purposes) */\r
-       private int maxTaxa = 5000;\r
+       private int maxTaxa = 20000;\r
        /* Max number of taxa to be saved in CDM DB with one service call */\r
        private int limit = 1000; // TODO: Make configurable\r
        /* Max number of taxa to be retrieved from CDM DB with one service call */\r
@@ -155,8 +155,8 @@ public class FaunaEuropaeaNameImport extends FaunaEuropaeaImportBase  {
                \r
                Map<String, MapWrapper<? extends CdmBase>> stores = state.getStores();\r
                Map<Integer, FaunaEuropaeaTaxon> fauEuTaxonMap = state.getFauEuTaxonMap();\r
-               int highestTaxonIndex = state.getHighestTaxonIndex();\r
-               FaunaEuropaeaImportConfigurator fauEuConfig = state.getConfig();\r
+//             int highestTaxonIndex = state.getHighestTaxonIndex();\r
+//             FaunaEuropaeaImportConfigurator fauEuConfig = state.getConfig();\r
                boolean success = true;\r
                \r
                if(logger.isInfoEnabled()) { logger.info("Start making taxa..."); }\r
@@ -165,7 +165,7 @@ public class FaunaEuropaeaNameImport extends FaunaEuropaeaImportBase  {
                \r
                success = retrieveTaxa2TaxonStore(state, fauEuTaxonMap, Q_NO_RESTRICTION);\r
                success = processTaxaSecondPass(state, fauEuTaxonMap);\r
-               success = saveTaxa(stores, highestTaxonIndex, limit);\r
+               success = saveTaxa(stores, state.getHighestTaxonIndex(), limit);\r
                \r
                commitTransaction(txStatus);\r
                \r
@@ -245,7 +245,7 @@ public class FaunaEuropaeaNameImport extends FaunaEuropaeaImportBase  {
                                String autName = rs.getString("aut_name");\r
                                Rank rank = null;\r
                                UUID taxonBaseUuid = null;\r
-                               if (resultSetHasColumn(rs,"UUID")){\r
+                               if (resultSetHasColumn(rs, "UUID")){\r
                                        taxonBaseUuid = UUID.fromString(rs.getString("UUID"));\r
                                } else {\r
                                        taxonBaseUuid = UUID.randomUUID();\r
index 36394f98c9844c0de840480baf60456505610933..b56b16c1260bff4b719a5b17df6b61d822b80442 100644 (file)
@@ -23,6 +23,8 @@ import java.sql.ResultSet;
 import java.sql.SQLException;\r
 import java.util.Collection;\r
 import java.util.HashMap;\r
+import java.util.HashSet;\r
+import java.util.Iterator;\r
 import java.util.List;\r
 import java.util.Map;\r
 import java.util.Set;\r
@@ -79,7 +81,7 @@ public class FaunaEuropaeaRelTaxonIncludeImport extends FaunaEuropaeaImportBase
        /* Max number of taxa to retrieve (for test purposes) */\r
        private int maxTaxa = 0;\r
        /* Max number of taxa to be saved in CDM DB with one service call */\r
-       private int limit = 2000; // TODO: Make configurable\r
+       private int limit = 5000; // TODO: Make configurable\r
        /* Max number of taxa to be retrieved from CDM DB with one service call */\r
        private int limitRetrieve = 10000; // TODO: Make configurable\r
        /* Interval for progress info message when retrieving taxa */\r
@@ -127,7 +129,7 @@ public class FaunaEuropaeaRelTaxonIncludeImport extends FaunaEuropaeaImportBase
        /* (non-Javadoc)\r
         * @see eu.etaxonomy.cdm.io.common.CdmIoBase#doInvoke(eu.etaxonomy.cdm.io.common.IImportConfigurator, eu.etaxonomy.cdm.api.application.CdmApplicationController, java.util.Map)\r
         */\r
-       protected boolean doInvoke(FaunaEuropaeaImportState state) {                            \r
+       protected boolean doInvokeAlter(FaunaEuropaeaImportState state) {                               \r
                \r
                boolean success = true;\r
                \r
@@ -145,30 +147,83 @@ public class FaunaEuropaeaRelTaxonIncludeImport extends FaunaEuropaeaImportBase
        }\r
 \r
        \r
-       protected boolean doInvokeAlter(FaunaEuropaeaImportState state) {                               \r
+       /**\r
+        * Retrieves the child-parent UUID pairs and creates the parent-child\r
+        * relationships from them, both inside one enclosing transaction.\r
+        *\r
+        * @param state the import state\r
+        * @return <code>true</code> only if both steps reported success\r
+        */\r
+       protected boolean doInvoke(FaunaEuropaeaImportState state) {                            \r
                 \r
-               Map<String, MapWrapper<? extends CdmBase>> stores = state.getStores();\r
-               MapWrapper<TaxonBase<?>> taxonStore = (MapWrapper<TaxonBase<?>>)stores.get(ICdmIO.TAXON_STORE);\r
-//             MapWrapper<TaxonNameBase<?,?>> taxonNamesStore = (MapWrapper<TaxonNameBase<?,?>>)stores.get(ICdmIO.TAXONNAME_STORE);\r
-               MapWrapper<TeamOrPersonBase> authorStore = (MapWrapper<TeamOrPersonBase>)stores.get(ICdmIO.TEAM_STORE);\r
-//             authorStore = null;\r
-//             Map<Integer, FaunaEuropaeaTaxon> fauEuTaxonMap = new HashMap();\r
-               FaunaEuropaeaImportConfigurator fauEuConfig = state.getConfig();\r
                 boolean success = true;\r
                 \r
                 if(logger.isInfoEnabled()) { logger.info("Start making taxa..."); }\r
                 \r
-               success = retrieveTaxa(state, fauEuTaxonMap, Q_NO_RESTRICTION);\r
-//             success = processTaxaSecondPass(state, fauEuTaxonMap);\r
-               success = saveTaxa(stores, highestTaxonIndex, limit);\r
-//             success = saveTaxa(stores);\r
+               TransactionStatus txStatus = startTransaction();\r
+\r
+               // Both steps always run; a failed UUID retrieval must not be masked\r
+               // by the result of the relationship step.\r
+               success = retrieveUuids(state);\r
+               success = createRelationships(state) && success;\r
                 \r
+               commitTransaction(txStatus);\r
+\r
                 logger.info("End making taxa...");\r
                 return success;\r
         }\r
 \r
        \r
-       /** Retrieve tax from FauEu DB and build FauEuTaxonMap only */\r
+       /**\r
+        * Reads child/parent taxon UUID pairs from the source database of the\r
+        * import configuration and collects them in the state's child-parent map.\r
+        * Only taxa whose TAX_VALID value is non-zero are read. The first parent\r
+        * found for a child is kept; further rows for the same child are logged\r
+        * at debug level and ignored. Rows with a missing UUID are logged,\r
+        * skipped and reported through the return value.\r
+        *\r
+        * @param state import state providing the configuration and the child-parent map\r
+        * @return <code>false</code> if an SQLException occurred or a row had to be\r
+        *         skipped because of a missing UUID, <code>true</code> otherwise\r
+        */\r
+       private boolean retrieveUuids(FaunaEuropaeaImportState state) {\r
+\r
+               Map<UUID, UUID> childParentMap = state.getChildParentMap();\r
+               Source source = state.getConfig().getSource();\r
+               int i = 0;\r
+               boolean success = true;\r
+\r
+               try {\r
+\r
+                       String strQuery = \r
+                               " SELECT dbo.Taxon.UUID AS ChildUuid, Parent.UUID AS ParentUuid " +\r
+                               " FROM dbo.Taxon INNER JOIN dbo.Taxon AS Parent " +\r
+                               " ON dbo.Taxon.TAX_TAX_IDPARENT = Parent.TAX_ID " +\r
+                               " WHERE (dbo.Taxon.TAX_VALID <> 0) ";\r
+\r
+                       if (logger.isDebugEnabled()) {\r
+                               logger.debug("Query: " + strQuery);\r
+                       }\r
+\r
+                       ResultSet rs = source.getResultSet(strQuery);\r
+                       \r
+                       while (rs.next()) {\r
+                               \r
+                               // progress message every modCount rows, but not for the first row\r
+                               if ((i++ % modCount) == 0 && i != 1 ) { \r
+                                       if(logger.isInfoEnabled()) {\r
+                                               logger.info("Taxa retrieved: " + (i-1)); \r
+                                       }\r
+                               }\r
+\r
+                               String childUuidStr = rs.getString("ChildUuid");\r
+                               String parentUuidStr = rs.getString("ParentUuid");\r
+\r
+                               // UUID.fromString() throws a NullPointerException for null input,\r
+                               // which the SQLException handler below would not catch.\r
+                               if (childUuidStr == null || parentUuidStr == null) {\r
+                                       logger.warn("Missing UUID in child-parent pair (child = " + childUuidStr\r
+                                                       + ", parent = " + parentUuidStr + "), row skipped");\r
+                                       success = false;\r
+                                       continue;\r
+                               }\r
+\r
+                               UUID childUuid = UUID.fromString(childUuidStr);\r
+                               UUID parentUuid = UUID.fromString(parentUuidStr);\r
+                               \r
+                               if (!childParentMap.containsKey(childUuid)) {\r
+\r
+                                               childParentMap.put(childUuid, parentUuid);\r
+\r
+                               } else {\r
+                                       if(logger.isDebugEnabled()) {\r
+                                               logger.debug("Duplicated child UUID (" + childUuid + ")");\r
+                                       }\r
+                               }\r
+                       }\r
+\r
+               } catch (SQLException e) {\r
+                       // pass the exception as well so that the stack trace is logged\r
+                       logger.error("SQLException:" +  e, e);\r
+                       success = false;\r
+               }\r
+               return success;         \r
+       }\r
+\r
+       \r
+       /** Retrieve taxa from FauEu DB and build FauEuTaxonMap only */\r
        private boolean retrieveTaxa(FaunaEuropaeaImportState state,\r
                        Map<Integer, FaunaEuropaeaTaxon> fauEuTaxonMap, int valid) {\r
 \r
@@ -380,7 +435,179 @@ public class FaunaEuropaeaRelTaxonIncludeImport extends FaunaEuropaeaImportBase
        }\r
 \r
        \r
-       /** Creates relationships if taxon bases are retrieved in chunks from CDM DB */\r
+       /**\r
+        * Moves up to <code>border</code> entries out of <code>map</code> into a new map.\r
+        * The moved entries are removed from <code>map</code>, so repeated calls\r
+        * consume it block by block.\r
+        *\r
+        * @param border maximum number of entries to move\r
+        * @param map the map to take entries from; it is modified by this call\r
+        * @return a new map holding the moved entries; it holds fewer than\r
+        *         <code>border</code> entries if <code>map</code> runs out first\r
+        */\r
+       public Map<UUID, UUID> partMap(int border, Map<UUID, UUID> map) {\r
+\r
+               if (logger.isInfoEnabled()) {\r
+                       logger.info("Map size: " + map.size());\r
+               }\r
+               Iterator<Map.Entry<UUID, UUID>> entryIter = map.entrySet().iterator();\r
+               Map<UUID, UUID> partMap = new HashMap<UUID, UUID>();\r
+\r
+               // Stop when the source map is exhausted; without the hasNext() check\r
+               // next() throws NoSuchElementException if border exceeds the map size.\r
+               for (int i = 0; i < border && entryIter.hasNext(); i++) {\r
+\r
+                       Map.Entry<UUID, UUID> mapEntry = entryIter.next();\r
+                       partMap.put(mapEntry.getKey(), mapEntry.getValue());\r
+                       entryIter.remove();\r
+               }\r
+               \r
+               if (logger.isDebugEnabled()) {\r
+                       logger.debug("Map size: " + map.size());\r
+               }\r
+               return partMap;\r
+       }               \r
+\r
+//     public Map<UUID, UUID> childParentMap partMap(int start, int limit, Map<UUID, UUID> childParentMap) {\r
+//             \r
+//             int index = 0;\r
+//             \r
+//             for (int i = 0; i < limit; i++) {\r
+//                     \r
+//                     int j = start + i;\r
+//                     \r
+//                     Object object = childParentMap.get(j);\r
+//                     if(object != null) {\r
+//                             childParentMap.put(index, childParentMap.get(j));\r
+//                             index++;\r
+//                     } else {\r
+//                             if (logger.isDebugEnabled()) { logger.debug("Object (" + j + ") is null"); }\r
+//                     }\r
+//             }\r
+//             return (Map<UUID, UUID> childParentMap)internalPartMap.values();\r
+//     }\r
+\r
+       \r
+       /**\r
+        * Creates parent-child relationships for all pairs in the state's child-parent map.\r
+        * The map is consumed in blocks of at most <code>limit</code> pairs; each block is\r
+        * processed and saved in its own transaction. Parent and child are looked up\r
+        * through the taxon service by UUID. The taxon store is emptied first.\r
+        *\r
+        * @param state import state providing the stores, the child-parent map and the configuration\r
+        * @return always <code>true</code>; failures for single pairs are only logged\r
+        */\r
+       private boolean createRelationships(FaunaEuropaeaImportState state) {\r
+\r
+               Map<String, MapWrapper<? extends CdmBase>> stores = state.getStores();\r
+               MapWrapper<TaxonBase> taxonStore = (MapWrapper<TaxonBase>)stores.get(ICdmIO.TAXON_STORE);\r
+               taxonStore.makeEmpty();\r
+               Map<UUID, UUID> childParentMap = state.getChildParentMap();\r
+               ReferenceBase<?> sourceRef = state.getConfig().getSourceReference();\r
+\r
+               int upperBorder = childParentMap.size();\r
+               int nbrOfBlocks = 0;\r
+               // Work on a local copy: the size of the last block must not overwrite\r
+               // the block size configured in the field.\r
+               int blockSize = limit;\r
+\r
+               boolean success = true;\r
+\r
+               if (upperBorder < blockSize) {             // TODO: test with critical values\r
+                       blockSize = upperBorder;\r
+               } else {\r
+                       nbrOfBlocks = upperBorder / blockSize;\r
+               }\r
+\r
+               if(logger.isInfoEnabled()) { \r
+                       logger.info("number of child-parent pairs = " + upperBorder \r
+                                       + ", limit = " + blockSize\r
+                                       + ", number of blocks = " + nbrOfBlocks); \r
+               }\r
+\r
+               for (int j = 1; j <= nbrOfBlocks + 1; j++) {\r
+                       int offset = j - 1;\r
+                       int start = offset * blockSize;\r
+\r
+                       if(logger.isInfoEnabled()) { logger.info("Processing child-parent pairs: " + start + " - " + (start + blockSize - 1)); }\r
+\r
+                       if(logger.isInfoEnabled()) { \r
+                               logger.info("index = " + j \r
+                                               + ", offset = " + offset\r
+                                               + ", start = " + start); \r
+                       }\r
+\r
+                       // the last block takes whatever is left after the full blocks\r
+                       if (j == nbrOfBlocks + 1) {\r
+                               blockSize = upperBorder - nbrOfBlocks * blockSize;\r
+                               if(logger.isInfoEnabled()) { logger.info("number of blocks = " + nbrOfBlocks + " limit = " + blockSize); }\r
+                       }\r
+\r
+                       TransactionStatus txStatus = startTransaction();\r
+\r
+                       // partMap() removes the returned pairs from childParentMap\r
+                       Map<UUID, UUID> childParentPartMap = partMap(blockSize, childParentMap);\r
+                       Set<TaxonBase> childSet = new HashSet<TaxonBase>(blockSize);\r
+                       \r
+                       if (logger.isInfoEnabled()) {\r
+                               logger.info("Partmap size: " + childParentPartMap.size());\r
+                       }\r
+\r
+                       for (UUID childUuid : childParentPartMap.keySet()) {\r
+\r
+                               UUID parentUuid = childParentPartMap.get(childUuid);\r
+\r
+                               try {\r
+                                       TaxonBase<?> parent = getTaxonService().findByUuid(parentUuid);\r
+                                       if (logger.isTraceEnabled()) {\r
+                                               logger.trace("Parent find called (" + parentUuid + ")");\r
+                                       }\r
+                                       TaxonBase<?> child = getTaxonService().findByUuid(childUuid);\r
+                                       if (logger.isTraceEnabled()) {\r
+                                               logger.trace("Child find called (" + childUuid + ")");\r
+                                       }\r
+                                       // A null parent makes these calls throw; that case ends up\r
+                                       // in the catch block below.\r
+                                       Taxon parentTaxon = parent.deproxy(parent, Taxon.class);\r
+                                       Taxon childTaxon = parent.deproxy(child, Taxon.class);\r
+\r
+                                       if (childTaxon != null && parentTaxon != null) {\r
+                                               \r
+                                               makeTaxonomicallyIncluded(state, parentTaxon, childTaxon, sourceRef, null);\r
+                                               \r
+                                               if (logger.isDebugEnabled()) {\r
+                                                       logger.debug("Parent-child (" + parentUuid + "-" + childUuid + \r
+                                                       ") relationship created");\r
+                                               }\r
+                                               if (!childSet.contains(childTaxon)) {\r
+                                                       \r
+                                                       childSet.add(childTaxon);\r
+                                                       \r
+                                                       if (logger.isTraceEnabled()) {\r
+                                                               logger.trace("Child taxon (" + childUuid + ") added to Set");\r
+                                                       }\r
+                                                       \r
+                                               } else {\r
+                                                       if (logger.isDebugEnabled()) {\r
+                                                               logger.debug("Duplicated child taxon (" + childUuid + ")");\r
+                                                       }\r
+                                               }\r
+                                       } else {\r
+                                               if (logger.isDebugEnabled()) {\r
+                                                       logger.debug("Parent(" + parentUuid + ") or child (" + childUuid + " is null");\r
+                                               }\r
+                                       }\r
+                                       \r
+                               } catch (Exception e) {\r
+                                       // pass the exception as well so that the cause is not lost\r
+                                       logger.error("Error creating taxonomically included relationship parent-child (" + \r
+                                                       parentUuid + "-" + childUuid + ")", e);\r
+                               }\r
+\r
+                       }\r
+                       getTaxonService().saveTaxonAll(childSet);\r
+                       commitTransaction(txStatus);\r
+               }\r
+               return success;\r
+       }\r
+                       \r
+                       \r
+                       \r
+       /** Creates parent-child relationships.\r
+        * Taxon bases are retrieved in blocks from CDM DB.\r
+        * Parent is retrieved from CDM DB via original source id if not found in current block.\r
+        * In case of blocksize = 20.000 this takes ca. 1-2 hours per block.\r
+        *  */\r
        private boolean createRelationships(FaunaEuropaeaTaxon fauEuTaxon,\r
                        TaxonBase<?> taxonBase, TaxonNameBase<?,?> taxonName, List<Taxon> taxa,\r
                        Map<Integer, FaunaEuropaeaTaxon> fauEuTaxonMap, FaunaEuropaeaImportState state) {\r
@@ -488,6 +715,18 @@ public class FaunaEuropaeaRelTaxonIncludeImport extends FaunaEuropaeaImportBase
        }\r
 \r
        \r
+//     public int calculateBlockSize(int limit, int upperBorder) {\r
+//\r
+//             int blockSize = 0;\r
+//             \r
+//             if (upperBorder < limit) {\r
+//                     limit = upperBorder;\r
+//             } else {\r
+//                     blockSize = upperBorder / limit;\r
+//             }\r
+//     }\r
+       \r
+       \r
        private boolean processTaxaFromDatabase(FaunaEuropaeaImportState state,\r
                        Map<Integer, FaunaEuropaeaTaxon> fauEuTaxonMap) {\r
 \r