merge trunk into cdm3.3 branch
[cdmlib.git] / cdmlib-services / src / main / java / eu / etaxonomy / cdm / api / service / search / CdmMassIndexer.java
index 3f00b5b6b7541f851f6830b813afbdb38d9d0715..e461690f88780ed97f7291c511e456b63418f785 100644 (file)
@@ -12,8 +12,10 @@ package eu.etaxonomy.cdm.api.service.search;
 import java.io.IOException;
 import java.lang.reflect.Field;
 import java.util.ArrayList;
+import java.util.HashSet;
 import java.util.Iterator;
 import java.util.List;
+import java.util.Set;
 
 import org.apache.log4j.Logger;
 import org.apache.lucene.index.IndexReader;
@@ -24,6 +26,7 @@ import org.apache.lucene.search.spell.SpellChecker;
 import org.apache.lucene.store.Directory;
 import org.hibernate.CacheMode;
 import org.hibernate.FlushMode;
+import org.hibernate.ObjectNotFoundException;
 import org.hibernate.ScrollMode;
 import org.hibernate.ScrollableResults;
 import org.hibernate.Session;
@@ -60,9 +63,15 @@ import eu.etaxonomy.cdm.model.taxon.TaxonBase;
 @Transactional
 public class CdmMassIndexer implements ICdmMassIndexer {
 
+       private Set<Class<? extends CdmBase>> indexedClasses = new HashSet<Class<? extends CdmBase>>();
     public static final Logger logger = Logger.getLogger(CdmMassIndexer.class);
 
-    private static final int BATCH_SIZE = 100;
+    /*
+     *      !!! DO NOT CHANGE THIS !!!
+     *
+     * batch_size optimized for 200MB of heap memory
+     */
+    private static final int BATCH_SIZE = 200;
 
     public HibernateTransactionManager transactionManager;
 
@@ -89,7 +98,7 @@ public class CdmMassIndexer implements ICdmMassIndexer {
         Long countResult = countEntities(type);
         int numOfBatches = calculateNumOfBatches(countResult);
 
-        SubProgressMonitor subMonitor = new SubProgressMonitor(monitor, numOfBatches);
+        SubProgressMonitor subMonitor = new SubProgressMonitor(monitor, 1);
         subMonitor.beginTask("Indexing " + type.getSimpleName(), numOfBatches);
 
         // Scrollable results will avoid loading too many objects in memory
@@ -103,12 +112,17 @@ public class CdmMassIndexer implements ICdmMassIndexer {
                 fullTextSession.index(results.get(0)); // index each element
                 if (index % BATCH_SIZE == 0 || index == countResult) {
                     batchesWorked++;
-                    fullTextSession.flushToIndexes(); // apply changes to indexes
-                    fullTextSession.clear(); // clear since the queue is processed
-                    //                calculateNumOfBatches(index == countResult ? countResult : index);
-                    logger.info("\tbatch " + batchesWorked + "/" + numOfBatches + " processed");
-                    subMonitor.internalWorked(1);
-                    //if(index / BATCH_SIZE > 10 ) break;
+                    try {
+                        fullTextSession.flushToIndexes(); // apply changes to indexes
+                    } catch(ObjectNotFoundException e){
+                        // TODO report this issue to progress monitor once it can report on errors
+                        logger.error("possibly invalid data, thus skipping this batch and continuing with next one", e);
+                    } finally {
+                        fullTextSession.clear(); // clear since the queue is processed
+                        getSession().clear(); // clear session to free memory
+                        subMonitor.worked(1);
+                        logger.info("\tbatch " + batchesWorked + "/" + numOfBatches + " processed");
+                    }
                 }
             }
         } catch (RuntimeException e) {
@@ -122,68 +136,68 @@ public class CdmMassIndexer implements ICdmMassIndexer {
     }
 
     /**
-     * 
-     * 
+     *
+     *
      * @param type
      * @param monitor
      */
-    protected <T extends CdmBase> void createDictionary(Class<T> type, IProgressMonitor monitor)  {        
+    protected <T extends CdmBase> void createDictionary(Class<T> type, IProgressMonitor monitor)  {
         String indexName = null;
         if(type.isAnnotationPresent(org.hibernate.search.annotations.Indexed.class)) {
-               indexName = type.getAnnotation(org.hibernate.search.annotations.Indexed.class).index();
+            indexName = type.getAnnotation(org.hibernate.search.annotations.Indexed.class).index();
         } else {
-               //TODO:give some indication that this class is infact not indexed
-               return;
+            //TODO: give some indication that this class is in fact not indexed
+            return;
         }
         SearchFactoryImplementor searchFactory = (SearchFactoryImplementor)Search.getFullTextSession(getSession()).getSearchFactory();
         IndexManager indexManager = searchFactory.getAllIndexesManager().getIndexManager(indexName);
         IndexReader indexReader = searchFactory.getIndexReaderAccessor().open(type);
-       List<String> idFields = getIndexedDeclaredFields(type);
-               
+        List<String> idFields = getIndexedDeclaredFields(type);
+
         monitor.subTask("creating dictionary " + type.getSimpleName());
-               
+
         SubProgressMonitor subMonitor = new SubProgressMonitor(monitor, 1);
         subMonitor.beginTask("Creating dictionary " + type.getSimpleName(), 1);
-        
+
         Directory directory = ((DirectoryBasedIndexManager) indexManager).getDirectoryProvider().getDirectory();
         SpellChecker spellChecker = null;
-       try {                   
-               spellChecker = new SpellChecker(directory);
-               Iterator<String> itr = idFields.iterator();
-               while(itr.hasNext()) {
-                       String indexedField = itr.next();
-                       logger.info("creating dictionary for field " + indexedField);                           
-                       Dictionary dictionary = new LuceneDictionary(indexReader, indexedField);
-                       IndexWriterConfig iwc = new IndexWriterConfig(Configuration.luceneVersion, searchFactory.getAnalyzer(type));
-                       spellChecker.indexDictionary(dictionary, iwc, true);                            
-               }
-               subMonitor.internalWorked(1);
-       } catch (IOException e) {
-               logger.error("IOException when creating dictionary", e);
-               //TODO better means to notify that the process has been stopped, using the STOPPED_WORK_INDICATOR is only a hack
+        try {
+            spellChecker = new SpellChecker(directory);
+            Iterator<String> itr = idFields.iterator();
+            while(itr.hasNext()) {
+                String indexedField = itr.next();
+                logger.info("creating dictionary for field " + indexedField);
+                Dictionary dictionary = new LuceneDictionary(indexReader, indexedField);
+                IndexWriterConfig iwc = new IndexWriterConfig(Configuration.luceneVersion, searchFactory.getAnalyzer(type));
+                spellChecker.indexDictionary(dictionary, iwc, true);
+            }
+            subMonitor.internalWorked(1);
+        } catch (IOException e) {
+            logger.error("IOException when creating dictionary", e);
+            //TODO better means to notify that the process has been stopped, using the STOPPED_WORK_INDICATOR is only a hack
             monitor.worked(RestServiceProgressMonitor.STOPPED_WORK_INDICATOR);
             monitor.done();
-       } catch (RuntimeException e) {
-               logger.error("RuntimeException when creating dictionary", e);
-               //TODO better means to notify that the process has been stopped, using the STOPPED_WORK_INDICATOR is only a hack
+        } catch (RuntimeException e) {
+            logger.error("RuntimeException when creating dictionary", e);
+            //TODO better means to notify that the process has been stopped, using the STOPPED_WORK_INDICATOR is only a hack
             monitor.worked(RestServiceProgressMonitor.STOPPED_WORK_INDICATOR);
             monitor.done();
-       } finally {
-               searchFactory.getIndexReaderAccessor().close(indexReader);      
-       }
-       if (spellChecker != null) {
-               try {
-                       logger.info("closing spellchecker ");
-                       spellChecker.close();
-               } catch (IOException e) {
-                       logger.error("IOException when closing spellchecker", e);
-               }
-       }
-       
-       logger.info("end creating dictionary " + type.getName());
-       subMonitor.done();
+        } finally {
+            searchFactory.getIndexReaderAccessor().close(indexReader);
+        }
+        if (spellChecker != null) {
+            try {
+                logger.info("closing spellchecker ");
+                spellChecker.close();
+            } catch (IOException e) {
+                logger.error("IOException when closing spellchecker", e);
+            }
+        }
+
+        logger.info("end creating dictionary " + type.getName());
+        subMonitor.done();
     }
-    
+
     /**
      * @param countResult
      * @return
@@ -209,29 +223,29 @@ public class CdmMassIndexer implements ICdmMassIndexer {
         logger.info("purging " + type.getName());
         fullTextSession.purgeAll(type);
 
-        
-        SearchFactoryImplementor searchFactory = (SearchFactoryImplementor)Search.getFullTextSession(getSession()).getSearchFactory();
-        IndexManager indexManager = searchFactory.getAllIndexesManager().getIndexManager(type.getName());        
+
+        SearchFactoryImplementor searchFactory = (SearchFactoryImplementor)fullTextSession.getSearchFactory();
+        IndexManager indexManager = searchFactory.getAllIndexesManager().getIndexManager(type.getName());
         Directory directory = ((DirectoryBasedIndexManager) indexManager).getDirectoryProvider().getDirectory();
         SpellChecker spellChecker = null;
-       try {                   
-               spellChecker = new SpellChecker(directory);
-               spellChecker.clearIndex();
-       } catch (IOException e) {
-               logger.error("IOException when creating dictionary", e);
-               //TODO better means to notify that the process has been stopped, using the STOPPED_WORK_INDICATOR is only a hack
-            //monitor.worked(RestServiceProgressMonitor.STOPPED_WORK_INDICATOR);
-            //monitor.done();
-       }
-       
-       if (spellChecker != null) {
-               try {
-                       logger.info("closing spellchecker ");
-                       spellChecker.close();
-               } catch (IOException e) {
-                       logger.error("IOException when closing spellchecker", e);
-               }
-       }
+        try {
+            spellChecker = new SpellChecker(directory);
+            spellChecker.clearIndex();
+        } catch (IOException e) {
+            logger.error("IOException when creating dictionary", e);
+            //TODO better means to notify that the process has been stopped, using the STOPPED_WORK_INDICATOR is only a hack
+            monitor.worked(RestServiceProgressMonitor.STOPPED_WORK_INDICATOR);
+            monitor.done();
+        }
+
+        if (spellChecker != null) {
+            try {
+                logger.info("closing spellchecker ");
+                spellChecker.close();
+            } catch (IOException e) {
+                logger.error("IOException when closing spellchecker", e);
+            }
+        }
     }
 
 
@@ -246,18 +260,27 @@ public class CdmMassIndexer implements ICdmMassIndexer {
         }
 
         monitor.setTaskName("CdmMassIndexer");
-        int steps = totalBatchCount() + 1; // +1 for optimize
-        monitor.beginTask("Reindexing " + indexedClasses().length + " classes", steps);
+        int steps = indexedClasses().size() + 1; // +1 for optimize
+        monitor.beginTask("Reindexing " + indexedClasses().size() + " classes", steps);
 
-        for(Class type : indexedClasses()){
+        for(Class<? extends CdmBase> type : indexedClasses()){
             reindex(type, monitor);
         }
-        optimize(monitor);
+
+        monitor.subTask("Optimizing Index");
+        SubProgressMonitor subMonitor = new SubProgressMonitor(monitor, 1);
+        subMonitor.beginTask("Optimizing Index",1);
+        optimize();        
+        subMonitor.worked(1);
+        logger.info("end index optimization");
+        subMonitor.done();
+        
+        //monitor.worked(1);
         monitor.done();
     }
 
-       @Override
-       public void createDictionary(IProgressMonitor monitor) {
+    @Override
+    public void createDictionary(IProgressMonitor monitor) {
         if(monitor == null){
             monitor = new NullProgressMonitor();
         }
@@ -267,24 +290,18 @@ public class CdmMassIndexer implements ICdmMassIndexer {
         monitor.beginTask("Creating Dictionary " + dictionaryClasses().length + " classes", steps);
 
         for(Class type : dictionaryClasses()){
-               createDictionary(type, monitor);
+            createDictionary(type, monitor);
         }
-      
+
         monitor.done();
-               
-       }
-    protected void optimize(IProgressMonitor monitor) {
 
-        monitor.subTask("optimizing");
-        SubProgressMonitor subMonitor = new SubProgressMonitor(monitor, 1);
-        subMonitor.beginTask("optimizing", 1);
+    }
+    protected void optimize() {
 
         FullTextSession fullTextSession = Search.getFullTextSession(getSession());
         fullTextSession.getSearchFactory().optimize();
         fullTextSession.flushToIndexes();
         fullTextSession.clear();
-
-        subMonitor.done();
     }
 
     /**
@@ -309,72 +326,86 @@ public class CdmMassIndexer implements ICdmMassIndexer {
         }
 
         monitor.setTaskName("CdmMassIndexer");
-        int steps = indexedClasses().length; // +1 for optimize
-        monitor.beginTask("Purging " + indexedClasses().length + " classes", steps);
+        int steps = indexedClasses().size() + 1; // +1 for optimize
+        monitor.beginTask("Purging " + indexedClasses().size() + " classes", steps);
 
-        for(Class type : indexedClasses()){
+        for(Class<? extends CdmBase> type : indexedClasses()){
             purge(type, monitor);
             monitor.worked(1);
         }
+        // need to flush to the index before optimizing
+        // the purge method is not doing the flushing by itself
+        FullTextSession fullTextSession = Search.getFullTextSession(getSession());
+        fullTextSession.flushToIndexes();
 
-//        // need to commit and start new transaction before optimizing
-//        FullTextSession fullTextSession = Search.getFullTextSession(getSession());
-//        Transaction tx = fullTextSession.getTransaction();
-//        tx.commit();
-//        fullTextSession.beginTransaction(); // will be committed automatically at the end of this method since this class is transactional
-
-//        optimize(monitor);
+        // optimize
+        optimize();
+        monitor.worked(1);
 
+        // done
         monitor.done();
     }
 
-    
+
     /**
      * Returns a list of declared indexable fields within a class through reflection.
-     * 
+     *
      * @param clazz
      * @return
      */
     private List<String> getIndexedDeclaredFields(Class clazz) {
-       List<String> idFields = new ArrayList<String>();
-       if(clazz.isAnnotationPresent(org.hibernate.search.annotations.Indexed.class)) {
-               Field[] declaredFields = clazz.getDeclaredFields();             
-               for(int i=0;i<declaredFields.length;i++ ) {
-                       logger.info("checking field " + declaredFields[i].getName());
-                       if(declaredFields[i].isAnnotationPresent(org.hibernate.search.annotations.Field.class) ||
-                                       declaredFields[i].isAnnotationPresent(org.hibernate.search.annotations.Fields.class)) {
-                               idFields.add(declaredFields[i].getName());
-                               logger.info("adding field " + declaredFields[i].getName());
-                       }
-               }
-       }       
-       return idFields;
+        List<String> idFields = new ArrayList<String>();
+        if(clazz.isAnnotationPresent(org.hibernate.search.annotations.Indexed.class)) {
+            Field[] declaredFields = clazz.getDeclaredFields();
+            for(int i=0;i<declaredFields.length;i++ ) {
+                logger.info("checking field " + declaredFields[i].getName());
+                if(declaredFields[i].isAnnotationPresent(org.hibernate.search.annotations.Field.class) ||
+                        declaredFields[i].isAnnotationPresent(org.hibernate.search.annotations.Fields.class)) {
+                    idFields.add(declaredFields[i].getName());
+                    logger.info("adding field " + declaredFields[i].getName());
+                }
+            }
+        }
+        return idFields;
     }
     /**
      * @return
      */
+    @SuppressWarnings("unchecked")
     @Override
-    public Class[] indexedClasses() {
-        return new Class[] {
-                DescriptionElementBase.class,
-                Classification.class,
-                TaxonBase.class,
-                TaxonNameBase.class,
-                SpecimenOrObservationBase.class
-                };
+    public Set<Class<? extends CdmBase>> indexedClasses() {
+       // if no indexed classes have been 'manually' set then
+       // the default is the full list
+       if(indexedClasses.size() == 0) {
+               indexedClasses.add(DescriptionElementBase.class);
+               indexedClasses.add(TaxonBase.class);                    
+               indexedClasses.add(Classification.class);
+               indexedClasses.add(TaxonNameBase.class);
+               indexedClasses.add(SpecimenOrObservationBase.class);
+       }
+        return indexedClasses;
     }
-    
+
     /**
      * @return
      */
     @Override
     public Class[] dictionaryClasses() {
-        return new Class[] {               
-                       NonViralName.class
+        return new Class[] {
+                NonViralName.class
                 };
     }
 
+       @Override
+       public void addToIndexedClasses(Class<? extends CdmBase> cdmBaseClass) {
+               indexedClasses.add(cdmBaseClass);
+               
+       }
+
+       @Override
+       public void clearIndexedClasses() {
+               indexedClasses.clear();         
+       }
 
-    
 
 }
\ No newline at end of file