merge trunk into cdm3.3 branch

[cdmlib.git] / cdmlib-services / src / main / java / eu / etaxonomy / cdm / api / service / search / CdmMassIndexer.java
diff --git a/cdmlib-services/src/main/java/eu/etaxonomy/cdm/api/service/search/CdmMassIndexer.java b/cdmlib-services/src/main/java/eu/etaxonomy/cdm/api/service/search/CdmMassIndexer.java

index db4ecf3cee385a3dc96def7461ad74ea1fe45527..e461690f88780ed97f7291c511e456b63418f785 100644 (file)
--- a/cdmlib-services/src/main/java/eu/etaxonomy/cdm/api/service/search/CdmMassIndexer.java
+++ b/cdmlib-services/src/main/java/eu/etaxonomy/cdm/api/service/search/CdmMassIndexer.java
@@ -9,14 +9,32 @@
  */
  package eu.etaxonomy.cdm.api.service.search;
  
+import java.io.IOException;
+import java.lang.reflect.Field;
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Set;
+
  import org.apache.log4j.Logger;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.IndexWriterConfig;
+import org.apache.lucene.search.spell.Dictionary;
+import org.apache.lucene.search.spell.LuceneDictionary;
+import org.apache.lucene.search.spell.SpellChecker;
+import org.apache.lucene.store.Directory;
  import org.hibernate.CacheMode;
  import org.hibernate.FlushMode;
+import org.hibernate.ObjectNotFoundException;
  import org.hibernate.ScrollMode;
  import org.hibernate.ScrollableResults;
  import org.hibernate.Session;
  import org.hibernate.search.FullTextSession;
  import org.hibernate.search.Search;
+import org.hibernate.search.engine.spi.SearchFactoryImplementor;
+import org.hibernate.search.indexes.impl.DirectoryBasedIndexManager;
+import org.hibernate.search.indexes.spi.IndexManager;
  import org.springframework.beans.factory.annotation.Autowired;
  import org.springframework.orm.hibernate4.HibernateTransactionManager;
  import org.springframework.stereotype.Component;
@@ -27,8 +45,12 @@ import eu.etaxonomy.cdm.common.monitor.IProgressMonitor;
  import eu.etaxonomy.cdm.common.monitor.NullProgressMonitor;
  import eu.etaxonomy.cdm.common.monitor.RestServiceProgressMonitor;
  import eu.etaxonomy.cdm.common.monitor.SubProgressMonitor;
+import eu.etaxonomy.cdm.config.Configuration;
  import eu.etaxonomy.cdm.model.common.CdmBase;
  import eu.etaxonomy.cdm.model.description.DescriptionElementBase;
+import eu.etaxonomy.cdm.model.name.NonViralName;
+import eu.etaxonomy.cdm.model.name.TaxonNameBase;
+import eu.etaxonomy.cdm.model.occurrence.SpecimenOrObservationBase;
  import eu.etaxonomy.cdm.model.taxon.Classification;
  import eu.etaxonomy.cdm.model.taxon.TaxonBase;
  
@@ -41,9 +63,15 @@ import eu.etaxonomy.cdm.model.taxon.TaxonBase;
  @Transactional
  public class CdmMassIndexer implements ICdmMassIndexer {
  
+       private Set<Class<? extends CdmBase>> indexedClasses = new HashSet<Class<? extends CdmBase>>();
      public static final Logger logger = Logger.getLogger(CdmMassIndexer.class);
  
-    private static final int BATCH_SIZE = 100;
+    /*
+     *      !!! DO NOT CHANGE THIS !!!
+     *
+     * batch_size optimized for 200MB of heap memory
+     */
+    private static final int BATCH_SIZE = 200;
  
      public HibernateTransactionManager transactionManager;
  
@@ -70,7 +98,7 @@ public class CdmMassIndexer implements ICdmMassIndexer {
          Long countResult = countEntities(type);
          int numOfBatches = calculateNumOfBatches(countResult);
  
-        SubProgressMonitor subMonitor = new SubProgressMonitor(monitor, numOfBatches);
+        SubProgressMonitor subMonitor = new SubProgressMonitor(monitor, 1);
          subMonitor.beginTask("Indexing " + type.getSimpleName(), numOfBatches);
  
          // Scrollable results will avoid loading too many objects in memory
@@ -84,12 +112,17 @@ public class CdmMassIndexer implements ICdmMassIndexer {
                  fullTextSession.index(results.get(0)); // index each element
                  if (index % BATCH_SIZE == 0 || index == countResult) {
                      batchesWorked++;
-                    fullTextSession.flushToIndexes(); // apply changes to indexes
-                    fullTextSession.clear(); // clear since the queue is processed
-                    //                calculateNumOfBatches(index == countResult ? countResult : index);
-                    logger.info("\tbatch " + batchesWorked + "/" + numOfBatches + " processed");
-                    subMonitor.internalWorked(1);
-                    //if(index / BATCH_SIZE > 10 ) break;
+                    try {
+                        fullTextSession.flushToIndexes(); // apply changes to indexes
+                    } catch(ObjectNotFoundException e){
+                        // TODO report this issue to progress monitor once it can report on errors
+                        logger.error("possibly invalid data, thus skipping this batch and continuing with next one", e);
+                    } finally {
+                        fullTextSession.clear(); // clear since the queue is processed
+                        getSession().clear(); // clear session to free memory
+                        subMonitor.worked(1);
+                        logger.info("\tbatch " + batchesWorked + "/" + numOfBatches + " processed");
+                    }
                  }
              }
          } catch (RuntimeException e) {
@@ -102,6 +135,69 @@ public class CdmMassIndexer implements ICdmMassIndexer {
          subMonitor.done();
      }
  
+    /**
+     * Creates the spell checking dictionary for the index of the given type.
+     * <p>
+     * For each field returned by {@link #getIndexedDeclaredFields(Class)} the terms
+     * are read from the index of <code>type</code> and added to a {@link SpellChecker}
+     * which is opened on the same {@link Directory} as the index itself. Nothing is
+     * done if <code>type</code> is not annotated with <code>@Indexed</code>.
+     * <p>
+     * An {@link IOException} or {@link RuntimeException} raised while the dictionary
+     * is being written is logged and reported to the monitor through the
+     * {@link RestServiceProgressMonitor#STOPPED_WORK_INDICATOR}; it is not rethrown.
+     *
+     * @param type
+     *            the indexed class to create the dictionary for
+     * @param monitor
+     *            the parent progress monitor, a {@link SubProgressMonitor} with one
+     *            tick is created on it
+     */
+    protected <T extends CdmBase> void createDictionary(Class<T> type, IProgressMonitor monitor)  {
+        if(!type.isAnnotationPresent(org.hibernate.search.annotations.Indexed.class)) {
+            //TODO: give some indication that this class is in fact not indexed
+            return;
+        }
+        String indexName = type.getAnnotation(org.hibernate.search.annotations.Indexed.class).index();
+        if(indexName.isEmpty()) {
+            // @Indexed.index() defaults to the empty string, in that case the index
+            // is named after the fully qualified class name
+            indexName = type.getName();
+        }
+
+        SearchFactoryImplementor searchFactory = (SearchFactoryImplementor)Search.getFullTextSession(getSession()).getSearchFactory();
+        IndexManager indexManager = searchFactory.getAllIndexesManager().getIndexManager(indexName);
+        Directory directory = ((DirectoryBasedIndexManager) indexManager).getDirectoryProvider().getDirectory();
+        List<String> idFields = getIndexedDeclaredFields(type);
+
+        monitor.subTask("creating dictionary " + type.getSimpleName());
+
+        SubProgressMonitor subMonitor = new SubProgressMonitor(monitor, 1);
+        subMonitor.beginTask("Creating dictionary " + type.getSimpleName(), 1);
+
+        // the reader is opened directly in front of the try block so that it can not
+        // leak when one of the preceding steps fails
+        IndexReader indexReader = searchFactory.getIndexReaderAccessor().open(type);
+        SpellChecker spellChecker = null;
+        try {
+            spellChecker = new SpellChecker(directory);
+            for(String indexedField : idFields) {
+                logger.info("creating dictionary for field " + indexedField);
+                Dictionary dictionary = new LuceneDictionary(indexReader, indexedField);
+                IndexWriterConfig iwc = new IndexWriterConfig(Configuration.luceneVersion, searchFactory.getAnalyzer(type));
+                spellChecker.indexDictionary(dictionary, iwc, true);
+            }
+            subMonitor.internalWorked(1);
+        } catch (IOException e) {
+            logger.error("IOException when creating dictionary", e);
+            //TODO better means to notify that the process has been stopped, using the STOPPED_WORK_INDICATOR is only a hack
+            monitor.worked(RestServiceProgressMonitor.STOPPED_WORK_INDICATOR);
+            monitor.done();
+        } catch (RuntimeException e) {
+            logger.error("RuntimeException when creating dictionary", e);
+            //TODO better means to notify that the process has been stopped, using the STOPPED_WORK_INDICATOR is only a hack
+            monitor.worked(RestServiceProgressMonitor.STOPPED_WORK_INDICATOR);
+            monitor.done();
+        } finally {
+            // release both resources on every exit path, the spell checker is
+            // closed even if closing the reader fails
+            try {
+                searchFactory.getIndexReaderAccessor().close(indexReader);
+            } finally {
+                if (spellChecker != null) {
+                    try {
+                        logger.info("closing spellchecker ");
+                        spellChecker.close();
+                    } catch (IOException e) {
+                        logger.error("IOException when closing spellchecker", e);
+                    }
+                }
+            }
+        }
+
+        logger.info("end creating dictionary " + type.getName());
+        subMonitor.done();
+    }
+
+
      /**
       * @param countResult
       * @return
@@ -126,6 +222,30 @@ public class CdmMassIndexer implements ICdmMassIndexer {
          FullTextSession fullTextSession = Search.getFullTextSession(getSession());
          logger.info("purging " + type.getName());
          fullTextSession.purgeAll(type);
+
+
+        SearchFactoryImplementor searchFactory = (SearchFactoryImplementor)fullTextSession.getSearchFactory();
+        IndexManager indexManager = searchFactory.getAllIndexesManager().getIndexManager(type.getName());
+        Directory directory = ((DirectoryBasedIndexManager) indexManager).getDirectoryProvider().getDirectory();
+        SpellChecker spellChecker = null;
+        try {
+            spellChecker = new SpellChecker(directory);
+            spellChecker.clearIndex();
+        } catch (IOException e) {
+            logger.error("IOException when creating dictionary", e);
+            //TODO better means to notify that the process has been stopped, using the STOPPED_WORK_INDICATOR is only a hack
+            monitor.worked(RestServiceProgressMonitor.STOPPED_WORK_INDICATOR);
+            monitor.done();
+        }
+
+        if (spellChecker != null) {
+            try {
+                logger.info("closing spellchecker ");
+                spellChecker.close();
+            } catch (IOException e) {
+                logger.error("IOException when closing spellchecker", e);
+            }
+        }
      }
  
  
@@ -140,26 +260,48 @@ public class CdmMassIndexer implements ICdmMassIndexer {
          }
  
          monitor.setTaskName("CdmMassIndexer");
-        int steps = totalBatchCount() + 1; // +1 for optimize
-        monitor.beginTask("Reindexing " + indexedClasses().length + " classes", steps);
+        int steps = indexedClasses().size() + 1; // +1 for optimize
+        monitor.beginTask("Reindexing " + indexedClasses().size() + " classes", steps);
  
-        for(Class type : indexedClasses()){
+        for(Class<? extends CdmBase> type : indexedClasses()){
              reindex(type, monitor);
          }
-        optimize(monitor);
+
+        monitor.subTask("Optimizing Index");
+        SubProgressMonitor subMonitor = new SubProgressMonitor(monitor, 1);
+        subMonitor.beginTask("Optimizing Index",1);
+        optimize();        
+        subMonitor.worked(1);
+        logger.info("end index optimization");
+        subMonitor.done();
+        
+        //monitor.worked(1);
          monitor.done();
      }
  
-    protected void optimize(IProgressMonitor monitor) {
+    @Override
+    public void createDictionary(IProgressMonitor monitor) {
+        if(monitor == null){
+            monitor = new NullProgressMonitor();
+        }
+
+        monitor.setTaskName("CdmMassIndexer_Dictionary");
+        int steps = dictionaryClasses().length; // +1 for optimize
+        monitor.beginTask("Creating Dictionary " + dictionaryClasses().length + " classes", steps);
  
-        monitor.subTask("optimizing");
-        SubProgressMonitor subMonitor = new SubProgressMonitor(monitor, 1);
+        for(Class type : dictionaryClasses()){
+            createDictionary(type, monitor);
+        }
+
+        monitor.done();
+
+    }
+    protected void optimize() {
  
          FullTextSession fullTextSession = Search.getFullTextSession(getSession());
          fullTextSession.getSearchFactory().optimize();
-
-        subMonitor.beginTask("optimizing", 1);
-        subMonitor.done();
+        fullTextSession.flushToIndexes();
+        fullTextSession.clear();
      }
  
      /**
@@ -184,34 +326,86 @@ public class CdmMassIndexer implements ICdmMassIndexer {
          }
  
          monitor.setTaskName("CdmMassIndexer");
-        int steps = indexedClasses().length + 1; // +1 for optimize
-        monitor.beginTask("Purging " + indexedClasses().length + " classes", steps);
+        int steps = indexedClasses().size() + 1; // +1 for optimize
+        monitor.beginTask("Purging " + indexedClasses().size() + " classes", steps);
  
-        for(Class type : indexedClasses()){
+        for(Class<? extends CdmBase> type : indexedClasses()){
              purge(type, monitor);
              monitor.worked(1);
          }
+        // need to flush to the index before optimizing
+        // the purge method is not doing the flushing by itself
+        FullTextSession fullTextSession = Search.getFullTextSession(getSession());
+        fullTextSession.flushToIndexes();
  
-//        // need to commit and start new transaction before optimizing
-//        FullTextSession fullTextSession = Search.getFullTextSession(getSession());
-//        Transaction tx = fullTextSession.getTransaction();
-//        tx.commit();
-//        fullTextSession.beginTransaction(); // will be committed automatically at the end of this method since this class is transactional
-
-//        optimize(monitor);
+        // optimize
+        optimize();
+        monitor.worked(1);
  
+        // done
          monitor.done();
      }
  
+
+    /**
+     * Collects the names of the fields declared directly in the given class which
+     * carry a Hibernate Search <code>@Field</code> or <code>@Fields</code> annotation.
+     * Only the fields returned by {@link Class#getDeclaredFields()} are inspected.
+     * The result is empty if the class itself is not annotated with
+     * <code>@Indexed</code>.
+     *
+     * @param clazz
+     *            the class to inspect
+     * @return the names of the annotated declared fields, never <code>null</code>
+     */
+    private List<String> getIndexedDeclaredFields(Class clazz) {
+        List<String> fieldNames = new ArrayList<String>();
+        if(!clazz.isAnnotationPresent(org.hibernate.search.annotations.Indexed.class)) {
+            return fieldNames;
+        }
+        for(Field declaredField : clazz.getDeclaredFields()) {
+            String fieldName = declaredField.getName();
+            logger.info("checking field " + fieldName);
+            boolean isIndexedField = declaredField.isAnnotationPresent(org.hibernate.search.annotations.Field.class)
+                    || declaredField.isAnnotationPresent(org.hibernate.search.annotations.Fields.class);
+            if(isIndexedField) {
+                fieldNames.add(fieldName);
+                logger.info("adding field " + fieldName);
+            }
+        }
+        return fieldNames;
+    }
+    /**
+     * Returns the classes which are processed when reindexing or purging.
+     * <p>
+     * As long as no class has been registered the set is filled with the default
+     * list first: {@link DescriptionElementBase}, {@link TaxonBase},
+     * {@link Classification}, {@link TaxonNameBase} and
+     * {@link SpecimenOrObservationBase}. The set handed out is the internal
+     * instance of this indexer, not a copy.
+     *
+     * @return the set of classes to be indexed
+     */
+    @SuppressWarnings("unchecked")
+    @Override
+    public Set<Class<? extends CdmBase>> indexedClasses() {
+        // nothing set 'manually' so far: fall back to the full default list
+        if(indexedClasses.isEmpty()) {
+            indexedClasses.add(DescriptionElementBase.class);
+            indexedClasses.add(TaxonBase.class);
+            indexedClasses.add(Classification.class);
+            indexedClasses.add(TaxonNameBase.class);
+            indexedClasses.add(SpecimenOrObservationBase.class);
+        }
+        return indexedClasses;
+    }
+
      /**
       * @return
       */
      @Override
-    public Class[] indexedClasses() {
+    public Class[] dictionaryClasses() {
          return new Class[] {
-                DescriptionElementBase.class,
-                Classification.class,
-                TaxonBase.class
+                NonViralName.class
                  };
      }
+
+       /**
+        * Adds the given class to the set of classes returned by
+        * {@link #indexedClasses()}.
+        *
+        * @param cdmBaseClass
+        *            the class to add
+        */
+       @Override
+       public void addToIndexedClasses(Class<? extends CdmBase> cdmBaseClass) {
+               this.indexedClasses.add(cdmBaseClass);
+       }
+
+       /**
+        * Removes all entries from the set of classes to be indexed. The next call
+        * of {@link #indexedClasses()} on an empty set fills in the default list
+        * again.
+        */
+       @Override
+       public void clearIndexedClasses() {
+               this.indexedClasses.clear();
+       }
+
+
  }
 \ No newline at end of file