cdmlib-services/src/main/java/eu/etaxonomy/cdm/api/service/search/CdmMassIndexer.java

   1 // $Id$
   2 /**
   3 * Copyright (C) 2011 EDIT
   4 * European Distributed Institute of Taxonomy
   5 * http://www.e-taxonomy.eu
   6 *
   7 * The contents of this file are subject to the Mozilla Public License Version 1.1
   8 * See LICENSE.TXT at the top of this package for the full license terms.
   9 */
  10 package eu.etaxonomy.cdm.api.service.search;
  11
  12 import java.io.IOException;
  13 import java.lang.reflect.Field;
  14 import java.util.ArrayList;
  15 import java.util.Iterator;
  16 import java.util.List;
  17
  18 import org.apache.log4j.Logger;
  19 import org.apache.lucene.index.IndexReader;
  20 import org.apache.lucene.index.IndexWriterConfig;
  21 import org.apache.lucene.search.spell.Dictionary;
  22 import org.apache.lucene.search.spell.LuceneDictionary;
  23 import org.apache.lucene.search.spell.SpellChecker;
  24 import org.apache.lucene.store.Directory;
  25 import org.hibernate.CacheMode;
  26 import org.hibernate.FlushMode;
  27 import org.hibernate.ScrollMode;
  28 import org.hibernate.ScrollableResults;
  29 import org.hibernate.Session;
  30 import org.hibernate.search.FullTextSession;
  31 import org.hibernate.search.Search;
  32 import org.hibernate.search.engine.spi.SearchFactoryImplementor;
  33 import org.hibernate.search.indexes.impl.DirectoryBasedIndexManager;
  34 import org.hibernate.search.indexes.spi.IndexManager;
  35 import org.springframework.beans.factory.annotation.Autowired;
  36 import org.springframework.orm.hibernate4.HibernateTransactionManager;
  37 import org.springframework.stereotype.Component;
  38 import org.springframework.transaction.PlatformTransactionManager;
  39 import org.springframework.transaction.annotation.Transactional;
  40
  41 import eu.etaxonomy.cdm.common.monitor.IProgressMonitor;
  42 import eu.etaxonomy.cdm.common.monitor.NullProgressMonitor;
  43 import eu.etaxonomy.cdm.common.monitor.RestServiceProgressMonitor;
  44 import eu.etaxonomy.cdm.common.monitor.SubProgressMonitor;
  45 import eu.etaxonomy.cdm.config.Configuration;
  46 import eu.etaxonomy.cdm.model.common.CdmBase;
  47 import eu.etaxonomy.cdm.model.description.DescriptionElementBase;
  48 import eu.etaxonomy.cdm.model.name.NonViralName;
  49 import eu.etaxonomy.cdm.model.name.TaxonNameBase;
  50 import eu.etaxonomy.cdm.model.occurrence.SpecimenOrObservationBase;
  51 import eu.etaxonomy.cdm.model.taxon.Classification;
  52 import eu.etaxonomy.cdm.model.taxon.TaxonBase;
  53
  54 /**
  55  * @author Andreas Kohlbecker
  56  * @date Dec 15, 2011
  57  *
  58  */
  59 @Component
  60 @Transactional
  61 public class CdmMassIndexer implements ICdmMassIndexer {
  62
  63     public static final Logger logger = Logger.getLogger(CdmMassIndexer.class);
  64
  65     private static final int BATCH_SIZE = 2000;
  66
  67     public HibernateTransactionManager transactionManager;
  68
  69     @Autowired
  70     public void setTransactionManager(PlatformTransactionManager transactionManager) {
  71         this.transactionManager = (HibernateTransactionManager)transactionManager;
  72     }
  73
  74     protected Session getSession(){
  75         Session session = transactionManager.getSessionFactory().getCurrentSession();
  76         return session;
  77     }
  78
  79     protected <T extends CdmBase>void reindex(Class<T> type, IProgressMonitor monitor) {
  80
  81         FullTextSession fullTextSession = Search.getFullTextSession(getSession());
  82
  83         fullTextSession.setFlushMode(FlushMode.MANUAL);
  84         fullTextSession.setCacheMode(CacheMode.IGNORE);
  85
  86         logger.info("start indexing " + type.getName());
  87         monitor.subTask("indexing " + type.getSimpleName());
  88
  89         Long countResult = countEntities(type);
  90         int numOfBatches = calculateNumOfBatches(countResult);
  91
  92         SubProgressMonitor subMonitor = new SubProgressMonitor(monitor, 1);
  93         subMonitor.beginTask("Indexing " + type.getSimpleName(), numOfBatches);
  94
  95         // Scrollable results will avoid loading too many objects in memory
  96         ScrollableResults results = fullTextSession.createCriteria(type).setFetchSize(BATCH_SIZE).scroll(ScrollMode.FORWARD_ONLY);
  97         long index = 0;
  98         int batchesWorked = 0;
  99
 100         try {
 101             while (results.next()) {
 102                 index++;
 103                 fullTextSession.index(results.get(0)); // index each element
 104                 if (index % BATCH_SIZE == 0 || index == countResult) {
 105                     batchesWorked++;
 106                     fullTextSession.flushToIndexes(); // apply changes to indexes
 107                     fullTextSession.clear(); // clear since the queue is processed
 108                     subMonitor.worked(1);
 109                     logger.info("\tbatch " + batchesWorked + "/" + numOfBatches + " processed");
 110                     //if(index / BATCH_SIZE > 10 ) break;
 111                 }
 112             }
 113         } catch (RuntimeException e) {
 114             //TODO better means to notify that the process has been stopped, using the STOPPED_WORK_INDICATOR is only a hack
 115             monitor.worked(RestServiceProgressMonitor.STOPPED_WORK_INDICATOR);
 116             monitor.done();
 117             throw       e;
 118         }
 119         logger.info("end indexing " + type.getName());
 120         subMonitor.done();
 121     }
 122
 123     /**
 124      *
 125      *
 126      * @param type
 127      * @param monitor
 128      */
 129     protected <T extends CdmBase> void createDictionary(Class<T> type, IProgressMonitor monitor)  {
 130         String indexName = null;
 131         if(type.isAnnotationPresent(org.hibernate.search.annotations.Indexed.class)) {
 132                 indexName = type.getAnnotation(org.hibernate.search.annotations.Indexed.class).index();
 133         } else {
 134                 //TODO:give some indication that this class is infact not indexed
 135                 return;
 136         }
 137         SearchFactoryImplementor searchFactory = (SearchFactoryImplementor)Search.getFullTextSession(getSession()).getSearchFactory();
 138         IndexManager indexManager = searchFactory.getAllIndexesManager().getIndexManager(indexName);
 139         IndexReader indexReader = searchFactory.getIndexReaderAccessor().open(type);
 140         List<String> idFields = getIndexedDeclaredFields(type);
 141
 142         monitor.subTask("creating dictionary " + type.getSimpleName());
 143
 144         SubProgressMonitor subMonitor = new SubProgressMonitor(monitor, 1);
 145         subMonitor.beginTask("Creating dictionary " + type.getSimpleName(), 1);
 146
 147         Directory directory = ((DirectoryBasedIndexManager) indexManager).getDirectoryProvider().getDirectory();
 148         SpellChecker spellChecker = null;
 149         try {
 150                 spellChecker = new SpellChecker(directory);
 151                 Iterator<String> itr = idFields.iterator();
 152                 while(itr.hasNext()) {
 153                         String indexedField = itr.next();
 154                         logger.info("creating dictionary for field " + indexedField);
 155                         Dictionary dictionary = new LuceneDictionary(indexReader, indexedField);
 156                         IndexWriterConfig iwc = new IndexWriterConfig(Configuration.luceneVersion, searchFactory.getAnalyzer(type));
 157                         spellChecker.indexDictionary(dictionary, iwc, true);
 158                 }
 159                 subMonitor.internalWorked(1);
 160         } catch (IOException e) {
 161                 logger.error("IOException when creating dictionary", e);
 162                 //TODO better means to notify that the process has been stopped, using the STOPPED_WORK_INDICATOR is only a hack
 163             monitor.worked(RestServiceProgressMonitor.STOPPED_WORK_INDICATOR);
 164             monitor.done();
 165         } catch (RuntimeException e) {
 166                 logger.error("RuntimeException when creating dictionary", e);
 167                 //TODO better means to notify that the process has been stopped, using the STOPPED_WORK_INDICATOR is only a hack
 168             monitor.worked(RestServiceProgressMonitor.STOPPED_WORK_INDICATOR);
 169             monitor.done();
 170         } finally {
 171                 searchFactory.getIndexReaderAccessor().close(indexReader);
 172         }
 173         if (spellChecker != null) {
 174                 try {
 175                         logger.info("closing spellchecker ");
 176                         spellChecker.close();
 177                 } catch (IOException e) {
 178                         logger.error("IOException when closing spellchecker", e);
 179                 }
 180         }
 181
 182         logger.info("end creating dictionary " + type.getName());
 183         subMonitor.done();
 184     }
 185
 186     /**
 187      * @param countResult
 188      * @return
 189      */
 190     private int calculateNumOfBatches(Long countResult) {
 191         Long numOfBatches =  countResult > 0 ? ((countResult-1)/BATCH_SIZE)+1 : 0;
 192         return numOfBatches.intValue();
 193     }
 194
 195     /**
 196      * @param type
 197      * @return
 198      */
 199     private <T> Long countEntities(Class<T> type) {
 200         Object countResultObj = getSession().createQuery("select count(*) from " + type.getName()).uniqueResult();
 201         Long countResult = (Long)countResultObj;
 202         return countResult;
 203     }
 204
 205     protected <T extends CdmBase>void purge(Class<T> type, IProgressMonitor monitor) {
 206
 207         FullTextSession fullTextSession = Search.getFullTextSession(getSession());
 208         logger.info("purging " + type.getName());
 209         fullTextSession.purgeAll(type);
 210
 211
 212         SearchFactoryImplementor searchFactory = (SearchFactoryImplementor)fullTextSession.getSearchFactory();
 213         IndexManager indexManager = searchFactory.getAllIndexesManager().getIndexManager(type.getName());
 214         Directory directory = ((DirectoryBasedIndexManager) indexManager).getDirectoryProvider().getDirectory();
 215         SpellChecker spellChecker = null;
 216         try {
 217                 spellChecker = new SpellChecker(directory);
 218                 spellChecker.clearIndex();
 219         } catch (IOException e) {
 220                 logger.error("IOException when creating dictionary", e);
 221                 //TODO better means to notify that the process has been stopped, using the STOPPED_WORK_INDICATOR is only a hack
 222             monitor.worked(RestServiceProgressMonitor.STOPPED_WORK_INDICATOR);
 223             monitor.done();
 224         }
 225
 226         if (spellChecker != null) {
 227                 try {
 228                         logger.info("closing spellchecker ");
 229                         spellChecker.close();
 230                 } catch (IOException e) {
 231                         logger.error("IOException when closing spellchecker", e);
 232                 }
 233         }
 234     }
 235
 236
 237     /* (non-Javadoc)
 238      * @see eu.etaxonomy.cdm.database.IMassIndexer#reindex()
 239      */
 240     @Override
 241     public void reindex(IProgressMonitor monitor){
 242
 243         if(monitor == null){
 244             monitor = new NullProgressMonitor();
 245         }
 246
 247         monitor.setTaskName("CdmMassIndexer");
 248         int steps = indexedClasses().length + 1; // +1 for optimize
 249         monitor.beginTask("Reindexing " + indexedClasses().length + " classes", steps);
 250
 251         for(Class<? extends CdmBase> type : indexedClasses()){
 252             reindex(type, monitor);
 253             // clear the session after each class to free memory
 254             getSession().clear();
 255         }
 256
 257         optimize();
 258         monitor.worked(1);
 259
 260         monitor.done();
 261     }
 262
 263         @Override
 264         public void createDictionary(IProgressMonitor monitor) {
 265         if(monitor == null){
 266             monitor = new NullProgressMonitor();
 267         }
 268
 269         monitor.setTaskName("CdmMassIndexer_Dictionary");
 270         int steps = dictionaryClasses().length; // +1 for optimize
 271         monitor.beginTask("Creating Dictionary " + dictionaryClasses().length + " classes", steps);
 272
 273         for(Class type : dictionaryClasses()){
 274                 createDictionary(type, monitor);
 275         }
 276
 277         monitor.done();
 278
 279         }
 280     protected void optimize() {
 281
 282         FullTextSession fullTextSession = Search.getFullTextSession(getSession());
 283         fullTextSession.getSearchFactory().optimize();
 284         fullTextSession.flushToIndexes();
 285         fullTextSession.clear();
 286     }
 287
 288     /**
 289      * @return
 290      */
 291     private int totalBatchCount() {
 292         int totalNumOfBatches = 0;
 293         for(Class type : indexedClasses()){
 294             totalNumOfBatches += calculateNumOfBatches(countEntities(type));
 295         }
 296         return totalNumOfBatches;
 297     }
 298
 299     /* (non-Javadoc)
 300      * @see eu.etaxonomy.cdm.database.IMassIndexer#purge()
 301      */
 302     @Override
 303     public void purge(IProgressMonitor monitor){
 304
 305         if(monitor == null){
 306             monitor = new NullProgressMonitor();
 307         }
 308
 309         monitor.setTaskName("CdmMassIndexer");
 310         int steps = indexedClasses().length + 1; // +1 for optimize
 311         monitor.beginTask("Purging " + indexedClasses().length + " classes", steps);
 312
 313         for(Class<? extends CdmBase> type : indexedClasses()){
 314             purge(type, monitor);
 315             monitor.worked(1);
 316         }
 317         // need to flush to the index before optimizing
 318         // the purge method is not doing the flushing by itself
 319         FullTextSession fullTextSession = Search.getFullTextSession(getSession());
 320         fullTextSession.flushToIndexes();
 321
 322         // optimize
 323         optimize();
 324         monitor.worked(1);
 325
 326         // done
 327         monitor.done();
 328     }
 329
 330
 331     /**
 332      * Returns a list of declared indexable fields within a class through reflection.
 333      *
 334      * @param clazz
 335      * @return
 336      */
 337     private List<String> getIndexedDeclaredFields(Class clazz) {
 338         List<String> idFields = new ArrayList<String>();
 339         if(clazz.isAnnotationPresent(org.hibernate.search.annotations.Indexed.class)) {
 340                 Field[] declaredFields = clazz.getDeclaredFields();
 341                 for(int i=0;i<declaredFields.length;i++ ) {
 342                         logger.info("checking field " + declaredFields[i].getName());
 343                         if(declaredFields[i].isAnnotationPresent(org.hibernate.search.annotations.Field.class) ||
 344                                         declaredFields[i].isAnnotationPresent(org.hibernate.search.annotations.Fields.class)) {
 345                                 idFields.add(declaredFields[i].getName());
 346                                 logger.info("adding field " + declaredFields[i].getName());
 347                         }
 348                 }
 349         }
 350         return idFields;
 351     }
 352     /**
 353      * @return
 354      */
 355     @SuppressWarnings("unchecked")
 356     @Override
 357     public Class<? extends CdmBase>[] indexedClasses() {
 358         return new Class[] {
 359                 DescriptionElementBase.class,
 360                 Classification.class,
 361                 TaxonBase.class,
 362                 TaxonNameBase.class,
 363                 SpecimenOrObservationBase.class
 364                 };
 365     }
 366
 367     /**
 368      * @return
 369      */
 370     @Override
 371     public Class[] dictionaryClasses() {
 372         return new Class[] {
 373                         NonViralName.class
 374                 };
 375     }
 376
 377
 378
 379
 380 }