fixing stack overflow due to incorrect usage of freetext sessions
[cdmlib.git] / cdmlib-services / src / main / java / eu / etaxonomy / cdm / api / service / search / CdmMassIndexer.java
1 // $Id$
2 /**
3 * Copyright (C) 2011 EDIT
4 * European Distributed Institute of Taxonomy
5 * http://www.e-taxonomy.eu
6 *
7 * The contents of this file are subject to the Mozilla Public License Version 1.1
8 * See LICENSE.TXT at the top of this package for the full license terms.
9 */
10 package eu.etaxonomy.cdm.api.service.search;
11
12 import java.io.IOException;
13 import java.lang.reflect.Field;
14 import java.util.ArrayList;
15 import java.util.Iterator;
16 import java.util.List;
17
18 import org.apache.log4j.Logger;
19 import org.apache.lucene.index.IndexReader;
20 import org.apache.lucene.index.IndexWriterConfig;
21 import org.apache.lucene.search.spell.Dictionary;
22 import org.apache.lucene.search.spell.LuceneDictionary;
23 import org.apache.lucene.search.spell.SpellChecker;
24 import org.apache.lucene.store.Directory;
25 import org.hibernate.CacheMode;
26 import org.hibernate.FlushMode;
27 import org.hibernate.ScrollMode;
28 import org.hibernate.ScrollableResults;
29 import org.hibernate.Session;
30 import org.hibernate.search.FullTextSession;
31 import org.hibernate.search.Search;
32 import org.hibernate.search.engine.spi.SearchFactoryImplementor;
33 import org.hibernate.search.indexes.impl.DirectoryBasedIndexManager;
34 import org.hibernate.search.indexes.spi.IndexManager;
35 import org.springframework.beans.factory.annotation.Autowired;
36 import org.springframework.orm.hibernate4.HibernateTransactionManager;
37 import org.springframework.stereotype.Component;
38 import org.springframework.transaction.PlatformTransactionManager;
39 import org.springframework.transaction.annotation.Transactional;
40
41 import eu.etaxonomy.cdm.common.monitor.IProgressMonitor;
42 import eu.etaxonomy.cdm.common.monitor.NullProgressMonitor;
43 import eu.etaxonomy.cdm.common.monitor.RestServiceProgressMonitor;
44 import eu.etaxonomy.cdm.common.monitor.SubProgressMonitor;
45 import eu.etaxonomy.cdm.config.Configuration;
46 import eu.etaxonomy.cdm.model.common.CdmBase;
47 import eu.etaxonomy.cdm.model.description.DescriptionElementBase;
48 import eu.etaxonomy.cdm.model.name.NonViralName;
49 import eu.etaxonomy.cdm.model.name.TaxonNameBase;
50 import eu.etaxonomy.cdm.model.occurrence.SpecimenOrObservationBase;
51 import eu.etaxonomy.cdm.model.taxon.Classification;
52 import eu.etaxonomy.cdm.model.taxon.TaxonBase;
53
54 /**
55 * @author Andreas Kohlbecker
56 * @date Dec 15, 2011
57 *
58 */
59 @Component
60 @Transactional
61 public class CdmMassIndexer implements ICdmMassIndexer {
62
63 public static final Logger logger = Logger.getLogger(CdmMassIndexer.class);
64
65 private static final int BATCH_SIZE = 2000;
66
67 public HibernateTransactionManager transactionManager;
68
69 @Autowired
70 public void setTransactionManager(PlatformTransactionManager transactionManager) {
71 this.transactionManager = (HibernateTransactionManager)transactionManager;
72 }
73
74 protected Session getSession(){
75 Session session = transactionManager.getSessionFactory().getCurrentSession();
76 return session;
77 }
78
79 protected <T extends CdmBase>void reindex(Class<T> type, IProgressMonitor monitor) {
80
81 FullTextSession fullTextSession = Search.getFullTextSession(getSession());
82
83 fullTextSession.setFlushMode(FlushMode.MANUAL);
84 fullTextSession.setCacheMode(CacheMode.IGNORE);
85
86 logger.info("start indexing " + type.getName());
87 monitor.subTask("indexing " + type.getSimpleName());
88
89 Long countResult = countEntities(type);
90 int numOfBatches = calculateNumOfBatches(countResult);
91
92 SubProgressMonitor subMonitor = new SubProgressMonitor(monitor, 1);
93 subMonitor.beginTask("Indexing " + type.getSimpleName(), numOfBatches);
94
95 // Scrollable results will avoid loading too many objects in memory
96 ScrollableResults results = fullTextSession.createCriteria(type).setFetchSize(BATCH_SIZE).scroll(ScrollMode.FORWARD_ONLY);
97 long index = 0;
98 int batchesWorked = 0;
99
100 try {
101 while (results.next()) {
102 index++;
103 fullTextSession.index(results.get(0)); // index each element
104 if (index % BATCH_SIZE == 0 || index == countResult) {
105 batchesWorked++;
106 fullTextSession.flushToIndexes(); // apply changes to indexes
107 fullTextSession.clear(); // clear since the queue is processed
108 subMonitor.worked(1);
109 logger.info("\tbatch " + batchesWorked + "/" + numOfBatches + " processed");
110 //if(index / BATCH_SIZE > 10 ) break;
111 }
112 }
113 } catch (RuntimeException e) {
114 //TODO better means to notify that the process has been stopped, using the STOPPED_WORK_INDICATOR is only a hack
115 monitor.worked(RestServiceProgressMonitor.STOPPED_WORK_INDICATOR);
116 monitor.done();
117 throw e;
118 }
119 logger.info("end indexing " + type.getName());
120 subMonitor.done();
121 }
122
123 /**
124 *
125 *
126 * @param type
127 * @param monitor
128 */
129 protected <T extends CdmBase> void createDictionary(Class<T> type, IProgressMonitor monitor) {
130 String indexName = null;
131 if(type.isAnnotationPresent(org.hibernate.search.annotations.Indexed.class)) {
132 indexName = type.getAnnotation(org.hibernate.search.annotations.Indexed.class).index();
133 } else {
134 //TODO:give some indication that this class is infact not indexed
135 return;
136 }
137 SearchFactoryImplementor searchFactory = (SearchFactoryImplementor)Search.getFullTextSession(getSession()).getSearchFactory();
138 IndexManager indexManager = searchFactory.getAllIndexesManager().getIndexManager(indexName);
139 IndexReader indexReader = searchFactory.getIndexReaderAccessor().open(type);
140 List<String> idFields = getIndexedDeclaredFields(type);
141
142 monitor.subTask("creating dictionary " + type.getSimpleName());
143
144 SubProgressMonitor subMonitor = new SubProgressMonitor(monitor, 1);
145 subMonitor.beginTask("Creating dictionary " + type.getSimpleName(), 1);
146
147 Directory directory = ((DirectoryBasedIndexManager) indexManager).getDirectoryProvider().getDirectory();
148 SpellChecker spellChecker = null;
149 try {
150 spellChecker = new SpellChecker(directory);
151 Iterator<String> itr = idFields.iterator();
152 while(itr.hasNext()) {
153 String indexedField = itr.next();
154 logger.info("creating dictionary for field " + indexedField);
155 Dictionary dictionary = new LuceneDictionary(indexReader, indexedField);
156 IndexWriterConfig iwc = new IndexWriterConfig(Configuration.luceneVersion, searchFactory.getAnalyzer(type));
157 spellChecker.indexDictionary(dictionary, iwc, true);
158 }
159 subMonitor.internalWorked(1);
160 } catch (IOException e) {
161 logger.error("IOException when creating dictionary", e);
162 //TODO better means to notify that the process has been stopped, using the STOPPED_WORK_INDICATOR is only a hack
163 monitor.worked(RestServiceProgressMonitor.STOPPED_WORK_INDICATOR);
164 monitor.done();
165 } catch (RuntimeException e) {
166 logger.error("RuntimeException when creating dictionary", e);
167 //TODO better means to notify that the process has been stopped, using the STOPPED_WORK_INDICATOR is only a hack
168 monitor.worked(RestServiceProgressMonitor.STOPPED_WORK_INDICATOR);
169 monitor.done();
170 } finally {
171 searchFactory.getIndexReaderAccessor().close(indexReader);
172 }
173 if (spellChecker != null) {
174 try {
175 logger.info("closing spellchecker ");
176 spellChecker.close();
177 } catch (IOException e) {
178 logger.error("IOException when closing spellchecker", e);
179 }
180 }
181
182 logger.info("end creating dictionary " + type.getName());
183 subMonitor.done();
184 }
185
186 /**
187 * @param countResult
188 * @return
189 */
190 private int calculateNumOfBatches(Long countResult) {
191 Long numOfBatches = countResult > 0 ? ((countResult-1)/BATCH_SIZE)+1 : 0;
192 return numOfBatches.intValue();
193 }
194
195 /**
196 * @param type
197 * @return
198 */
199 private <T> Long countEntities(Class<T> type) {
200 Object countResultObj = getSession().createQuery("select count(*) from " + type.getName()).uniqueResult();
201 Long countResult = (Long)countResultObj;
202 return countResult;
203 }
204
205 protected <T extends CdmBase>void purge(Class<T> type, IProgressMonitor monitor) {
206
207 FullTextSession fullTextSession = Search.getFullTextSession(getSession());
208 logger.info("purging " + type.getName());
209 fullTextSession.purgeAll(type);
210
211
212 SearchFactoryImplementor searchFactory = (SearchFactoryImplementor)fullTextSession.getSearchFactory();
213 IndexManager indexManager = searchFactory.getAllIndexesManager().getIndexManager(type.getName());
214 Directory directory = ((DirectoryBasedIndexManager) indexManager).getDirectoryProvider().getDirectory();
215 SpellChecker spellChecker = null;
216 try {
217 spellChecker = new SpellChecker(directory);
218 spellChecker.clearIndex();
219 } catch (IOException e) {
220 logger.error("IOException when creating dictionary", e);
221 //TODO better means to notify that the process has been stopped, using the STOPPED_WORK_INDICATOR is only a hack
222 monitor.worked(RestServiceProgressMonitor.STOPPED_WORK_INDICATOR);
223 monitor.done();
224 }
225
226 if (spellChecker != null) {
227 try {
228 logger.info("closing spellchecker ");
229 spellChecker.close();
230 } catch (IOException e) {
231 logger.error("IOException when closing spellchecker", e);
232 }
233 }
234 }
235
236
237 /* (non-Javadoc)
238 * @see eu.etaxonomy.cdm.database.IMassIndexer#reindex()
239 */
240 @Override
241 public void reindex(IProgressMonitor monitor){
242
243 if(monitor == null){
244 monitor = new NullProgressMonitor();
245 }
246
247 monitor.setTaskName("CdmMassIndexer");
248 int steps = indexedClasses().length + 1; // +1 for optimize
249 monitor.beginTask("Reindexing " + indexedClasses().length + " classes", steps);
250
251 for(Class<? extends CdmBase> type : indexedClasses()){
252 reindex(type, monitor);
253 // clear the session after each class to free memory
254 getSession().clear();
255 }
256
257 optimize();
258 monitor.worked(1);
259
260 monitor.done();
261 }
262
263 @Override
264 public void createDictionary(IProgressMonitor monitor) {
265 if(monitor == null){
266 monitor = new NullProgressMonitor();
267 }
268
269 monitor.setTaskName("CdmMassIndexer_Dictionary");
270 int steps = dictionaryClasses().length; // +1 for optimize
271 monitor.beginTask("Creating Dictionary " + dictionaryClasses().length + " classes", steps);
272
273 for(Class type : dictionaryClasses()){
274 createDictionary(type, monitor);
275 }
276
277 monitor.done();
278
279 }
280 protected void optimize() {
281
282 FullTextSession fullTextSession = Search.getFullTextSession(getSession());
283 fullTextSession.getSearchFactory().optimize();
284 fullTextSession.flushToIndexes();
285 fullTextSession.clear();
286 }
287
288 /**
289 * @return
290 */
291 private int totalBatchCount() {
292 int totalNumOfBatches = 0;
293 for(Class type : indexedClasses()){
294 totalNumOfBatches += calculateNumOfBatches(countEntities(type));
295 }
296 return totalNumOfBatches;
297 }
298
299 /* (non-Javadoc)
300 * @see eu.etaxonomy.cdm.database.IMassIndexer#purge()
301 */
302 @Override
303 public void purge(IProgressMonitor monitor){
304
305 if(monitor == null){
306 monitor = new NullProgressMonitor();
307 }
308
309 monitor.setTaskName("CdmMassIndexer");
310 int steps = indexedClasses().length + 1; // +1 for optimize
311 monitor.beginTask("Purging " + indexedClasses().length + " classes", steps);
312
313 for(Class<? extends CdmBase> type : indexedClasses()){
314 purge(type, monitor);
315 monitor.worked(1);
316 }
317 // need to flush to the index before optimizing
318 // the purge method is not doing the flushing by itself
319 FullTextSession fullTextSession = Search.getFullTextSession(getSession());
320 fullTextSession.flushToIndexes();
321
322 // optimize
323 optimize();
324 monitor.worked(1);
325
326 // done
327 monitor.done();
328 }
329
330
331 /**
332 * Returns a list of declared indexable fields within a class through reflection.
333 *
334 * @param clazz
335 * @return
336 */
337 private List<String> getIndexedDeclaredFields(Class clazz) {
338 List<String> idFields = new ArrayList<String>();
339 if(clazz.isAnnotationPresent(org.hibernate.search.annotations.Indexed.class)) {
340 Field[] declaredFields = clazz.getDeclaredFields();
341 for(int i=0;i<declaredFields.length;i++ ) {
342 logger.info("checking field " + declaredFields[i].getName());
343 if(declaredFields[i].isAnnotationPresent(org.hibernate.search.annotations.Field.class) ||
344 declaredFields[i].isAnnotationPresent(org.hibernate.search.annotations.Fields.class)) {
345 idFields.add(declaredFields[i].getName());
346 logger.info("adding field " + declaredFields[i].getName());
347 }
348 }
349 }
350 return idFields;
351 }
352 /**
353 * @return
354 */
355 @SuppressWarnings("unchecked")
356 @Override
357 public Class<? extends CdmBase>[] indexedClasses() {
358 return new Class[] {
359 DescriptionElementBase.class,
360 Classification.class,
361 TaxonBase.class,
362 TaxonNameBase.class,
363 SpecimenOrObservationBase.class
364 };
365 }
366
367 /**
368 * @return
369 */
370 @Override
371 public Class[] dictionaryClasses() {
372 return new Class[] {
373 NonViralName.class
374 };
375 }
376
377
378
379
380 }